net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #include <linux/module.h>
  66 #include <asm/uaccess.h>
  67 #include <asm/system.h>
  68 #include <linux/bitops.h>
  69 #include <linux/types.h>
  70 #include <linux/kernel.h>
  71 #include <linux/mm.h>
  72 #include <linux/bootmem.h>
  73 #include <linux/string.h>
  74 #include <linux/socket.h>
  75 #include <linux/sockios.h>
  76 #include <linux/errno.h>
  77 #include <linux/in.h>
  78 #include <linux/inet.h>
  79 #include <linux/netdevice.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/init.h>
  82 #include <linux/workqueue.h>
  83 #include <linux/skbuff.h>
  84 #include <linux/inetdevice.h>
  85 #include <linux/igmp.h>
  86 #include <linux/pkt_sched.h>
  87 #include <linux/mroute.h>
  88 #include <linux/netfilter_ipv4.h>
  89 #include <linux/random.h>
  90 #include <linux/jhash.h>
  91 #include <linux/rcupdate.h>
  92 #include <linux/times.h>
  93 #include <linux/slab.h>
  94 #include <linux/prefetch.h>
  95 #include <net/dst.h>
  96 #include <net/net_namespace.h>
  97 #include <net/protocol.h>
  98 #include <net/ip.h>
  99 #include <net/route.h>
 100 #include <net/inetpeer.h>
 101 #include <net/sock.h>
 102 #include <net/ip_fib.h>
 103 #include <net/arp.h>
 104 #include <net/tcp.h>
 105 #include <net/icmp.h>
 106 #include <net/xfrm.h>
 107 #include <net/netevent.h>
 108 #include <net/rtnetlink.h>
 109 #ifdef CONFIG_SYSCTL
 110 #include <linux/sysctl.h>
 111 #endif
 112 #include <net/secure_seq.h>
 113
 114 #define RT_FL_TOS(oldflp4) \
 115         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 116
 117 #define IP_MAX_MTU      0xFFF0
 118
 119 #define RT_GC_TIMEOUT (300*HZ)
 120
 121 static int ip_rt_max_size;
 122 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 123 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
 124 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
 125 static int ip_rt_redirect_number __read_mostly  = 9;
 126 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 127 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 128 static int ip_rt_error_cost __read_mostly       = HZ;
 129 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 130 static int ip_rt_gc_elasticity __read_mostly    = 8;
 131 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 132 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 133 static int ip_rt_min_advmss __read_mostly       = 256;
 134 static int rt_chain_length_max __read_mostly    = 20;
 135 static int redirect_genid;
 136
 137 static struct delayed_work expires_work;
 138 static unsigned long expires_ljiffies;
 139
 140 /*
 141  *      Interface to generic destination cache.
 142  */
 143
 144 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 145 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 146 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 147 static void              ipv4_dst_destroy(struct dst_entry *dst);
 148 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 149 static void              ipv4_link_failure(struct sk_buff *skb);
 150 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 151 static int rt_garbage_collect(struct dst_ops *ops);
 152
 153 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 154                             int how)
 155 {
 156 }
 157
 158 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 159 {
 160         struct rtable *rt = (struct rtable *) dst;
 161         struct inet_peer *peer;
 162         u32 *p = NULL;
 163
 164         if (!rt->peer)
 165                 rt_bind_peer(rt, rt->rt_dst, 1);
 166
 167         peer = rt->peer;
 168         if (peer) {
 169                 u32 *old_p = __DST_METRICS_PTR(old);
 170                 unsigned long prev, new;
 171
 172                 p = peer->metrics;
 173                 if (inet_metrics_new(peer))
 174                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
 175
 176                 new = (unsigned long) p;
 177                 prev = cmpxchg(&dst->_metrics, old, new);
 178
 179                 if (prev != old) {
 180                         p = __DST_METRICS_PTR(prev);
 181                         if (prev & DST_METRICS_READ_ONLY)
 182                                 p = NULL;
 183                 } else {
 184                         if (rt->fi) {
 185                                 fib_info_put(rt->fi);
 186                                 rt->fi = NULL;
 187                         }
 188                 }
 189         }
 190         return p;
 191 }
 192
 193 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
 194
 195 static struct dst_ops ipv4_dst_ops = {
 196         .family =               AF_INET,
 197         .protocol =             cpu_to_be16(ETH_P_IP),
 198         .gc =                   rt_garbage_collect,
 199         .check =                ipv4_dst_check,
 200         .default_advmss =       ipv4_default_advmss,
 201         .mtu =                  ipv4_mtu,
 202         .cow_metrics =          ipv4_cow_metrics,
 203         .destroy =              ipv4_dst_destroy,
 204         .ifdown =               ipv4_dst_ifdown,
 205         .negative_advice =      ipv4_negative_advice,
 206         .link_failure =         ipv4_link_failure,
 207         .update_pmtu =          ip_rt_update_pmtu,
 208         .local_out =            __ip_local_out,
 209         .neigh_lookup =         ipv4_neigh_lookup,
 210 };
 211
 212 #define ECN_OR_COST(class)      TC_PRIO_##class
 213
 214 const __u8 ip_tos2prio[16] = {
 215         TC_PRIO_BESTEFFORT,
 216         ECN_OR_COST(BESTEFFORT),
 217         TC_PRIO_BESTEFFORT,
 218         ECN_OR_COST(BESTEFFORT),
 219         TC_PRIO_BULK,
 220         ECN_OR_COST(BULK),
 221         TC_PRIO_BULK,
 222         ECN_OR_COST(BULK),
 223         TC_PRIO_INTERACTIVE,
 224         ECN_OR_COST(INTERACTIVE),
 225         TC_PRIO_INTERACTIVE,
 226         ECN_OR_COST(INTERACTIVE),
 227         TC_PRIO_INTERACTIVE_BULK,
 228         ECN_OR_COST(INTERACTIVE_BULK),
 229         TC_PRIO_INTERACTIVE_BULK,
 230         ECN_OR_COST(INTERACTIVE_BULK)
 231 };
 232
 233
 234 /*
 235  * Route cache.
 236  */
 237
 238 /* The locking scheme is rather straight forward:
 239  *
 240  * 1) Read-Copy Update protects the buckets of the central route hash.
 241  * 2) Only writers remove entries, and they hold the lock
 242  *    as they look at rtable reference counts.
 243  * 3) Only readers acquire references to rtable entries,
 244  *    they do so with atomic increments and with the
 245  *    lock held.
 246  */
 247
 248 struct rt_hash_bucket {
 249         struct rtable __rcu     *chain;
 250 };
 251
 252 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 253         defined(CONFIG_PROVE_LOCKING)
 254 /*
 255  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 256  * The size of this table is a power of two and depends on the number of CPUS.
 257  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 258  */
 259 #ifdef CONFIG_LOCKDEP
 260 # define RT_HASH_LOCK_SZ        256
 261 #else
 262 # if NR_CPUS >= 32
 263 #  define RT_HASH_LOCK_SZ       4096
 264 # elif NR_CPUS >= 16
 265 #  define RT_HASH_LOCK_SZ       2048
 266 # elif NR_CPUS >= 8
 267 #  define RT_HASH_LOCK_SZ       1024
 268 # elif NR_CPUS >= 4
 269 #  define RT_HASH_LOCK_SZ       512
 270 # else
 271 #  define RT_HASH_LOCK_SZ       256
 272 # endif
 273 #endif
 274
 275 static spinlock_t       *rt_hash_locks;
 276 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 277
 278 static __init void rt_hash_lock_init(void)
 279 {
 280         int i;
 281
 282         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 283                         GFP_KERNEL);
 284         if (!rt_hash_locks)
 285                 panic("IP: failed to allocate rt_hash_locks\n");
 286
 287         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 288                 spin_lock_init(&rt_hash_locks[i]);
 289 }
 290 #else
 291 # define rt_hash_lock_addr(slot) NULL
 292
 293 static inline void rt_hash_lock_init(void)
 294 {
 295 }
 296 #endif
 297
 298 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
 299 static unsigned                 rt_hash_mask __read_mostly;
 300 static unsigned int             rt_hash_log  __read_mostly;
 301
 302 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 303 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 304
 305 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 306                                    int genid)
 307 {
 308         return jhash_3words((__force u32)daddr, (__force u32)saddr,
 309                             idx, genid)
 310                 & rt_hash_mask;
 311 }
 312
 313 static inline int rt_genid(struct net *net)
 314 {
 315         return atomic_read(&net->ipv4.rt_genid);
 316 }
 317
 318 #ifdef CONFIG_PROC_FS
 319 struct rt_cache_iter_state {
 320         struct seq_net_private p;
 321         int bucket;
 322         int genid;
 323 };
 324
 325 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 326 {
 327         struct rt_cache_iter_state *st = seq->private;
 328         struct rtable *r = NULL;
 329
 330         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 331                 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
 332                         continue;
 333                 rcu_read_lock_bh();
 334                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 335                 while (r) {
 336                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
 337                             r->rt_genid == st->genid)
 338                                 return r;
 339                         r = rcu_dereference_bh(r->dst.rt_next);
 340                 }
 341                 rcu_read_unlock_bh();
 342         }
 343         return r;
 344 }
 345
 346 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 347                                           struct rtable *r)
 348 {
 349         struct rt_cache_iter_state *st = seq->private;
 350
 351         r = rcu_dereference_bh(r->dst.rt_next);
 352         while (!r) {
 353                 rcu_read_unlock_bh();
 354                 do {
 355                         if (--st->bucket < 0)
 356                                 return NULL;
 357                 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
 358                 rcu_read_lock_bh();
 359                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 360         }
 361         return r;
 362 }
 363
 364 static struct rtable *rt_cache_get_next(struct seq_file *seq,
 365                                         struct rtable *r)
 366 {
 367         struct rt_cache_iter_state *st = seq->private;
 368         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 369                 if (dev_net(r->dst.dev) != seq_file_net(seq))
 370                         continue;
 371                 if (r->rt_genid == st->genid)
 372                         break;
 373         }
 374         return r;
 375 }
 376
 377 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 378 {
 379         struct rtable *r = rt_cache_get_first(seq);
 380
 381         if (r)
 382                 while (pos && (r = rt_cache_get_next(seq, r)))
 383                         --pos;
 384         return pos ? NULL : r;
 385 }
 386
 387 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 388 {
 389         struct rt_cache_iter_state *st = seq->private;
 390         if (*pos)
 391                 return rt_cache_get_idx(seq, *pos - 1);
 392         st->genid = rt_genid(seq_file_net(seq));
 393         return SEQ_START_TOKEN;
 394 }
 395
 396 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 397 {
 398         struct rtable *r;
 399
 400         if (v == SEQ_START_TOKEN)
 401                 r = rt_cache_get_first(seq);
 402         else
 403                 r = rt_cache_get_next(seq, v);
 404         ++*pos;
 405         return r;
 406 }
 407
 408 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 409 {
 410         if (v && v != SEQ_START_TOKEN)
 411                 rcu_read_unlock_bh();
 412 }
 413
 414 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 415 {
 416         if (v == SEQ_START_TOKEN)
 417                 seq_printf(seq, "%-127s\n",
 418                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 419                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 420                            "HHUptod\tSpecDst");
 421         else {
 422                 struct rtable *r = v;
 423                 struct neighbour *n;
 424                 int len, HHUptod;
 425
 426                 rcu_read_lock();
 427                 n = dst_get_neighbour_noref(&r->dst);
 428                 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
 429                 rcu_read_unlock();
 430
 431                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
 432                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 433                         r->dst.dev ? r->dst.dev->name : "*",
 434                         (__force u32)r->rt_dst,
 435                         (__force u32)r->rt_gateway,
 436                         r->rt_flags, atomic_read(&r->dst.__refcnt),
 437                         r->dst.__use, 0, (__force u32)r->rt_src,
 438                         dst_metric_advmss(&r->dst) + 40,
 439                         dst_metric(&r->dst, RTAX_WINDOW),
 440                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
 441                               dst_metric(&r->dst, RTAX_RTTVAR)),
 442                         r->rt_key_tos,
 443                         -1,
 444                         HHUptod,
 445                         r->rt_spec_dst, &len);
 446
 447                 seq_printf(seq, "%*s\n", 127 - len, "");
 448         }
 449         return 0;
 450 }
 451
 452 static const struct seq_operations rt_cache_seq_ops = {
 453         .start  = rt_cache_seq_start,
 454         .next   = rt_cache_seq_next,
 455         .stop   = rt_cache_seq_stop,
 456         .show   = rt_cache_seq_show,
 457 };
 458
 459 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 460 {
 461         return seq_open_net(inode, file, &rt_cache_seq_ops,
 462                         sizeof(struct rt_cache_iter_state));
 463 }
 464
 465 static const struct file_operations rt_cache_seq_fops = {
 466         .owner   = THIS_MODULE,
 467         .open    = rt_cache_seq_open,
 468         .read    = seq_read,
 469         .llseek  = seq_lseek,
 470         .release = seq_release_net,
 471 };
 472
 473
 474 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 475 {
 476         int cpu;
 477
 478         if (*pos == 0)
 479                 return SEQ_START_TOKEN;
 480
 481         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 482                 if (!cpu_possible(cpu))
 483                         continue;
 484                 *pos = cpu+1;
 485                 return &per_cpu(rt_cache_stat, cpu);
 486         }
 487         return NULL;
 488 }
 489
 490 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 491 {
 492         int cpu;
 493
 494         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 495                 if (!cpu_possible(cpu))
 496                         continue;
 497                 *pos = cpu+1;
 498                 return &per_cpu(rt_cache_stat, cpu);
 499         }
 500         return NULL;
 501
 502 }
 503
 504 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 505 {
 506
 507 }
 508
 509 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 510 {
 511         struct rt_cache_stat *st = v;
 512
 513         if (v == SEQ_START_TOKEN) {
 514                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 515                 return 0;
 516         }
 517
 518         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 519                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 520                    dst_entries_get_slow(&ipv4_dst_ops),
 521                    st->in_hit,
 522                    st->in_slow_tot,
 523                    st->in_slow_mc,
 524                    st->in_no_route,
 525                    st->in_brd,
 526                    st->in_martian_dst,
 527                    st->in_martian_src,
 528
 529                    st->out_hit,
 530                    st->out_slow_tot,
 531                    st->out_slow_mc,
 532
 533                    st->gc_total,
 534                    st->gc_ignored,
 535                    st->gc_goal_miss,
 536                    st->gc_dst_overflow,
 537                    st->in_hlist_search,
 538                    st->out_hlist_search
 539                 );
 540         return 0;
 541 }
 542
 543 static const struct seq_operations rt_cpu_seq_ops = {
 544         .start  = rt_cpu_seq_start,
 545         .next   = rt_cpu_seq_next,
 546         .stop   = rt_cpu_seq_stop,
 547         .show   = rt_cpu_seq_show,
 548 };
 549
 550
 551 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 552 {
 553         return seq_open(file, &rt_cpu_seq_ops);
 554 }
 555
 556 static const struct file_operations rt_cpu_seq_fops = {
 557         .owner   = THIS_MODULE,
 558         .open    = rt_cpu_seq_open,
 559         .read    = seq_read,
 560         .llseek  = seq_lseek,
 561         .release = seq_release,
 562 };
 563
 564 #ifdef CONFIG_IP_ROUTE_CLASSID
 565 static int rt_acct_proc_show(struct seq_file *m, void *v)
 566 {
 567         struct ip_rt_acct *dst, *src;
 568         unsigned int i, j;
 569
 570         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 571         if (!dst)
 572                 return -ENOMEM;
 573
 574         for_each_possible_cpu(i) {
 575                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 576                 for (j = 0; j < 256; j++) {
 577                         dst[j].o_bytes   += src[j].o_bytes;
 578                         dst[j].o_packets += src[j].o_packets;
 579                         dst[j].i_bytes   += src[j].i_bytes;
 580                         dst[j].i_packets += src[j].i_packets;
 581                 }
 582         }
 583
 584         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 585         kfree(dst);
 586         return 0;
 587 }
 588
 589 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 590 {
 591         return single_open(file, rt_acct_proc_show, NULL);
 592 }
 593
 594 static const struct file_operations rt_acct_proc_fops = {
 595         .owner          = THIS_MODULE,
 596         .open           = rt_acct_proc_open,
 597         .read           = seq_read,
 598         .llseek         = seq_lseek,
 599         .release        = single_release,
 600 };
 601 #endif
 602
 603 static int __net_init ip_rt_do_proc_init(struct net *net)
 604 {
 605         struct proc_dir_entry *pde;
 606
 607         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 608                         &rt_cache_seq_fops);
 609         if (!pde)
 610                 goto err1;
 611
 612         pde = proc_create("rt_cache", S_IRUGO,
 613                           net->proc_net_stat, &rt_cpu_seq_fops);
 614         if (!pde)
 615                 goto err2;
 616
 617 #ifdef CONFIG_IP_ROUTE_CLASSID
 618         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 619         if (!pde)
 620                 goto err3;
 621 #endif
 622         return 0;
 623
 624 #ifdef CONFIG_IP_ROUTE_CLASSID
 625 err3:
 626         remove_proc_entry("rt_cache", net->proc_net_stat);
 627 #endif
 628 err2:
 629         remove_proc_entry("rt_cache", net->proc_net);
 630 err1:
 631         return -ENOMEM;
 632 }
 633
 634 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 635 {
 636         remove_proc_entry("rt_cache", net->proc_net_stat);
 637         remove_proc_entry("rt_cache", net->proc_net);
 638 #ifdef CONFIG_IP_ROUTE_CLASSID
 639         remove_proc_entry("rt_acct", net->proc_net);
 640 #endif
 641 }
 642
 643 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 644         .init = ip_rt_do_proc_init,
 645         .exit = ip_rt_do_proc_exit,
 646 };
 647
 648 static int __init ip_rt_proc_init(void)
 649 {
 650         return register_pernet_subsys(&ip_rt_proc_ops);
 651 }
 652
 653 #else
 654 static inline int ip_rt_proc_init(void)
 655 {
 656         return 0;
 657 }
 658 #endif /* CONFIG_PROC_FS */
 659
 660 static inline void rt_free(struct rtable *rt)
 661 {
 662         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 663 }
 664
 665 static inline void rt_drop(struct rtable *rt)
 666 {
 667         ip_rt_put(rt);
 668         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 669 }
 670
 671 static inline int rt_fast_clean(struct rtable *rth)
 672 {
 673         /* Kill broadcast/multicast entries very aggresively, if they
 674            collide in hash table with more useful entries */
 675         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 676                 rt_is_input_route(rth) && rth->dst.rt_next;
 677 }
 678
 679 static inline int rt_valuable(struct rtable *rth)
 680 {
 681         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 682                 (rth->peer && rth->peer->pmtu_expires);
 683 }
 684
 685 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 686 {
 687         unsigned long age;
 688         int ret = 0;
 689
 690         if (atomic_read(&rth->dst.__refcnt))
 691                 goto out;
 692
 693         age = jiffies - rth->dst.lastuse;
 694         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 695             (age <= tmo2 && rt_valuable(rth)))
 696                 goto out;
 697         ret = 1;
 698 out:    return ret;
 699 }
 700
 701 /* Bits of score are:
 702  * 31: very valuable
 703  * 30: not quite useless
 704  * 29..0: usage counter
 705  */
 706 static inline u32 rt_score(struct rtable *rt)
 707 {
 708         u32 score = jiffies - rt->dst.lastuse;
 709
 710         score = ~score & ~(3<<30);
 711
 712         if (rt_valuable(rt))
 713                 score |= (1<<31);
 714
 715         if (rt_is_output_route(rt) ||
 716             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 717                 score |= (1<<30);
 718
 719         return score;
 720 }
 721
 722 static inline bool rt_caching(const struct net *net)
 723 {
 724         return net->ipv4.current_rt_cache_rebuild_count <=
 725                 net->ipv4.sysctl_rt_cache_rebuild_count;
 726 }
 727
 728 static inline bool compare_hash_inputs(const struct rtable *rt1,
 729                                        const struct rtable *rt2)
 730 {
 731         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 732                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 733                 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
 734 }
 735
 736 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
 737 {
 738         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 739                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 740                 (rt1->rt_mark ^ rt2->rt_mark) |
 741                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
 742                 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
 743                 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
 744 }
 745
 746 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 747 {
 748         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
 749 }
 750
 751 static inline int rt_is_expired(struct rtable *rth)
 752 {
 753         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 754 }
 755
 756 /*
 757  * Perform a full scan of hash table and free all entries.
 758  * Can be called by a softirq or a process.
 759  * In the later case, we want to be reschedule if necessary
 760  */
 761 static void rt_do_flush(struct net *net, int process_context)
 762 {
 763         unsigned int i;
 764         struct rtable *rth, *next;
 765
 766         for (i = 0; i <= rt_hash_mask; i++) {
 767                 struct rtable __rcu **pprev;
 768                 struct rtable *list;
 769
 770                 if (process_context && need_resched())
 771                         cond_resched();
 772                 rth = rcu_access_pointer(rt_hash_table[i].chain);
 773                 if (!rth)
 774                         continue;
 775
 776                 spin_lock_bh(rt_hash_lock_addr(i));
 777
 778                 list = NULL;
 779                 pprev = &rt_hash_table[i].chain;
 780                 rth = rcu_dereference_protected(*pprev,
 781                         lockdep_is_held(rt_hash_lock_addr(i)));
 782
 783                 while (rth) {
 784                         next = rcu_dereference_protected(rth->dst.rt_next,
 785                                 lockdep_is_held(rt_hash_lock_addr(i)));
 786
 787                         if (!net ||
 788                             net_eq(dev_net(rth->dst.dev), net)) {
 789                                 rcu_assign_pointer(*pprev, next);
 790                                 rcu_assign_pointer(rth->dst.rt_next, list);
 791                                 list = rth;
 792                         } else {
 793                                 pprev = &rth->dst.rt_next;
 794                         }
 795                         rth = next;
 796                 }
 797
 798                 spin_unlock_bh(rt_hash_lock_addr(i));
 799
 800                 for (; list; list = next) {
 801                         next = rcu_dereference_protected(list->dst.rt_next, 1);
 802                         rt_free(list);
 803                 }
 804         }
 805 }
 806
 807 /*
 808  * While freeing expired entries, we compute average chain length
 809  * and standard deviation, using fixed-point arithmetic.
 810  * This to have an estimation of rt_chain_length_max
 811  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 812  * We use 3 bits for frational part, and 29 (or 61) for magnitude.
 813  */
 814
 815 #define FRACT_BITS 3
 816 #define ONE (1UL << FRACT_BITS)
 817
 818 /*
 819  * Given a hash chain and an item in this hash chain,
 820  * find if a previous entry has the same hash_inputs
 821  * (but differs on tos, mark or oif)
 822  * Returns 0 if an alias is found.
 823  * Returns ONE if rth has no alias before itself.
 824  */
 825 static int has_noalias(const struct rtable *head, const struct rtable *rth)
 826 {
 827         const struct rtable *aux = head;
 828
 829         while (aux != rth) {
 830                 if (compare_hash_inputs(aux, rth))
 831                         return 0;
 832                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
 833         }
 834         return ONE;
 835 }
 836
 837 static void rt_check_expire(void)
 838 {
 839         static unsigned int rover;
 840         unsigned int i = rover, goal;
 841         struct rtable *rth;
 842         struct rtable __rcu **rthp;
 843         unsigned long samples = 0;
 844         unsigned long sum = 0, sum2 = 0;
 845         unsigned long delta;
 846         u64 mult;
 847
 848         delta = jiffies - expires_ljiffies;
 849         expires_ljiffies = jiffies;
 850         mult = ((u64)delta) << rt_hash_log;
 851         if (ip_rt_gc_timeout > 1)
 852                 do_div(mult, ip_rt_gc_timeout);
 853         goal = (unsigned int)mult;
 854         if (goal > rt_hash_mask)
 855                 goal = rt_hash_mask + 1;
 856         for (; goal > 0; goal--) {
 857                 unsigned long tmo = ip_rt_gc_timeout;
 858                 unsigned long length;
 859
 860                 i = (i + 1) & rt_hash_mask;
 861                 rthp = &rt_hash_table[i].chain;
 862
 863                 if (need_resched())
 864                         cond_resched();
 865
 866                 samples++;
 867
 868                 if (rcu_dereference_raw(*rthp) == NULL)
 869                         continue;
 870                 length = 0;
 871                 spin_lock_bh(rt_hash_lock_addr(i));
 872                 while ((rth = rcu_dereference_protected(*rthp,
 873                                         lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
 874                         prefetch(rth->dst.rt_next);
 875                         if (rt_is_expired(rth)) {
 876                                 *rthp = rth->dst.rt_next;
 877                                 rt_free(rth);
 878                                 continue;
 879                         }
 880                         if (rth->dst.expires) {
 881                                 /* Entry is expired even if it is in use */
 882                                 if (time_before_eq(jiffies, rth->dst.expires)) {
 883 nofree:
 884                                         tmo >>= 1;
 885                                         rthp = &rth->dst.rt_next;
 886                                         /*
 887                                          * We only count entries on
 888                                          * a chain with equal hash inputs once
 889                                          * so that entries for different QOS
 890                                          * levels, and other non-hash input
 891                                          * attributes don't unfairly skew
 892                                          * the length computation
 893                                          */
 894                                         length += has_noalias(rt_hash_table[i].chain, rth);
 895                                         continue;
 896                                 }
 897                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
 898                                 goto nofree;
 899
 900                         /* Cleanup aged off entries. */
 901                         *rthp = rth->dst.rt_next;
 902                         rt_free(rth);
 903                 }
 904                 spin_unlock_bh(rt_hash_lock_addr(i));
 905                 sum += length;
 906                 sum2 += length*length;
 907         }
 908         if (samples) {
 909                 unsigned long avg = sum / samples;
 910                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
 911                 rt_chain_length_max = max_t(unsigned long,
 912                                         ip_rt_gc_elasticity,
 913                                         (avg + 4*sd) >> FRACT_BITS);
 914         }
 915         rover = i;
 916 }
 917
 918 /*
 919  * rt_worker_func() is run in process context.
 920  * It calls rt_check_expire() to scan part of the hash table and then
      * re-queues expires_work to run again after ip_rt_gc_interval.
 921  */
 922 static void rt_worker_func(struct work_struct *work)
 923 {
 924         rt_check_expire();
 925         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 926 }
 927
 928 /*
 929  * Perturbation of rt_genid by a small quantity [1..256]
 930  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 931  * many times (2^24) without giving a recent rt_genid.
 932  * Jenkins hash is strong enough that little changes of rt_genid are OK.
      *
      * Besides bumping net->ipv4.rt_genid, this also increments
      * redirect_genid and invalidates the AF_INET inetpeer tree.
 933  */
 934 static void rt_cache_invalidate(struct net *net)
 935 {
 936         unsigned char shuffle;
 937
 938         get_random_bytes(&shuffle, sizeof(shuffle));
             /* shuffle is one random byte (0..255), so the step is 1..256 */
 939         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 940         redirect_genid++;
 941         inetpeer_invalidate_tree(AF_INET);
 942 }
 943
 944 /*
 945  * delay < 0  : invalidate cache (fast : entries will be deleted later)
 946  * delay >= 0 : invalidate & flush cache (can be long)
 947  */
 948 void rt_cache_flush(struct net *net, int delay)
 949 {
 950         rt_cache_invalidate(net);
 951         if (delay >= 0)
 952                 rt_do_flush(net, !in_softirq());
 953 }
 954
 955 /*
      * Flush previously invalidated entries from the cache.
      * Unlike rt_cache_flush(), this does not invalidate anything itself;
      * it only calls rt_do_flush(), passing !in_softirq() as second argument.
      */
 956 void rt_cache_flush_batch(struct net *net)
 957 {
 958         rt_do_flush(net, !in_softirq());
 959 }
 960
 /*
  * Emit a rate-limited "Route hash chain too long!" warning and
  * invalidate the route cache of net via rt_cache_invalidate().
  */
 961 static void rt_emergency_hash_rebuild(struct net *net)
 962 {
 963         if (net_ratelimit())
 964                 printk(KERN_WARNING "Route hash chain too long!\n");
 965         rt_cache_invalidate(net);
 966 }
 967
 968 /*
 969    Short description of GC goals.
 970
 971    We want to build an algorithm which keeps the routing cache
 972    at some equilibrium point, where the number of aged-off entries
 973    is kept approximately equal to the number of newly generated ones.
 974
 975    The current expiration strength is the variable "expire".
 976    We try to adjust it dynamically, so that if networking
 977    is idle, expire is large enough to keep enough warm entries,
 978    and when load increases, it is reduced to limit the cache size.
 979  */
 980
 /*
  * Route cache garbage collector.
  *
  * Derives the number of entries to expire ("goal") from the current
  * entry count, ip_rt_gc_elasticity << rt_hash_log and the running
  * "equilibrium" value, then sweeps the hash table starting just after
  * the static rover, freeing entries that are stale or that
  * rt_may_expire() accepts.  When a full sweep misses the goal, "expire"
  * is halved and the sweep is retried under the conditions listed in the
  * comment inside the loop.
  *
  * The ops argument is not referenced; ipv4_dst_ops is used directly.
  *
  * Returns 1 (after a rate-limited "dst cache overflow" warning) when the
  * entry count is still not below ip_rt_max_size, and 0 otherwise.
  */
 981 static int rt_garbage_collect(struct dst_ops *ops)
 982 {
 983         static unsigned long expire = RT_GC_TIMEOUT;
 984         static unsigned long last_gc;
 985         static int rover;
 986         static int equilibrium;
 987         struct rtable *rth;
 988         struct rtable __rcu **rthp;
 989         unsigned long now = jiffies;
 990         int goal;
 991         int entries = dst_entries_get_fast(&ipv4_dst_ops);
 992
 993         /*
 994          * Garbage collection is pretty expensive,
 995          * do not make it too frequently.
 996          */
 997
 998         RT_CACHE_STAT_INC(gc_total);
 999
1000         if (now - last_gc < ip_rt_gc_min_interval &&
1001             entries < ip_rt_max_size) {
1002                 RT_CACHE_STAT_INC(gc_ignored);
1003                 goto out;
1004         }
1005
1006         entries = dst_entries_get_slow(&ipv4_dst_ops);
1007         /* Calculate number of entries, which we want to expire now. */
1008         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1009         if (goal <= 0) {
                     /* Not over the elasticity limit: steer towards equilibrium. */
1010                 if (equilibrium < ipv4_dst_ops.gc_thresh)
1011                         equilibrium = ipv4_dst_ops.gc_thresh;
1012                 goal = entries - equilibrium;
1013                 if (goal > 0) {
1014                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1015                         goal = entries - equilibrium;
1016                 }
1017         } else {
1018                 /* We are in dangerous area. Try to reduce cache really
1019                  * aggressively.
1020                  */
1021                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1022                 equilibrium = entries - goal;
1023         }
1024
1025         if (now - last_gc >= ip_rt_gc_min_interval)
1026                 last_gc = now;
1027
             /* Nothing to expire this time; fold the (non-positive) goal into equilibrium. */
1028         if (goal <= 0) {
1029                 equilibrium += goal;
1030                 goto work_done;
1031         }
1032
1033         do {
1034                 int i, k;
1035
                     /* One pass over every bucket, starting after rover. */
1036                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1037                         unsigned long tmo = expire;
1038
1039                         k = (k + 1) & rt_hash_mask;
1040                         rthp = &rt_hash_table[k].chain;
1041                         spin_lock_bh(rt_hash_lock_addr(k));
1042                         while ((rth = rcu_dereference_protected(*rthp,
1043                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1044                                 if (!rt_is_expired(rth) &&
1045                                         !rt_may_expire(rth, tmo, expire)) {
                                             /* Kept: halve tmo for the entries further down the chain. */
1046                                         tmo >>= 1;
1047                                         rthp = &rth->dst.rt_next;
1048                                         continue;
1049                                 }
1050                                 *rthp = rth->dst.rt_next;
1051                                 rt_free(rth);
1052                                 goal--;
1053                         }
1054                         spin_unlock_bh(rt_hash_lock_addr(k));
1055                         if (goal <= 0)
1056                                 break;
1057                 }
1058                 rover = k;
1059
1060                 if (goal <= 0)
1061                         goto work_done;
1062
1063                 /* Goal is not achieved. We stop process if:
1064
1065                    - if expire reduced to zero. Otherwise, expire is halfed.
1066                    - if table is not full.
1067                    - if we are called from interrupt.
1068                    - jiffies check is just fallback/debug loop breaker.
1069                      We will not spin here for long time in any case.
1070                  */
1071
1072                 RT_CACHE_STAT_INC(gc_goal_miss);
1073
1074                 if (expire == 0)
1075                         break;
1076
1077                 expire >>= 1;
1078
1079                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1080                         goto out;
1081         } while (!in_softirq() && time_before_eq(jiffies, now));
1082
             /* Goal missed: only report overflow if both counters agree the cache is full. */
1083         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1084                 goto out;
1085         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1086                 goto out;
1087         if (net_ratelimit())
1088                 printk(KERN_WARNING "dst cache overflow\n");
1089         RT_CACHE_STAT_INC(gc_dst_overflow);
1090         return 1;
1091
1092 work_done:
             /*
              * Goal met: let expire grow again, resetting it to
              * ip_rt_gc_timeout when it overshoots or when the entry
              * count is below gc_thresh.
              */
1093         expire += ip_rt_gc_min_interval;
1094         if (expire > ip_rt_gc_timeout ||
1095             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1096             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1097                 expire = ip_rt_gc_timeout;
1098 out:    return 0;
1099 }
1100
1101 /*
1102  * Returns number of entries in a hash chain that have different hash_inputs
1103  */
1104 static int slow_chain_length(const struct rtable *head)
1105 {
1106         int length = 0;
1107         const struct rtable *rth = head;
1108
1109         while (rth) {
1110                 length += has_noalias(head, rth);
1111                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1112         }
1113         return length >> FRACT_BITS;
1114 }
1115
1116 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1117 {
1118         static const __be32 inaddr_any = 0;
1119         struct net_device *dev = dst->dev;
1120         const __be32 *pkey = daddr;
1121         struct neighbour *n;
1122
1123         if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1124                 pkey = &inaddr_any;
1125
1126         n = __ipv4_neigh_lookup(&arp_tbl, dev, *(__force u32 *)pkey);
1127         if (n)
1128                 return n;
1129         return neigh_create(&arp_tbl, pkey, dev);
1130 }
1131
1132 static int rt_bind_neighbour(struct rtable *rt)
1133 {
1134         struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1135         if (IS_ERR(n))
1136                 return PTR_ERR(n);
1137         dst_set_neighbour(&rt->dst, n);
1138
1139         return 0;
1140 }
1141
/*
 * Insert rt into the route cache bucket selected by hash.
 *
 * - If rt_caching() is false for the device's namespace, rt is flagged
 *   DST_NOCACHE, bound to a neighbour where applicable, and returned
 *   without being hashed.
 * - Otherwise the chain is walked under the bucket lock.  Expired
 *   entries are freed.  An entry matching compare_keys() and
 *   compare_netns() is moved to the head of the chain, marked used and
 *   returned instead of rt, which is dropped.
 * - During the walk the unreferenced entry with the lowest rt_score()
 *   is remembered; it is evicted when the chain is longer than
 *   ip_rt_gc_elasticity.  With no such candidate and a chain longer
 *   than rt_chain_length_max, the cache is invalidated, the hash is
 *   recomputed with the new genid and the insertion restarts.
 * - Unicast and output routes get a neighbour bound.  On -ENOBUFS, and
 *   only when not in softirq, one garbage collection pass is forced and
 *   the insertion is retried once.
 *
 * Returns the route now in use (also attached to skb when skb is
 * non-NULL), or an ERR_PTR() when neighbour binding fails.
 */
1142 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1143                                      struct sk_buff *skb, int ifindex)
1144 {
1145         struct rtable   *rth, *cand;
1146         struct rtable __rcu **rthp, **candp;
1147         unsigned long   now;
1148         u32             min_score;
1149         int             chain_length;
             /* One retry is allowed, and only outside softirq context. */
1150         int attempts = !in_softirq();
1151
1152 restart:
1153         chain_length = 0;
1154         min_score = ~(u32)0;
1155         cand = NULL;
1156         candp = NULL;
1157         now = jiffies;
1158
1159         if (!rt_caching(dev_net(rt->dst.dev))) {
1160                 /*
1161                  * If we're not caching, just tell the caller we
1162                  * were successful and don't touch the route.  The
1163                  * caller hold the sole reference to the cache entry, and
1164                  * it will be released when the caller is done with it.
1165                  * If we drop it here, the callers have no way to resolve routes
1166                  * when we're not caching.  Instead, just point *rp at rt, so
1167                  * the caller gets a single use out of the route
1168                  * Note that we do rt_free on this new route entry, so that
1169                  * once its refcount hits zero, we are still able to reap it
1170                  * (Thanks Alexey)
1171                  * Note: To avoid expensive rcu stuff for this uncached dst,
1172                  * we set DST_NOCACHE so that dst_release() can free dst without
1173                  * waiting a grace period.
1174                  */
1175
1176                 rt->dst.flags |= DST_NOCACHE;
1177                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1178                         int err = rt_bind_neighbour(rt);
1179                         if (err) {
1180                                 if (net_ratelimit())
1181                                         printk(KERN_WARNING
1182                                             "Neighbour table failure & not caching routes.\n");
1183                                 ip_rt_put(rt);
1184                                 return ERR_PTR(err);
1185                         }
1186                 }
1187
1188                 goto skip_hashing;
1189         }
1190
1191         rthp = &rt_hash_table[hash].chain;
1192
1193         spin_lock_bh(rt_hash_lock_addr(hash));
1194         while ((rth = rcu_dereference_protected(*rthp,
1195                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1196                 if (rt_is_expired(rth)) {
1197                         *rthp = rth->dst.rt_next;
1198                         rt_free(rth);
1199                         continue;
1200                 }
1201                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1202                         /* Put it first */
1203                         *rthp = rth->dst.rt_next;
1204                         /*
1205                          * Since lookup is lockfree, the deletion
1206                          * must be visible to another weakly ordered CPU before
1207                          * the insertion at the start of the hash chain.
1208                          */
1209                         rcu_assign_pointer(rth->dst.rt_next,
1210                                            rt_hash_table[hash].chain);
1211                         /*
1212                          * Since lookup is lockfree, the update writes
1213                          * must be ordered for consistency on SMP.
1214                          */
1215                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1216
1217                         dst_use(&rth->dst, now);
1218                         spin_unlock_bh(rt_hash_lock_addr(hash));
1219
                             /* The cached entry wins; the new route is not needed. */
1220                         rt_drop(rt);
1221                         if (skb)
1222                                 skb_dst_set(skb, &rth->dst);
1223                         return rth;
1224                 }
1225
                     /* Track the unreferenced entry with the lowest score as eviction candidate. */
1226                 if (!atomic_read(&rth->dst.__refcnt)) {
1227                         u32 score = rt_score(rth);
1228
1229                         if (score <= min_score) {
1230                                 cand = rth;
1231                                 candp = rthp;
1232                                 min_score = score;
1233                         }
1234                 }
1235
1236                 chain_length++;
1237
1238                 rthp = &rth->dst.rt_next;
1239         }
1240
1241         if (cand) {
1242                 /* ip_rt_gc_elasticity used to be average length of chain
1243                  * length, when exceeded gc becomes really aggressive.
1244                  *
1245                  * The second limit is less certain. At the moment it allows
1246                  * only 2 entries per bucket. We will see.
1247                  */
1248                 if (chain_length > ip_rt_gc_elasticity) {
1249                         *candp = cand->dst.rt_next;
1250                         rt_free(cand);
1251                 }
1252         } else {
                     /*
                      * Nothing evictable.  The cheap count is checked first;
                      * slow_chain_length() only runs when that already
                      * exceeds the limit.
                      */
1253                 if (chain_length > rt_chain_length_max &&
1254                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1255                         struct net *net = dev_net(rt->dst.dev);
1256                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1257                         if (!rt_caching(net)) {
1258                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1259                                         rt->dst.dev->name, num);
1260                         }
1261                         rt_emergency_hash_rebuild(net);
1262                         spin_unlock_bh(rt_hash_lock_addr(hash));
1263
                             /* The genid changed, so the bucket has to be recomputed. */
1264                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1265                                         ifindex, rt_genid(net));
1266                         goto restart;
1267                 }
1268         }
1269
1270         /* Try to bind route to arp only if it is output
1271            route or unicast forwarding path.
1272          */
1273         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1274                 int err = rt_bind_neighbour(rt);
1275                 if (err) {
1276                         spin_unlock_bh(rt_hash_lock_addr(hash));
1277
1278                         if (err != -ENOBUFS) {
1279                                 rt_drop(rt);
1280                                 return ERR_PTR(err);
1281                         }
1282
1283                         /* Neighbour tables are full and nothing
1284                            can be released. Try to shrink route cache,
1285                            it is most likely it holds some neighbour records.
1286                          */
1287                         if (attempts-- > 0) {
                                     /* Temporarily force the most aggressive GC settings. */
1288                                 int saved_elasticity = ip_rt_gc_elasticity;
1289                                 int saved_int = ip_rt_gc_min_interval;
1290                                 ip_rt_gc_elasticity     = 1;
1291                                 ip_rt_gc_min_interval   = 0;
1292                                 rt_garbage_collect(&ipv4_dst_ops);
1293                                 ip_rt_gc_min_interval   = saved_int;
1294                                 ip_rt_gc_elasticity     = saved_elasticity;
1295                                 goto restart;
1296                         }
1297
1298                         if (net_ratelimit())
1299                                 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1300                         rt_drop(rt);
1301                         return ERR_PTR(-ENOBUFS);
1302                 }
1303         }
1304
1305         rt->dst.rt_next = rt_hash_table[hash].chain;
1306
1307         /*
1308          * Since lookup is lockfree, we must make sure
1309          * previous writes to rt are committed to memory
1310          * before making rt visible to other CPUS.
1311          */
1312         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1313
1314         spin_unlock_bh(rt_hash_lock_addr(hash));
1315
1316 skip_hashing:
1317         if (skb)
1318                 skb_dst_set(skb, &rt->dst);
1319         return rt;
1320 }
1321
/*
 * Generation counter for peer information.  It is incremented in
 * ip_rt_redirect() whenever a peer's learned redirect is updated.
 */
1322 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1323
     /* Return the current value of __rt_peer_genid. */
1324 static u32 rt_peer_genid(void)
1325 {
1326         return atomic_read(&__rt_peer_genid);
1327 }
1328
1329 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1330 {
1331         struct inet_peer *peer;
1332
1333         peer = inet_getpeer_v4(daddr, create);
1334
1335         if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1336                 inet_putpeer(peer);
1337         else
1338                 rt->rt_peer_genid = rt_peer_genid();
1339 }
1340
1341 /*
1342  * Peer allocation may fail only in serious out-of-memory conditions.  However
1343  * we still can generate some output.
1344  * Random ID selection looks a bit dangerous because we have no chances to
1345  * select ID being unique in a reasonable period of time.
1346  * But broken packet identifier may be better than no packet at all.
1347  */
1348 static void ip_select_fb_ident(struct iphdr *iph)
1349 {
1350         static DEFINE_SPINLOCK(ip_fb_id_lock);
1351         static u32 ip_fallback_id;
1352         u32 salt;
1353
1354         spin_lock_bh(&ip_fb_id_lock);
1355         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1356         iph->id = htons(salt & 0xFFFF);
1357         ip_fallback_id = salt;
1358         spin_unlock_bh(&ip_fb_id_lock);
1359 }
1360
1361 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1362 {
1363         struct rtable *rt = (struct rtable *) dst;
1364
1365         if (rt && !(rt->dst.flags & DST_NOPEER)) {
1366                 if (rt->peer == NULL)
1367                         rt_bind_peer(rt, rt->rt_dst, 1);
1368
1369                 /* If peer is attached to destination, it is never detached,
1370                    so that we need not to grab a lock to dereference it.
1371                  */
1372                 if (rt->peer) {
1373                         iph->id = htons(inet_getid(rt->peer, more));
1374                         return;
1375                 }
1376         } else if (!rt)
1377                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1378                        __builtin_return_address(0));
1379
1380         ip_select_fb_ident(iph);
1381 }
1382 EXPORT_SYMBOL(__ip_select_ident);
1383
1384 static void rt_del(unsigned hash, struct rtable *rt)
1385 {
1386         struct rtable __rcu **rthp;
1387         struct rtable *aux;
1388
1389         rthp = &rt_hash_table[hash].chain;
1390         spin_lock_bh(rt_hash_lock_addr(hash));
1391         ip_rt_put(rt);
1392         while ((aux = rcu_dereference_protected(*rthp,
1393                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1394                 if (aux == rt || rt_is_expired(aux)) {
1395                         *rthp = aux->dst.rt_next;
1396                         rt_free(aux);
1397                         continue;
1398                 }
1399                 rthp = &aux->dst.rt_next;
1400         }
1401         spin_unlock_bh(rt_hash_lock_addr(hash));
1402 }
1403
1404 static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1405 {
1406         struct rtable *rt = (struct rtable *) dst;
1407         __be32 orig_gw = rt->rt_gateway;
1408         struct neighbour *n, *old_n;
1409
1410         dst_confirm(&rt->dst);
1411
1412         rt->rt_gateway = peer->redirect_learned.a4;
1413
1414         n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1415         if (IS_ERR(n)) {
1416                 rt->rt_gateway = orig_gw;
1417                 return;
1418         }
1419         old_n = xchg(&rt->dst._neighbour, n);
1420         if (old_n)
1421                 neigh_release(old_n);
1422         if (!(n->nud_state & NUD_VALID)) {
1423                 neigh_event_send(n, NULL);
1424         } else {
1425                 rt->rt_flags |= RTCF_REDIRECTED;
1426                 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1427         }
1428 }
1429
1430 /* called in rcu_read_lock() section */
     /*
      * Process a redirect from old_gw advising new_gw for daddr, seen on dev.
      *
      * The redirect is rejected when new_gw equals old_gw, when the device
      * does not accept redirects, or when new_gw is a multicast, limited
      * broadcast or zeronet address.  On non-shared media new_gw must pass
      * inet_addr_onlink() against old_gw and, with secure redirects, must
      * not fail ip_fib_check_default().  On shared media new_gw must be of
      * type RTN_UNICAST.
      *
      * For each combination of source key {saddr, 0} and oif key
      * {dev->ifindex, 0}, the matching hash chain is walked.  Every live
      * output route for daddr that goes through old_gw on dev gets a peer
      * bound.  The peer records new_gw, bumping __rt_peer_genid when the
      * stored value changes, and check_peer_redir() is applied to the route.
      */
1431 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1432                     __be32 saddr, struct net_device *dev)
1433 {
1434         int s, i;
1435         struct in_device *in_dev = __in_dev_get_rcu(dev);
1436         __be32 skeys[2] = { saddr, 0 };
1437         int    ikeys[2] = { dev->ifindex, 0 };
1438         struct inet_peer *peer;
1439         struct net *net;
1440
1441         if (!in_dev)
1442                 return;
1443
1444         net = dev_net(dev);
1445         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1446             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1447             ipv4_is_zeronet(new_gw))
1448                 goto reject_redirect;
1449
1450         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1451                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1452                         goto reject_redirect;
1453                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1454                         goto reject_redirect;
1455         } else {
1456                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1457                         goto reject_redirect;
1458         }
1459
1460         for (s = 0; s < 2; s++) {
1461                 for (i = 0; i < 2; i++) {
1462                         unsigned int hash;
1463                         struct rtable __rcu **rthp;
1464                         struct rtable *rt;
1465
1466                         hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1467
1468                         rthp = &rt_hash_table[hash].chain;
1469
                             /* Walk the chain without the bucket lock; nothing is unlinked here. */
1470                         while ((rt = rcu_dereference(*rthp)) != NULL) {
1471                                 rthp = &rt->dst.rt_next;
1472
1473                                 if (rt->rt_key_dst != daddr ||
1474                                     rt->rt_key_src != skeys[s] ||
1475                                     rt->rt_oif != ikeys[i] ||
1476                                     rt_is_input_route(rt) ||
1477                                     rt_is_expired(rt) ||
1478                                     !net_eq(dev_net(rt->dst.dev), net) ||
1479                                     rt->dst.error ||
1480                                     rt->dst.dev != dev ||
1481                                     rt->rt_gateway != old_gw)
1482                                         continue;
1483
1484                                 if (!rt->peer)
1485                                         rt_bind_peer(rt, rt->rt_dst, 1);
1486
1487                                 peer = rt->peer;
1488                                 if (peer) {
                                             /* Only bump the global genid when the learned redirect changes. */
1489                                         if (peer->redirect_learned.a4 != new_gw ||
1490                                             peer->redirect_genid != redirect_genid) {
1491                                                 peer->redirect_learned.a4 = new_gw;
1492                                                 peer->redirect_genid = redirect_genid;
1493                                                 atomic_inc(&__rt_peer_genid);
1494                                         }
1495                                         check_peer_redir(&rt->dst, peer);
1496                                 }
1497                         }
1498                 }
1499         }
1500         return;
1501
1502 reject_redirect:
1503 #ifdef CONFIG_IP_ROUTE_VERBOSE
1504         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1505                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1506                         "  Advised path = %pI4 -> %pI4\n",
1507                        &old_gw, dev->name, &new_gw,
1508                        &saddr, &daddr);
1509 #endif
             /* Empty statement so the label is followed by a statement when the #ifdef block is compiled out. */
1510         ;
1511 }
1512
1513 static bool peer_pmtu_expired(struct inet_peer *peer)
1514 {
1515         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1516
1517         return orig &&
1518                time_after_eq(jiffies, orig) &&
1519                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1520 }
1521
1522 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1523 {
1524         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1525
1526         return orig &&
1527                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1528 }
1529
1530 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1531 {
1532         struct rtable *rt = (struct rtable *)dst;
1533         struct dst_entry *ret = dst;
1534
1535         if (rt) {
1536                 if (dst->obsolete > 0) {
1537                         ip_rt_put(rt);
1538                         ret = NULL;
1539                 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1540                         unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1541                                                 rt->rt_oif,
1542                                                 rt_genid(dev_net(dst->dev)));
1543                         rt_del(hash, rt);
1544                         ret = NULL;
1545                 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1546                         dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1547                 }
1548         }
1549         return ret;
1550 }
1551
1552 /*
1553  * Algorithm:
1554  *      1. The first ip_rt_redirect_number redirects are sent
1555  *         with exponential backoff, then we stop sending them at all,
1556  *         assuming that the host ignores our redirects.
1557  *      2. If we did not see packets requiring redirects
1558  *         during ip_rt_redirect_silence, we assume that the host
1559  *         forgot redirected route and start to send redirects again.
1560  *
1561  * This algorithm is much cheaper and more intelligent than dumb load limiting
1562  * in icmp.c.
1563  *
1564  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1565  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1566  */
1567
1568 void ip_rt_send_redirect(struct sk_buff *skb)
1569 {
1570         struct rtable *rt = skb_rtable(skb);
1571         struct in_device *in_dev;
1572         struct inet_peer *peer;
1573         int log_martians;
1574
1575         rcu_read_lock();
1576         in_dev = __in_dev_get_rcu(rt->dst.dev);
1577         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1578                 rcu_read_unlock();
1579                 return;
1580         }
1581         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1582         rcu_read_unlock();
1583
1584         if (!rt->peer)
1585                 rt_bind_peer(rt, rt->rt_dst, 1);
1586         peer = rt->peer;
1587         if (!peer) {
1588                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1589                 return;
1590         }
1591
1592         /* No redirected packets during ip_rt_redirect_silence;
1593          * reset the algorithm.
1594          */
1595         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1596                 peer->rate_tokens = 0;
1597
1598         /* Too many ignored redirects; do not send anything
1599          * set dst.rate_last to the last seen redirected packet.
1600          */
1601         if (peer->rate_tokens >= ip_rt_redirect_number) {
1602                 peer->rate_last = jiffies;
1603                 return;
1604         }
1605
1606         /* Check for load limit; set rate_last to the latest sent
1607          * redirect.
1608          */
1609         if (peer->rate_tokens == 0 ||
1610             time_after(jiffies,
1611                        (peer->rate_last +
1612                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1613                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1614                 peer->rate_last = jiffies;
1615                 ++peer->rate_tokens;
1616 #ifdef CONFIG_IP_ROUTE_VERBOSE
1617                 if (log_martians &&
1618                     peer->rate_tokens == ip_rt_redirect_number &&
1619                     net_ratelimit())
1620                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1621                                &ip_hdr(skb)->saddr, rt->rt_iif,
1622                                 &rt->rt_dst, &rt->rt_gateway);
1623 #endif
1624         }
1625 }
1626
1627 static int ip_error(struct sk_buff *skb)
1628 {
1629         struct rtable *rt = skb_rtable(skb);
1630         struct inet_peer *peer;
1631         unsigned long now;
1632         bool send;
1633         int code;
1634
1635         switch (rt->dst.error) {
1636         case EINVAL:
1637         default:
1638                 goto out;
1639         case EHOSTUNREACH:
1640                 code = ICMP_HOST_UNREACH;
1641                 break;
1642         case ENETUNREACH:
1643                 code = ICMP_NET_UNREACH;
1644                 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1645                                 IPSTATS_MIB_INNOROUTES);
1646                 break;
1647         case EACCES:
1648                 code = ICMP_PKT_FILTERED;
1649                 break;
1650         }
1651
1652         if (!rt->peer)
1653                 rt_bind_peer(rt, rt->rt_dst, 1);
1654         peer = rt->peer;
1655
1656         send = true;
1657         if (peer) {
1658                 now = jiffies;
1659                 peer->rate_tokens += now - peer->rate_last;
1660                 if (peer->rate_tokens > ip_rt_error_burst)
1661                         peer->rate_tokens = ip_rt_error_burst;
1662                 peer->rate_last = now;
1663                 if (peer->rate_tokens >= ip_rt_error_cost)
1664                         peer->rate_tokens -= ip_rt_error_cost;
1665                 else
1666                         send = false;
1667         }
1668         if (send)
1669                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1670
1671 out:    kfree_skb(skb);
1672         return 0;
1673 }
1674
/*
 *      Table of MTU plateaus, in descending order.
 *      The last two values are not from the RFC but
 *      are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] = {
        32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};

/*
 * Return the largest plateau strictly below @old_mtu, or 68 when
 * @old_mtu is not above the smallest plateau.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
        const unsigned short *plateau = mtu_plateau;
        const unsigned short *end = mtu_plateau +
                                    sizeof(mtu_plateau) / sizeof(mtu_plateau[0]);

        /* The table is sorted high to low: the first hit is the answer. */
        while (plateau != end) {
                if (old_mtu > *plateau)
                        return *plateau;
                plateau++;
        }
        return 68;
}
1692
1693 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1694                                  unsigned short new_mtu,
1695                                  struct net_device *dev)
1696 {
1697         unsigned short old_mtu = ntohs(iph->tot_len);
1698         unsigned short est_mtu = 0;
1699         struct inet_peer *peer;
1700
1701         peer = inet_getpeer_v4(iph->daddr, 1);
1702         if (peer) {
1703                 unsigned short mtu = new_mtu;
1704
1705                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1706                         /* BSD 4.2 derived systems incorrectly adjust
1707                          * tot_len by the IP header length, and report
1708                          * a zero MTU in the ICMP message.
1709                          */
1710                         if (mtu == 0 &&
1711                             old_mtu >= 68 + (iph->ihl << 2))
1712                                 old_mtu -= iph->ihl << 2;
1713                         mtu = guess_mtu(old_mtu);
1714                 }
1715
1716                 if (mtu < ip_rt_min_pmtu)
1717                         mtu = ip_rt_min_pmtu;
1718                 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1719                         unsigned long pmtu_expires;
1720
1721                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1722                         if (!pmtu_expires)
1723                                 pmtu_expires = 1UL;
1724
1725                         est_mtu = mtu;
1726                         peer->pmtu_learned = mtu;
1727                         peer->pmtu_expires = pmtu_expires;
1728                         atomic_inc(&__rt_peer_genid);
1729                 }
1730
1731                 inet_putpeer(peer);
1732         }
1733         return est_mtu ? : new_mtu;
1734 }
1735
1736 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1737 {
1738         unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1739
1740         if (!expires)
1741                 return;
1742         if (time_before(jiffies, expires)) {
1743                 u32 orig_dst_mtu = dst_mtu(dst);
1744                 if (peer->pmtu_learned < orig_dst_mtu) {
1745                         if (!peer->pmtu_orig)
1746                                 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1747                         dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1748                 }
1749         } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1750                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1751 }
1752
1753 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1754 {
1755         struct rtable *rt = (struct rtable *) dst;
1756         struct inet_peer *peer;
1757
1758         dst_confirm(dst);
1759
1760         if (!rt->peer)
1761                 rt_bind_peer(rt, rt->rt_dst, 1);
1762         peer = rt->peer;
1763         if (peer) {
1764                 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1765
1766                 if (mtu < ip_rt_min_pmtu)
1767                         mtu = ip_rt_min_pmtu;
1768                 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1769
1770                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1771                         if (!pmtu_expires)
1772                                 pmtu_expires = 1UL;
1773
1774                         peer->pmtu_learned = mtu;
1775                         peer->pmtu_expires = pmtu_expires;
1776
1777                         atomic_inc(&__rt_peer_genid);
1778                         rt->rt_peer_genid = rt_peer_genid();
1779                 }
1780                 check_peer_pmtu(dst, peer);
1781         }
1782 }
1783
1784
1785 static void ipv4_validate_peer(struct rtable *rt)
1786 {
1787         if (rt->rt_peer_genid != rt_peer_genid()) {
1788                 struct inet_peer *peer;
1789
1790                 if (!rt->peer)
1791                         rt_bind_peer(rt, rt->rt_dst, 0);
1792
1793                 peer = rt->peer;
1794                 if (peer) {
1795                         check_peer_pmtu(&rt->dst, peer);
1796
1797                         if (peer->redirect_genid != redirect_genid)
1798                                 peer->redirect_learned.a4 = 0;
1799                         if (peer->redirect_learned.a4 &&
1800                             peer->redirect_learned.a4 != rt->rt_gateway)
1801                                 check_peer_redir(&rt->dst, peer);
1802                 }
1803
1804                 rt->rt_peer_genid = rt_peer_genid();
1805         }
1806 }
1807
1808 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1809 {
1810         struct rtable *rt = (struct rtable *) dst;
1811
1812         if (rt_is_expired(rt))
1813                 return NULL;
1814         ipv4_validate_peer(rt);
1815         return dst;
1816 }
1817
1818 static void ipv4_dst_destroy(struct dst_entry *dst)
1819 {
1820         struct rtable *rt = (struct rtable *) dst;
1821         struct inet_peer *peer = rt->peer;
1822
1823         if (rt->fi) {
1824                 fib_info_put(rt->fi);
1825                 rt->fi = NULL;
1826         }
1827         if (peer) {
1828                 rt->peer = NULL;
1829                 inet_putpeer(peer);
1830         }
1831 }
1832
1833
1834 static void ipv4_link_failure(struct sk_buff *skb)
1835 {
1836         struct rtable *rt;
1837
1838         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1839
1840         rt = skb_rtable(skb);
1841         if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1842                 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1843 }
1844
1845 static int ip_rt_bug(struct sk_buff *skb)
1846 {
1847         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1848                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1849                 skb->dev ? skb->dev->name : "?");
1850         kfree_skb(skb);
1851         WARN_ON(1);
1852         return 0;
1853 }
1854
1855 /*
1856    We do not cache source address of outgoing interface,
1857    because it is used only by IP RR, TS and SRR options,
1858    so that it out of fast path.
1859
1860    BTW remember: "addr" is allowed to be not aligned
1861    in IP options!
1862  */
1863
1864 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1865 {
1866         __be32 src;
1867
1868         if (rt_is_output_route(rt))
1869                 src = ip_hdr(skb)->saddr;
1870         else {
1871                 struct fib_result res;
1872                 struct flowi4 fl4;
1873                 struct iphdr *iph;
1874
1875                 iph = ip_hdr(skb);
1876
1877                 memset(&fl4, 0, sizeof(fl4));
1878                 fl4.daddr = iph->daddr;
1879                 fl4.saddr = iph->saddr;
1880                 fl4.flowi4_tos = RT_TOS(iph->tos);
1881                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1882                 fl4.flowi4_iif = skb->dev->ifindex;
1883                 fl4.flowi4_mark = skb->mark;
1884
1885                 rcu_read_lock();
1886                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1887                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1888                 else
1889                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1890                                         RT_SCOPE_UNIVERSE);
1891                 rcu_read_unlock();
1892         }
1893         memcpy(addr, &src, 4);
1894 }
1895
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into the route's tclassid.  The low and high 16-bit
 * halves are handled independently: a half is filled from @tag only
 * if it is currently zero, so values that are already set win.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
        static const u32 half_masks[] = { 0x0000FFFF, 0xFFFF0000 };
        unsigned int i;

        for (i = 0; i < ARRAY_SIZE(half_masks); i++) {
                u32 mask = half_masks[i];

                if (!(rt->dst.tclassid & mask))
                        rt->dst.tclassid |= tag & mask;
        }
}
#endif
1905
1906 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1907 {
1908         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1909
1910         if (advmss == 0) {
1911                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1912                                ip_rt_min_advmss);
1913                 if (advmss > 65535 - 40)
1914                         advmss = 65535 - 40;
1915         }
1916         return advmss;
1917 }
1918
1919 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1920 {
1921         const struct rtable *rt = (const struct rtable *) dst;
1922         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1923
1924         if (mtu && rt_is_output_route(rt))
1925                 return mtu;
1926
1927         mtu = dst->dev->mtu;
1928
1929         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1930
1931                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1932                         mtu = 576;
1933         }
1934
1935         if (mtu > IP_MAX_MTU)
1936                 mtu = IP_MAX_MTU;
1937
1938         return mtu;
1939 }
1940
1941 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1942                             struct fib_info *fi)
1943 {
1944         struct inet_peer *peer;
1945         int create = 0;
1946
1947         /* If a peer entry exists for this destination, we must hook
1948          * it up in order to get at cached metrics.
1949          */
1950         if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1951                 create = 1;
1952
1953         rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1954         if (peer) {
1955                 rt->rt_peer_genid = rt_peer_genid();
1956                 if (inet_metrics_new(peer))
1957                         memcpy(peer->metrics, fi->fib_metrics,
1958                                sizeof(u32) * RTAX_MAX);
1959                 dst_init_metrics(&rt->dst, peer->metrics, false);
1960
1961                 check_peer_pmtu(&rt->dst, peer);
1962                 if (peer->redirect_genid != redirect_genid)
1963                         peer->redirect_learned.a4 = 0;
1964                 if (peer->redirect_learned.a4 &&
1965                     peer->redirect_learned.a4 != rt->rt_gateway) {
1966                         rt->rt_gateway = peer->redirect_learned.a4;
1967                         rt->rt_flags |= RTCF_REDIRECTED;
1968                 }
1969         } else {
1970                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1971                         rt->fi = fi;
1972                         atomic_inc(&fi->fib_clntref);
1973                 }
1974                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1975         }
1976 }
1977
1978 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1979                            const struct fib_result *res,
1980                            struct fib_info *fi, u16 type, u32 itag)
1981 {
1982         struct dst_entry *dst = &rt->dst;
1983
1984         if (fi) {
1985                 if (FIB_RES_GW(*res) &&
1986                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1987                         rt->rt_gateway = FIB_RES_GW(*res);
1988                 rt_init_metrics(rt, fl4, fi);
1989 #ifdef CONFIG_IP_ROUTE_CLASSID
1990                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1991 #endif
1992         }
1993
1994         if (dst_mtu(dst) > IP_MAX_MTU)
1995                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1996         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1997                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1998
1999 #ifdef CONFIG_IP_ROUTE_CLASSID
2000 #ifdef CONFIG_IP_MULTIPLE_TABLES
2001         set_class_tag(rt, fib_rules_tclass(res));
2002 #endif
2003         set_class_tag(rt, itag);
2004 #endif
2005 }
2006
2007 static struct rtable *rt_dst_alloc(struct net_device *dev,
2008                                    bool nopolicy, bool noxfrm)
2009 {
2010         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
2011                          DST_HOST |
2012                          (nopolicy ? DST_NOPOLICY : 0) |
2013                          (noxfrm ? DST_NOXFRM : 0));
2014 }
2015
2016 /* called in rcu_read_lock() section */
2017 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2018                                 u8 tos, struct net_device *dev, int our)
2019 {
2020         unsigned int hash;
2021         struct rtable *rth;
2022         __be32 spec_dst;
2023         struct in_device *in_dev = __in_dev_get_rcu(dev);
2024         u32 itag = 0;
2025         int err;
2026
2027         /* Primary sanity checks. */
2028
2029         if (in_dev == NULL)
2030                 return -EINVAL;
2031
2032         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2033             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
2034                 goto e_inval;
2035
2036         if (ipv4_is_zeronet(saddr)) {
2037                 if (!ipv4_is_local_multicast(daddr))
2038                         goto e_inval;
2039                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2040         } else {
2041                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2042                                           &itag);
2043                 if (err < 0)
2044                         goto e_err;
2045         }
2046         rth = rt_dst_alloc(init_net.loopback_dev,
2047                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2048         if (!rth)
2049                 goto e_nobufs;
2050
2051 #ifdef CONFIG_IP_ROUTE_CLASSID
2052         rth->dst.tclassid = itag;
2053 #endif
2054         rth->dst.output = ip_rt_bug;
2055
2056         rth->rt_key_dst = daddr;
2057         rth->rt_key_src = saddr;
2058         rth->rt_genid   = rt_genid(dev_net(dev));
2059         rth->rt_flags   = RTCF_MULTICAST;
2060         rth->rt_type    = RTN_MULTICAST;
2061         rth->rt_key_tos = tos;
2062         rth->rt_dst     = daddr;
2063         rth->rt_src     = saddr;
2064         rth->rt_route_iif = dev->ifindex;
2065         rth->rt_iif     = dev->ifindex;
2066         rth->rt_oif     = 0;
2067         rth->rt_mark    = skb->mark;
2068         rth->rt_gateway = daddr;
2069         rth->rt_spec_dst= spec_dst;
2070         rth->rt_peer_genid = 0;
2071         rth->peer = NULL;
2072         rth->fi = NULL;
2073         if (our) {
2074                 rth->dst.input= ip_local_deliver;
2075                 rth->rt_flags |= RTCF_LOCAL;
2076         }
2077
2078 #ifdef CONFIG_IP_MROUTE
2079         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2080                 rth->dst.input = ip_mr_input;
2081 #endif
2082         RT_CACHE_STAT_INC(in_slow_mc);
2083
2084         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2085         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2086         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2087
2088 e_nobufs:
2089         return -ENOBUFS;
2090 e_inval:
2091         return -EINVAL;
2092 e_err:
2093         return err;
2094 }
2095
2096
2097 static void ip_handle_martian_source(struct net_device *dev,
2098                                      struct in_device *in_dev,
2099                                      struct sk_buff *skb,
2100                                      __be32 daddr,
2101                                      __be32 saddr)
2102 {
2103         RT_CACHE_STAT_INC(in_martian_src);
2104 #ifdef CONFIG_IP_ROUTE_VERBOSE
2105         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2106                 /*
2107                  *      RFC1812 recommendation, if source is martian,
2108                  *      the only hint is MAC header.
2109                  */
2110                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
2111                         &daddr, &saddr, dev->name);
2112                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2113                         int i;
2114                         const unsigned char *p = skb_mac_header(skb);
2115                         printk(KERN_WARNING "ll header: ");
2116                         for (i = 0; i < dev->hard_header_len; i++, p++) {
2117                                 printk("%02x", *p);
2118                                 if (i < (dev->hard_header_len - 1))
2119                                         printk(":");
2120                         }
2121                         printk("\n");
2122                 }
2123         }
2124 #endif
2125 }
2126
2127 /* called in rcu_read_lock() section */
2128 static int __mkroute_input(struct sk_buff *skb,
2129                            const struct fib_result *res,
2130                            struct in_device *in_dev,
2131                            __be32 daddr, __be32 saddr, u32 tos,
2132                            struct rtable **result)
2133 {
2134         struct rtable *rth;
2135         int err;
2136         struct in_device *out_dev;
2137         unsigned int flags = 0;
2138         __be32 spec_dst;
2139         u32 itag;
2140
2141         /* get a working reference to the output device */
2142         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2143         if (out_dev == NULL) {
2144                 if (net_ratelimit())
2145                         printk(KERN_CRIT "Bug in ip_route_input" \
2146                                "_slow(). Please, report\n");
2147                 return -EINVAL;
2148         }
2149
2150
2151         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2152                                   in_dev->dev, &spec_dst, &itag);
2153         if (err < 0) {
2154                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2155                                          saddr);
2156
2157                 goto cleanup;
2158         }
2159
2160         if (err)
2161                 flags |= RTCF_DIRECTSRC;
2162
2163         if (out_dev == in_dev && err &&
2164             (IN_DEV_SHARED_MEDIA(out_dev) ||
2165              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2166                 flags |= RTCF_DOREDIRECT;
2167
2168         if (skb->protocol != htons(ETH_P_IP)) {
2169                 /* Not IP (i.e. ARP). Do not create route, if it is
2170                  * invalid for proxy arp. DNAT routes are always valid.
2171                  *
2172                  * Proxy arp feature have been extended to allow, ARP
2173                  * replies back to the same interface, to support
2174                  * Private VLAN switch technologies. See arp.c.
2175                  */
2176                 if (out_dev == in_dev &&
2177                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2178                         err = -EINVAL;
2179                         goto cleanup;
2180                 }
2181         }
2182
2183         rth = rt_dst_alloc(out_dev->dev,
2184                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2185                            IN_DEV_CONF_GET(out_dev, NOXFRM));
2186         if (!rth) {
2187                 err = -ENOBUFS;
2188                 goto cleanup;
2189         }
2190
2191         rth->rt_key_dst = daddr;
2192         rth->rt_key_src = saddr;
2193         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2194         rth->rt_flags = flags;
2195         rth->rt_type = res->type;
2196         rth->rt_key_tos = tos;
2197         rth->rt_dst     = daddr;
2198         rth->rt_src     = saddr;
2199         rth->rt_route_iif = in_dev->dev->ifindex;
2200         rth->rt_iif     = in_dev->dev->ifindex;
2201         rth->rt_oif     = 0;
2202         rth->rt_mark    = skb->mark;
2203         rth->rt_gateway = daddr;
2204         rth->rt_spec_dst= spec_dst;
2205         rth->rt_peer_genid = 0;
2206         rth->peer = NULL;
2207         rth->fi = NULL;
2208
2209         rth->dst.input = ip_forward;
2210         rth->dst.output = ip_output;
2211
2212         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2213
2214         *result = rth;
2215         err = 0;
2216  cleanup:
2217         return err;
2218 }
2219
2220 static int ip_mkroute_input(struct sk_buff *skb,
2221                             struct fib_result *res,
2222                             const struct flowi4 *fl4,
2223                             struct in_device *in_dev,
2224                             __be32 daddr, __be32 saddr, u32 tos)
2225 {
2226         struct rtable* rth = NULL;
2227         int err;
2228         unsigned hash;
2229
2230 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2231         if (res->fi && res->fi->fib_nhs > 1)
2232                 fib_select_multipath(res);
2233 #endif
2234
2235         /* create a routing cache entry */
2236         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2237         if (err)
2238                 return err;
2239
2240         /* put it into the cache */
2241         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2242                        rt_genid(dev_net(rth->dst.dev)));
2243         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2244         if (IS_ERR(rth))
2245                 return PTR_ERR(rth);
2246         return 0;
2247 }
2248
2249 /*
2250  *      NOTE. We drop all the packets that has local source
2251  *      addresses, because every properly looped back packet
2252  *      must have correct destination already attached by output routine.
2253  *
2254  *      Such approach solves two big problems:
2255  *      1. Not simplex devices are handled properly.
2256  *      2. IP spoofing attempts are filtered with 100% of guarantee.
2257  *      called with rcu_read_lock()
2258  */
2259
2260 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2261                                u8 tos, struct net_device *dev)
2262 {
2263         struct fib_result res;
2264         struct in_device *in_dev = __in_dev_get_rcu(dev);
2265         struct flowi4   fl4;
2266         unsigned        flags = 0;
2267         u32             itag = 0;
2268         struct rtable * rth;
2269         unsigned        hash;
2270         __be32          spec_dst;
2271         int             err = -EINVAL;
2272         struct net    * net = dev_net(dev);
2273
2274         /* IP on this device is disabled. */
2275
2276         if (!in_dev)
2277                 goto out;
2278
2279         /* Check for the most weird martians, which can be not detected
2280            by fib_lookup.
2281          */
2282
2283         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2284             ipv4_is_loopback(saddr))
2285                 goto martian_source;
2286
2287         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2288                 goto brd_input;
2289
2290         /* Accept zero addresses only to limited broadcast;
2291          * I even do not know to fix it or not. Waiting for complains :-)
2292          */
2293         if (ipv4_is_zeronet(saddr))
2294                 goto martian_source;
2295
2296         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2297                 goto martian_destination;
2298
2299         /*
2300          *      Now we are ready to route packet.
2301          */
2302         fl4.flowi4_oif = 0;
2303         fl4.flowi4_iif = dev->ifindex;
2304         fl4.flowi4_mark = skb->mark;
2305         fl4.flowi4_tos = tos;
2306         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2307         fl4.daddr = daddr;
2308         fl4.saddr = saddr;
2309         err = fib_lookup(net, &fl4, &res);
2310         if (err != 0) {
2311                 if (!IN_DEV_FORWARD(in_dev))
2312                         goto e_hostunreach;
2313                 goto no_route;
2314         }
2315
2316         RT_CACHE_STAT_INC(in_slow_tot);
2317
2318         if (res.type == RTN_BROADCAST)
2319                 goto brd_input;
2320
2321         if (res.type == RTN_LOCAL) {
2322                 err = fib_validate_source(skb, saddr, daddr, tos,
2323                                           net->loopback_dev->ifindex,
2324                                           dev, &spec_dst, &itag);
2325                 if (err < 0)
2326                         goto martian_source_keep_err;
2327                 if (err)
2328                         flags |= RTCF_DIRECTSRC;
2329                 spec_dst = daddr;
2330                 goto local_input;
2331         }
2332
2333         if (!IN_DEV_FORWARD(in_dev))
2334                 goto e_hostunreach;
2335         if (res.type != RTN_UNICAST)
2336                 goto martian_destination;
2337
2338         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2339 out:    return err;
2340
2341 brd_input:
2342         if (skb->protocol != htons(ETH_P_IP))
2343                 goto e_inval;
2344
2345         if (ipv4_is_zeronet(saddr))
2346                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2347         else {
2348                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2349                                           &itag);
2350                 if (err < 0)
2351                         goto martian_source_keep_err;
2352                 if (err)
2353                         flags |= RTCF_DIRECTSRC;
2354         }
2355         flags |= RTCF_BROADCAST;
2356         res.type = RTN_BROADCAST;
2357         RT_CACHE_STAT_INC(in_brd);
2358
2359 local_input:
2360         rth = rt_dst_alloc(net->loopback_dev,
2361                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2362         if (!rth)
2363                 goto e_nobufs;
2364
2365         rth->dst.input= ip_local_deliver;
2366         rth->dst.output= ip_rt_bug;
2367 #ifdef CONFIG_IP_ROUTE_CLASSID
2368         rth->dst.tclassid = itag;
2369 #endif
2370
2371         rth->rt_key_dst = daddr;
2372         rth->rt_key_src = saddr;
2373         rth->rt_genid = rt_genid(net);
2374         rth->rt_flags   = flags|RTCF_LOCAL;
2375         rth->rt_type    = res.type;
2376         rth->rt_key_tos = tos;
2377         rth->rt_dst     = daddr;
2378         rth->rt_src     = saddr;
2379 #ifdef CONFIG_IP_ROUTE_CLASSID
2380         rth->dst.tclassid = itag;
2381 #endif
2382         rth->rt_route_iif = dev->ifindex;
2383         rth->rt_iif     = dev->ifindex;
2384         rth->rt_oif     = 0;
2385         rth->rt_mark    = skb->mark;
2386         rth->rt_gateway = daddr;
2387         rth->rt_spec_dst= spec_dst;
2388         rth->rt_peer_genid = 0;
2389         rth->peer = NULL;
2390         rth->fi = NULL;
2391         if (res.type == RTN_UNREACHABLE) {
2392                 rth->dst.input= ip_error;
2393                 rth->dst.error= -err;
2394                 rth->rt_flags   &= ~RTCF_LOCAL;
2395         }
2396         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2397         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2398         err = 0;
2399         if (IS_ERR(rth))
2400                 err = PTR_ERR(rth);
2401         goto out;
2402
2403 no_route:
2404         RT_CACHE_STAT_INC(in_no_route);
2405         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2406         res.type = RTN_UNREACHABLE;
2407         if (err == -ESRCH)
2408                 err = -ENETUNREACH;
2409         goto local_input;
2410
2411         /*
2412          *      Do not cache martian addresses: they should be logged (RFC1812)
2413          */
2414 martian_destination:
2415         RT_CACHE_STAT_INC(in_martian_dst);
2416 #ifdef CONFIG_IP_ROUTE_VERBOSE
2417         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2418                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2419                         &daddr, &saddr, dev->name);
2420 #endif
2421
2422 e_hostunreach:
2423         err = -EHOSTUNREACH;
2424         goto out;
2425
2426 e_inval:
2427         err = -EINVAL;
2428         goto out;
2429
2430 e_nobufs:
2431         err = -ENOBUFS;
2432         goto out;
2433
2434 martian_source:
2435         err = -EINVAL;
2436 martian_source_keep_err:
2437         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2438         goto out;
2439 }
2440
2441 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2442                            u8 tos, struct net_device *dev, bool noref)
2443 {
2444         struct rtable * rth;
2445         unsigned        hash;
2446         int iif = dev->ifindex;
2447         struct net *net;
2448         int res;
2449
2450         net = dev_net(dev);
2451
2452         rcu_read_lock();
2453
2454         if (!rt_caching(net))
2455                 goto skip_cache;
2456
2457         tos &= IPTOS_RT_MASK;
2458         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2459
2460         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2461              rth = rcu_dereference(rth->dst.rt_next)) {
2462                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2463                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2464                      (rth->rt_route_iif ^ iif) |
2465                      (rth->rt_key_tos ^ tos)) == 0 &&
2466                     rth->rt_mark == skb->mark &&
2467                     net_eq(dev_net(rth->dst.dev), net) &&
2468                     !rt_is_expired(rth)) {
2469                         ipv4_validate_peer(rth);
2470                         if (noref) {
2471                                 dst_use_noref(&rth->dst, jiffies);
2472                                 skb_dst_set_noref(skb, &rth->dst);
2473                         } else {
2474                                 dst_use(&rth->dst, jiffies);
2475                                 skb_dst_set(skb, &rth->dst);
2476                         }
2477                         RT_CACHE_STAT_INC(in_hit);
2478                         rcu_read_unlock();
2479                         return 0;
2480                 }
2481                 RT_CACHE_STAT_INC(in_hlist_search);
2482         }
2483
2484 skip_cache:
2485         /* Multicast recognition logic is moved from route cache to here.
2486            The problem was that too many Ethernet cards have broken/missing
2487            hardware multicast filters :-( As result the host on multicasting
2488            network acquires a lot of useless route cache entries, sort of
2489            SDR messages from all the world. Now we try to get rid of them.
2490            Really, provided software IP multicast filter is organized
2491            reasonably (at least, hashed), it does not result in a slowdown
2492            comparing with route cache reject entries.
2493            Note, that multicast routers are not affected, because
2494            route cache entry is created eventually.
2495          */
2496         if (ipv4_is_multicast(daddr)) {
2497                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2498
2499                 if (in_dev) {
2500                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2501                                                   ip_hdr(skb)->protocol);
2502                         if (our
2503 #ifdef CONFIG_IP_MROUTE
2504                                 ||
2505                             (!ipv4_is_local_multicast(daddr) &&
2506                              IN_DEV_MFORWARD(in_dev))
2507 #endif
2508                            ) {
2509                                 int res = ip_route_input_mc(skb, daddr, saddr,
2510                                                             tos, dev, our);
2511                                 rcu_read_unlock();
2512                                 return res;
2513                         }
2514                 }
2515                 rcu_read_unlock();
2516                 return -EINVAL;
2517         }
2518         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2519         rcu_read_unlock();
2520         return res;
2521 }
2522 EXPORT_SYMBOL(ip_route_input_common);
2523
2524 /* called with rcu_read_lock() */
2525 static struct rtable *__mkroute_output(const struct fib_result *res,
2526                                        const struct flowi4 *fl4,
2527                                        __be32 orig_daddr, __be32 orig_saddr,
2528                                        int orig_oif, __u8 orig_rtos,
2529                                        struct net_device *dev_out,
2530                                        unsigned int flags)
2531 {
2532         struct fib_info *fi = res->fi;
2533         struct in_device *in_dev;
2534         u16 type = res->type;
2535         struct rtable *rth;
2536
2537         if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2538                 return ERR_PTR(-EINVAL);
2539
2540         if (ipv4_is_lbcast(fl4->daddr))
2541                 type = RTN_BROADCAST;
2542         else if (ipv4_is_multicast(fl4->daddr))
2543                 type = RTN_MULTICAST;
2544         else if (ipv4_is_zeronet(fl4->daddr))
2545                 return ERR_PTR(-EINVAL);
2546
2547         if (dev_out->flags & IFF_LOOPBACK)
2548                 flags |= RTCF_LOCAL;
2549
2550         in_dev = __in_dev_get_rcu(dev_out);
2551         if (!in_dev)
2552                 return ERR_PTR(-EINVAL);
2553
2554         if (type == RTN_BROADCAST) {
2555                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2556                 fi = NULL;
2557         } else if (type == RTN_MULTICAST) {
2558                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2559                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2560                                      fl4->flowi4_proto))
2561                         flags &= ~RTCF_LOCAL;
2562                 /* If multicast route do not exist use
2563                  * default one, but do not gateway in this case.
2564                  * Yes, it is hack.
2565                  */
2566                 if (fi && res->prefixlen < 4)
2567                         fi = NULL;
2568         }
2569
2570         rth = rt_dst_alloc(dev_out,
2571                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2572                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2573         if (!rth)
2574                 return ERR_PTR(-ENOBUFS);
2575
2576         rth->dst.output = ip_output;
2577
2578         rth->rt_key_dst = orig_daddr;
2579         rth->rt_key_src = orig_saddr;
2580         rth->rt_genid = rt_genid(dev_net(dev_out));
2581         rth->rt_flags   = flags;
2582         rth->rt_type    = type;
2583         rth->rt_key_tos = orig_rtos;
2584         rth->rt_dst     = fl4->daddr;
2585         rth->rt_src     = fl4->saddr;
2586         rth->rt_route_iif = 0;
2587         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2588         rth->rt_oif     = orig_oif;
2589         rth->rt_mark    = fl4->flowi4_mark;
2590         rth->rt_gateway = fl4->daddr;
2591         rth->rt_spec_dst= fl4->saddr;
2592         rth->rt_peer_genid = 0;
2593         rth->peer = NULL;
2594         rth->fi = NULL;
2595
2596         RT_CACHE_STAT_INC(out_slow_tot);
2597
2598         if (flags & RTCF_LOCAL) {
2599                 rth->dst.input = ip_local_deliver;
2600                 rth->rt_spec_dst = fl4->daddr;
2601         }
2602         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2603                 rth->rt_spec_dst = fl4->saddr;
2604                 if (flags & RTCF_LOCAL &&
2605                     !(dev_out->flags & IFF_LOOPBACK)) {
2606                         rth->dst.output = ip_mc_output;
2607                         RT_CACHE_STAT_INC(out_slow_mc);
2608                 }
2609 #ifdef CONFIG_IP_MROUTE
2610                 if (type == RTN_MULTICAST) {
2611                         if (IN_DEV_MFORWARD(in_dev) &&
2612                             !ipv4_is_local_multicast(fl4->daddr)) {
2613                                 rth->dst.input = ip_mr_input;
2614                                 rth->dst.output = ip_mc_output;
2615                         }
2616                 }
2617 #endif
2618         }
2619
2620         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2621
2622         return rth;
2623 }
2624
2625 /*
2626  * Major route resolver routine.
2627  * called with rcu_read_lock();
2628  */
2629
2630 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2631 {
2632         struct net_device *dev_out = NULL;
2633         __u8 tos = RT_FL_TOS(fl4);
2634         unsigned int flags = 0;
2635         struct fib_result res;
2636         struct rtable *rth;
2637         __be32 orig_daddr;
2638         __be32 orig_saddr;
2639         int orig_oif;
2640
2641         res.fi          = NULL;
2642 #ifdef CONFIG_IP_MULTIPLE_TABLES
2643         res.r           = NULL;
2644 #endif
2645
2646         orig_daddr = fl4->daddr;
2647         orig_saddr = fl4->saddr;
2648         orig_oif = fl4->flowi4_oif;
2649
2650         fl4->flowi4_iif = net->loopback_dev->ifindex;
2651         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2652         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2653                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2654
2655         rcu_read_lock();
2656         if (fl4->saddr) {
2657                 rth = ERR_PTR(-EINVAL);
2658                 if (ipv4_is_multicast(fl4->saddr) ||
2659                     ipv4_is_lbcast(fl4->saddr) ||
2660                     ipv4_is_zeronet(fl4->saddr))
2661                         goto out;
2662
2663                 /* I removed check for oif == dev_out->oif here.
2664                    It was wrong for two reasons:
2665                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2666                       is assigned to multiple interfaces.
2667                    2. Moreover, we are allowed to send packets with saddr
2668                       of another iface. --ANK
2669                  */
2670
2671                 if (fl4->flowi4_oif == 0 &&
2672                     (ipv4_is_multicast(fl4->daddr) ||
2673                      ipv4_is_lbcast(fl4->daddr))) {
2674                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2675                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2676                         if (dev_out == NULL)
2677                                 goto out;
2678
2679                         /* Special hack: user can direct multicasts
2680                            and limited broadcast via necessary interface
2681                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2682                            This hack is not just for fun, it allows
2683                            vic,vat and friends to work.
2684                            They bind socket to loopback, set ttl to zero
2685                            and expect that it will work.
2686                            From the viewpoint of routing cache they are broken,
2687                            because we are not allowed to build multicast path
2688                            with loopback source addr (look, routing cache
2689                            cannot know, that ttl is zero, so that packet
2690                            will not leave this host and route is valid).
2691                            Luckily, this hack is good workaround.
2692                          */
2693
2694                         fl4->flowi4_oif = dev_out->ifindex;
2695                         goto make_route;
2696                 }
2697
2698                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2699                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2700                         if (!__ip_dev_find(net, fl4->saddr, false))
2701                                 goto out;
2702                 }
2703         }
2704
2705
2706         if (fl4->flowi4_oif) {
2707                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2708                 rth = ERR_PTR(-ENODEV);
2709                 if (dev_out == NULL)
2710                         goto out;
2711
2712                 /* RACE: Check return value of inet_select_addr instead. */
2713                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2714                         rth = ERR_PTR(-ENETUNREACH);
2715                         goto out;
2716                 }
2717                 if (ipv4_is_local_multicast(fl4->daddr) ||
2718                     ipv4_is_lbcast(fl4->daddr)) {
2719                         if (!fl4->saddr)
2720                                 fl4->saddr = inet_select_addr(dev_out, 0,
2721                                                               RT_SCOPE_LINK);
2722                         goto make_route;
2723                 }
2724                 if (fl4->saddr) {
2725                         if (ipv4_is_multicast(fl4->daddr))
2726                                 fl4->saddr = inet_select_addr(dev_out, 0,
2727                                                               fl4->flowi4_scope);
2728                         else if (!fl4->daddr)
2729                                 fl4->saddr = inet_select_addr(dev_out, 0,
2730                                                               RT_SCOPE_HOST);
2731                 }
2732         }
2733
2734         if (!fl4->daddr) {
2735                 fl4->daddr = fl4->saddr;
2736                 if (!fl4->daddr)
2737                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2738                 dev_out = net->loopback_dev;
2739                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2740                 res.type = RTN_LOCAL;
2741                 flags |= RTCF_LOCAL;
2742                 goto make_route;
2743         }
2744
2745         if (fib_lookup(net, fl4, &res)) {
2746                 res.fi = NULL;
2747                 if (fl4->flowi4_oif) {
2748                         /* Apparently, routing tables are wrong. Assume,
2749                            that the destination is on link.
2750
2751                            WHY? DW.
2752                            Because we are allowed to send to iface
2753                            even if it has NO routes and NO assigned
2754                            addresses. When oif is specified, routing
2755                            tables are looked up with only one purpose:
2756                            to catch if destination is gatewayed, rather than
2757                            direct. Moreover, if MSG_DONTROUTE is set,
2758                            we send packet, ignoring both routing tables
2759                            and ifaddr state. --ANK
2760
2761
2762                            We could make it even if oif is unknown,
2763                            likely IPv6, but we do not.
2764                          */
2765
2766                         if (fl4->saddr == 0)
2767                                 fl4->saddr = inet_select_addr(dev_out, 0,
2768                                                               RT_SCOPE_LINK);
2769                         res.type = RTN_UNICAST;
2770                         goto make_route;
2771                 }
2772                 rth = ERR_PTR(-ENETUNREACH);
2773                 goto out;
2774         }
2775
2776         if (res.type == RTN_LOCAL) {
2777                 if (!fl4->saddr) {
2778                         if (res.fi->fib_prefsrc)
2779                                 fl4->saddr = res.fi->fib_prefsrc;
2780                         else
2781                                 fl4->saddr = fl4->daddr;
2782                 }
2783                 dev_out = net->loopback_dev;
2784                 fl4->flowi4_oif = dev_out->ifindex;
2785                 res.fi = NULL;
2786                 flags |= RTCF_LOCAL;
2787                 goto make_route;
2788         }
2789
2790 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2791         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2792                 fib_select_multipath(&res);
2793         else
2794 #endif
2795         if (!res.prefixlen &&
2796             res.table->tb_num_default > 1 &&
2797             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2798                 fib_select_default(&res);
2799
2800         if (!fl4->saddr)
2801                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2802
2803         dev_out = FIB_RES_DEV(res);
2804         fl4->flowi4_oif = dev_out->ifindex;
2805
2806
2807 make_route:
2808         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2809                                tos, dev_out, flags);
2810         if (!IS_ERR(rth)) {
2811                 unsigned int hash;
2812
2813                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2814                                rt_genid(dev_net(dev_out)));
2815                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2816         }
2817
2818 out:
2819         rcu_read_unlock();
2820         return rth;
2821 }
2822
2823 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2824 {
2825         struct rtable *rth;
2826         unsigned int hash;
2827
2828         if (!rt_caching(net))
2829                 goto slow_output;
2830
2831         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2832
2833         rcu_read_lock_bh();
2834         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2835                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2836                 if (rth->rt_key_dst == flp4->daddr &&
2837                     rth->rt_key_src == flp4->saddr &&
2838                     rt_is_output_route(rth) &&
2839                     rth->rt_oif == flp4->flowi4_oif &&
2840                     rth->rt_mark == flp4->flowi4_mark &&
2841                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2842                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2843                     net_eq(dev_net(rth->dst.dev), net) &&
2844                     !rt_is_expired(rth)) {
2845                         ipv4_validate_peer(rth);
2846                         dst_use(&rth->dst, jiffies);
2847                         RT_CACHE_STAT_INC(out_hit);
2848                         rcu_read_unlock_bh();
2849                         if (!flp4->saddr)
2850                                 flp4->saddr = rth->rt_src;
2851                         if (!flp4->daddr)
2852                                 flp4->daddr = rth->rt_dst;
2853                         return rth;
2854                 }
2855                 RT_CACHE_STAT_INC(out_hlist_search);
2856         }
2857         rcu_read_unlock_bh();
2858
2859 slow_output:
2860         return ip_route_output_slow(net, flp4);
2861 }
2862 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2863
2864 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2865 {
2866         return NULL;
2867 }
2868
2869 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2870 {
2871         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2872
2873         return mtu ? : dst->dev->mtu;
2874 }
2875
2876 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2877 {
2878 }
2879
2880 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2881                                           unsigned long old)
2882 {
2883         return NULL;
2884 }
2885
2886 static struct dst_ops ipv4_dst_blackhole_ops = {
2887         .family                 =       AF_INET,
2888         .protocol               =       cpu_to_be16(ETH_P_IP),
2889         .destroy                =       ipv4_dst_destroy,
2890         .check                  =       ipv4_blackhole_dst_check,
2891         .mtu                    =       ipv4_blackhole_mtu,
2892         .default_advmss         =       ipv4_default_advmss,
2893         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2894         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2895         .neigh_lookup           =       ipv4_neigh_lookup,
2896 };
2897
2898 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2899 {
2900         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2901         struct rtable *ort = (struct rtable *) dst_orig;
2902
2903         if (rt) {
2904                 struct dst_entry *new = &rt->dst;
2905
2906                 new->__use = 1;
2907                 new->input = dst_discard;
2908                 new->output = dst_discard;
2909                 dst_copy_metrics(new, &ort->dst);
2910
2911                 new->dev = ort->dst.dev;
2912                 if (new->dev)
2913                         dev_hold(new->dev);
2914
2915                 rt->rt_key_dst = ort->rt_key_dst;
2916                 rt->rt_key_src = ort->rt_key_src;
2917                 rt->rt_key_tos = ort->rt_key_tos;
2918                 rt->rt_route_iif = ort->rt_route_iif;
2919                 rt->rt_iif = ort->rt_iif;
2920                 rt->rt_oif = ort->rt_oif;
2921                 rt->rt_mark = ort->rt_mark;
2922
2923                 rt->rt_genid = rt_genid(net);
2924                 rt->rt_flags = ort->rt_flags;
2925                 rt->rt_type = ort->rt_type;
2926                 rt->rt_dst = ort->rt_dst;
2927                 rt->rt_src = ort->rt_src;
2928                 rt->rt_gateway = ort->rt_gateway;
2929                 rt->rt_spec_dst = ort->rt_spec_dst;
2930                 rt->peer = ort->peer;
2931                 if (rt->peer)
2932                         atomic_inc(&rt->peer->refcnt);
2933                 rt->fi = ort->fi;
2934                 if (rt->fi)
2935                         atomic_inc(&rt->fi->fib_clntref);
2936
2937                 dst_free(new);
2938         }
2939
2940         dst_release(dst_orig);
2941
2942         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2943 }
2944
2945 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2946                                     struct sock *sk)
2947 {
2948         struct rtable *rt = __ip_route_output_key(net, flp4);
2949
2950         if (IS_ERR(rt))
2951                 return rt;
2952
2953         if (flp4->flowi4_proto)
2954                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2955                                                    flowi4_to_flowi(flp4),
2956                                                    sk, 0);
2957
2958         return rt;
2959 }
2960 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2961
2962 static int rt_fill_info(struct net *net,
2963                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2964                         int nowait, unsigned int flags)
2965 {
2966         struct rtable *rt = skb_rtable(skb);
2967         struct rtmsg *r;
2968         struct nlmsghdr *nlh;
2969         unsigned long expires = 0;
2970         const struct inet_peer *peer = rt->peer;
2971         u32 id = 0, ts = 0, tsage = 0, error;
2972
2973         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2974         if (nlh == NULL)
2975                 return -EMSGSIZE;
2976
2977         r = nlmsg_data(nlh);
2978         r->rtm_family    = AF_INET;
2979         r->rtm_dst_len  = 32;
2980         r->rtm_src_len  = 0;
2981         r->rtm_tos      = rt->rt_key_tos;
2982         r->rtm_table    = RT_TABLE_MAIN;
2983         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2984         r->rtm_type     = rt->rt_type;
2985         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2986         r->rtm_protocol = RTPROT_UNSPEC;
2987         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2988         if (rt->rt_flags & RTCF_NOTIFY)
2989                 r->rtm_flags |= RTM_F_NOTIFY;
2990
2991         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2992
2993         if (rt->rt_key_src) {
2994                 r->rtm_src_len = 32;
2995                 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2996         }
2997         if (rt->dst.dev)
2998                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2999 #ifdef CONFIG_IP_ROUTE_CLASSID
3000         if (rt->dst.tclassid)
3001                 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
3002 #endif
3003         if (rt_is_input_route(rt))
3004                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
3005         else if (rt->rt_src != rt->rt_key_src)
3006                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
3007
3008         if (rt->rt_dst != rt->rt_gateway)
3009                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
3010
3011         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
3012                 goto nla_put_failure;
3013
3014         if (rt->rt_mark)
3015                 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
3016
3017         error = rt->dst.error;
3018         if (peer) {
3019                 inet_peer_refcheck(rt->peer);
3020                 id = atomic_read(&peer->ip_id_count) & 0xffff;
3021                 if (peer->tcp_ts_stamp) {
3022                         ts = peer->tcp_ts;
3023                         tsage = get_seconds() - peer->tcp_ts_stamp;
3024                 }
3025                 expires = ACCESS_ONCE(peer->pmtu_expires);
3026                 if (expires) {
3027                         if (time_before(jiffies, expires))
3028                                 expires -= jiffies;
3029                         else
3030                                 expires = 0;
3031                 }
3032         }
3033
3034         if (rt_is_input_route(rt)) {
3035 #ifdef CONFIG_IP_MROUTE
3036                 __be32 dst = rt->rt_dst;
3037
3038                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
3039                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
3040                         int err = ipmr_get_route(net, skb,
3041                                                  rt->rt_src, rt->rt_dst,
3042                                                  r, nowait);
3043                         if (err <= 0) {
3044                                 if (!nowait) {
3045                                         if (err == 0)
3046                                                 return 0;
3047                                         goto nla_put_failure;
3048                                 } else {
3049                                         if (err == -EMSGSIZE)
3050                                                 goto nla_put_failure;
3051                                         error = err;
3052                                 }
3053                         }
3054                 } else
3055 #endif
3056                         NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
3057         }
3058
3059         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
3060                                expires, error) < 0)
3061                 goto nla_put_failure;
3062
3063         return nlmsg_end(skb, nlh);
3064
3065 nla_put_failure:
3066         nlmsg_cancel(skb, nlh);
3067         return -EMSGSIZE;
3068 }
3069
3070 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
3071 {
3072         struct net *net = sock_net(in_skb->sk);
3073         struct rtmsg *rtm;
3074         struct nlattr *tb[RTA_MAX+1];
3075         struct rtable *rt = NULL;
3076         __be32 dst = 0;
3077         __be32 src = 0;
3078         u32 iif;
3079         int err;
3080         int mark;
3081         struct sk_buff *skb;
3082
3083         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3084         if (err < 0)
3085                 goto errout;
3086
3087         rtm = nlmsg_data(nlh);
3088
3089         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3090         if (skb == NULL) {
3091                 err = -ENOBUFS;
3092                 goto errout;
3093         }
3094
3095         /* Reserve room for dummy headers, this skb can pass
3096            through good chunk of routing engine.
3097          */
3098         skb_reset_mac_header(skb);
3099         skb_reset_network_header(skb);
3100
3101         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3102         ip_hdr(skb)->protocol = IPPROTO_ICMP;
3103         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3104
3105         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3106         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3107         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3108         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3109
3110         if (iif) {
3111                 struct net_device *dev;
3112
3113                 dev = __dev_get_by_index(net, iif);
3114                 if (dev == NULL) {
3115                         err = -ENODEV;
3116                         goto errout_free;
3117                 }
3118
3119                 skb->protocol   = htons(ETH_P_IP);
3120                 skb->dev        = dev;
3121                 skb->mark       = mark;
3122                 local_bh_disable();
3123                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3124                 local_bh_enable();
3125
3126                 rt = skb_rtable(skb);
3127                 if (err == 0 && rt->dst.error)
3128                         err = -rt->dst.error;
3129         } else {
3130                 struct flowi4 fl4 = {
3131                         .daddr = dst,
3132                         .saddr = src,
3133                         .flowi4_tos = rtm->rtm_tos,
3134                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3135                         .flowi4_mark = mark,
3136                 };
3137                 rt = ip_route_output_key(net, &fl4);
3138
3139                 err = 0;
3140                 if (IS_ERR(rt))
3141                         err = PTR_ERR(rt);
3142         }
3143
3144         if (err)
3145                 goto errout_free;
3146
3147         skb_dst_set(skb, &rt->dst);
3148         if (rtm->rtm_flags & RTM_F_NOTIFY)
3149                 rt->rt_flags |= RTCF_NOTIFY;
3150
3151         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3152                            RTM_NEWROUTE, 0, 0);
3153         if (err <= 0)
3154                 goto errout_free;
3155
3156         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3157 errout:
3158         return err;
3159
3160 errout_free:
3161         kfree_skb(skb);
3162         goto errout;
3163 }
3164
3165 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3166 {
3167         struct rtable *rt;
3168         int h, s_h;
3169         int idx, s_idx;
3170         struct net *net;
3171
3172         net = sock_net(skb->sk);
3173
3174         s_h = cb->args[0];
3175         if (s_h < 0)
3176                 s_h = 0;
3177         s_idx = idx = cb->args[1];
3178         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3179                 if (!rt_hash_table[h].chain)
3180                         continue;
3181                 rcu_read_lock_bh();
3182                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3183                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3184                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3185                                 continue;
3186                         if (rt_is_expired(rt))
3187                                 continue;
3188                         skb_dst_set_noref(skb, &rt->dst);
3189                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3190                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3191                                          1, NLM_F_MULTI) <= 0) {
3192                                 skb_dst_drop(skb);
3193                                 rcu_read_unlock_bh();
3194                                 goto done;
3195                         }
3196                         skb_dst_drop(skb);
3197                 }
3198                 rcu_read_unlock_bh();
3199         }
3200
3201 done:
3202         cb->args[0] = h;
3203         cb->args[1] = idx;
3204         return skb->len;
3205 }
3206
3207 void ip_rt_multicast_event(struct in_device *in_dev)
3208 {
3209         rt_cache_flush(dev_net(in_dev->dev), 0);
3210 }
3211
3212 #ifdef CONFIG_SYSCTL
3213 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3214                                         void __user *buffer,
3215                                         size_t *lenp, loff_t *ppos)
3216 {
3217         if (write) {
3218                 int flush_delay;
3219                 ctl_table ctl;
3220                 struct net *net;
3221
3222                 memcpy(&ctl, __ctl, sizeof(ctl));
3223                 ctl.data = &flush_delay;
3224                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3225
3226                 net = (struct net *)__ctl->extra1;
3227                 rt_cache_flush(net, flush_delay);
3228                 return 0;
3229         }
3230
3231         return -EINVAL;
3232 }
3233
3234 static ctl_table ipv4_route_table[] = {
3235         {
3236                 .procname       = "gc_thresh",
3237                 .data           = &ipv4_dst_ops.gc_thresh,
3238                 .maxlen         = sizeof(int),
3239                 .mode           = 0644,
3240                 .proc_handler   = proc_dointvec,
3241         },
3242         {
3243                 .procname       = "max_size",
3244                 .data           = &ip_rt_max_size,
3245                 .maxlen         = sizeof(int),
3246                 .mode           = 0644,
3247                 .proc_handler   = proc_dointvec,
3248         },
3249         {
3250                 /*  Deprecated. Use gc_min_interval_ms */
3251
3252                 .procname       = "gc_min_interval",
3253                 .data           = &ip_rt_gc_min_interval,
3254                 .maxlen         = sizeof(int),
3255                 .mode           = 0644,
3256                 .proc_handler   = proc_dointvec_jiffies,
3257         },
3258         {
3259                 .procname       = "gc_min_interval_ms",
3260                 .data           = &ip_rt_gc_min_interval,
3261                 .maxlen         = sizeof(int),
3262                 .mode           = 0644,
3263                 .proc_handler   = proc_dointvec_ms_jiffies,
3264         },
3265         {
3266                 .procname       = "gc_timeout",
3267                 .data           = &ip_rt_gc_timeout,
3268                 .maxlen         = sizeof(int),
3269                 .mode           = 0644,
3270                 .proc_handler   = proc_dointvec_jiffies,
3271         },
3272         {
3273                 .procname       = "gc_interval",
3274                 .data           = &ip_rt_gc_interval,
3275                 .maxlen         = sizeof(int),
3276                 .mode           = 0644,
3277                 .proc_handler   = proc_dointvec_jiffies,
3278         },
3279         {
3280                 .procname       = "redirect_load",
3281                 .data           = &ip_rt_redirect_load,
3282                 .maxlen         = sizeof(int),
3283                 .mode           = 0644,
3284                 .proc_handler   = proc_dointvec,
3285         },
3286         {
3287                 .procname       = "redirect_number",
3288                 .data           = &ip_rt_redirect_number,
3289                 .maxlen         = sizeof(int),
3290                 .mode           = 0644,
3291                 .proc_handler   = proc_dointvec,
3292         },
3293         {
3294                 .procname       = "redirect_silence",
3295                 .data           = &ip_rt_redirect_silence,
3296                 .maxlen         = sizeof(int),
3297                 .mode           = 0644,
3298                 .proc_handler   = proc_dointvec,
3299         },
3300         {
3301                 .procname       = "error_cost",
3302                 .data           = &ip_rt_error_cost,
3303                 .maxlen         = sizeof(int),
3304                 .mode           = 0644,
3305                 .proc_handler   = proc_dointvec,
3306         },
3307         {
3308                 .procname       = "error_burst",
3309                 .data           = &ip_rt_error_burst,
3310                 .maxlen         = sizeof(int),
3311                 .mode           = 0644,
3312                 .proc_handler   = proc_dointvec,
3313         },
3314         {
3315                 .procname       = "gc_elasticity",
3316                 .data           = &ip_rt_gc_elasticity,
3317                 .maxlen         = sizeof(int),
3318                 .mode           = 0644,
3319                 .proc_handler   = proc_dointvec,
3320         },
3321         {
3322                 .procname       = "mtu_expires",
3323                 .data           = &ip_rt_mtu_expires,
3324                 .maxlen         = sizeof(int),
3325                 .mode           = 0644,
3326                 .proc_handler   = proc_dointvec_jiffies,
3327         },
3328         {
3329                 .procname       = "min_pmtu",
3330                 .data           = &ip_rt_min_pmtu,
3331                 .maxlen         = sizeof(int),
3332                 .mode           = 0644,
3333                 .proc_handler   = proc_dointvec,
3334         },
3335         {
3336                 .procname       = "min_adv_mss",
3337                 .data           = &ip_rt_min_advmss,
3338                 .maxlen         = sizeof(int),
3339                 .mode           = 0644,
3340                 .proc_handler   = proc_dointvec,
3341         },
3342         { }
3343 };
3344
3345 static struct ctl_table empty[1];
3346
3347 static struct ctl_table ipv4_skeleton[] =
3348 {
3349         { .procname = "route",
3350           .mode = 0555, .child = ipv4_route_table},
3351         { .procname = "neigh",
3352           .mode = 0555, .child = empty},
3353         { }
3354 };
3355
3356 static __net_initdata struct ctl_path ipv4_path[] = {
3357         { .procname = "net", },
3358         { .procname = "ipv4", },
3359         { },
3360 };
3361
3362 static struct ctl_table ipv4_route_flush_table[] = {
3363         {
3364                 .procname       = "flush",
3365                 .maxlen         = sizeof(int),
3366                 .mode           = 0200,
3367                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3368         },
3369         { },
3370 };
3371
3372 static __net_initdata struct ctl_path ipv4_route_path[] = {
3373         { .procname = "net", },
3374         { .procname = "ipv4", },
3375         { .procname = "route", },
3376         { },
3377 };
3378
3379 static __net_init int sysctl_route_net_init(struct net *net)
3380 {
3381         struct ctl_table *tbl;
3382
3383         tbl = ipv4_route_flush_table;
3384         if (!net_eq(net, &init_net)) {
3385                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3386                 if (tbl == NULL)
3387                         goto err_dup;
3388         }
3389         tbl[0].extra1 = net;
3390
3391         net->ipv4.route_hdr =
3392                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3393         if (net->ipv4.route_hdr == NULL)
3394                 goto err_reg;
3395         return 0;
3396
3397 err_reg:
3398         if (tbl != ipv4_route_flush_table)
3399                 kfree(tbl);
3400 err_dup:
3401         return -ENOMEM;
3402 }
3403
3404 static __net_exit void sysctl_route_net_exit(struct net *net)
3405 {
3406         struct ctl_table *tbl;
3407
3408         tbl = net->ipv4.route_hdr->ctl_table_arg;
3409         unregister_net_sysctl_table(net->ipv4.route_hdr);
3410         BUG_ON(tbl == ipv4_route_flush_table);
3411         kfree(tbl);
3412 }
3413
3414 static __net_initdata struct pernet_operations sysctl_route_ops = {
3415         .init = sysctl_route_net_init,
3416         .exit = sysctl_route_net_exit,
3417 };
3418 #endif
3419
3420 static __net_init int rt_genid_init(struct net *net)
3421 {
3422         get_random_bytes(&net->ipv4.rt_genid,
3423                          sizeof(net->ipv4.rt_genid));
3424         get_random_bytes(&net->ipv4.dev_addr_genid,
3425                          sizeof(net->ipv4.dev_addr_genid));
3426         return 0;
3427 }
3428
3429 static __net_initdata struct pernet_operations rt_genid_ops = {
3430         .init = rt_genid_init,
3431 };
3432
3433
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-cpu accounting array; allocated in ip_rt_init() with 256 entries. */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
3437
3438 static __initdata unsigned long rhash_entries;
3439 static int __init set_rhash_entries(char *str)
3440 {
3441         if (!str)
3442                 return 0;
3443         rhash_entries = simple_strtoul(str, &str, 0);
3444         return 1;
3445 }
3446 __setup("rhash_entries=", set_rhash_entries);
3447
3448 int __init ip_rt_init(void)
3449 {
3450         int rc = 0;
3451
3452 #ifdef CONFIG_IP_ROUTE_CLASSID
3453         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3454         if (!ip_rt_acct)
3455                 panic("IP: failed to allocate ip_rt_acct\n");
3456 #endif
3457
3458         ipv4_dst_ops.kmem_cachep =
3459                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3460                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3461
3462         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3463
3464         if (dst_entries_init(&ipv4_dst_ops) < 0)
3465                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3466
3467         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3468                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3469
3470         rt_hash_table = (struct rt_hash_bucket *)
3471                 alloc_large_system_hash("IP route cache",
3472                                         sizeof(struct rt_hash_bucket),
3473                                         rhash_entries,
3474                                         (totalram_pages >= 128 * 1024) ?
3475                                         15 : 17,
3476                                         0,
3477                                         &rt_hash_log,
3478                                         &rt_hash_mask,
3479                                         rhash_entries ? 0 : 512 * 1024);
3480         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3481         rt_hash_lock_init();
3482
3483         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3484         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3485
3486         devinet_init();
3487         ip_fib_init();
3488
3489         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3490         expires_ljiffies = jiffies;
3491         schedule_delayed_work(&expires_work,
3492                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3493
3494         if (ip_rt_proc_init())
3495                 printk(KERN_ERR "Unable to create route proc files\n");
3496 #ifdef CONFIG_XFRM
3497         xfrm_init();
3498         xfrm4_init(ip_rt_max_size);
3499 #endif
3500         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3501
3502 #ifdef CONFIG_SYSCTL
3503         register_pernet_subsys(&sysctl_route_ops);
3504 #endif
3505         register_pernet_subsys(&rt_genid_ops);
3506         return rc;
3507 }
3508
3509 #ifdef CONFIG_SYSCTL
3510 /*
3511  * We really need to sanitize the damn ipv4 init order, then all
3512  * this nonsense will go away.
3513  */
3514 void __init ip_static_sysctl_init(void)
3515 {
3516         register_sysctl_paths(ipv4_path, ipv4_skeleton);
3517 }
3518 #endif