net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #define pr_fmt(fmt) "IPv4: " fmt
  66
  67 #include <linux/module.h>
  68 #include <asm/uaccess.h>
  69 #include <linux/bitops.h>
  70 #include <linux/types.h>
  71 #include <linux/kernel.h>
  72 #include <linux/mm.h>
  73 #include <linux/bootmem.h>
  74 #include <linux/string.h>
  75 #include <linux/socket.h>
  76 #include <linux/sockios.h>
  77 #include <linux/errno.h>
  78 #include <linux/in.h>
  79 #include <linux/inet.h>
  80 #include <linux/netdevice.h>
  81 #include <linux/proc_fs.h>
  82 #include <linux/init.h>
  83 #include <linux/workqueue.h>
  84 #include <linux/skbuff.h>
  85 #include <linux/inetdevice.h>
  86 #include <linux/igmp.h>
  87 #include <linux/pkt_sched.h>
  88 #include <linux/mroute.h>
  89 #include <linux/netfilter_ipv4.h>
  90 #include <linux/random.h>
  91 #include <linux/jhash.h>
  92 #include <linux/rcupdate.h>
  93 #include <linux/times.h>
  94 #include <linux/slab.h>
  95 #include <linux/prefetch.h>
  96 #include <net/dst.h>
  97 #include <net/net_namespace.h>
  98 #include <net/protocol.h>
  99 #include <net/ip.h>
 100 #include <net/route.h>
 101 #include <net/inetpeer.h>
 102 #include <net/sock.h>
 103 #include <net/ip_fib.h>
 104 #include <net/arp.h>
 105 #include <net/tcp.h>
 106 #include <net/icmp.h>
 107 #include <net/xfrm.h>
 108 #include <net/netevent.h>
 109 #include <net/rtnetlink.h>
 110 #ifdef CONFIG_SYSCTL
 111 #include <linux/sysctl.h>
 112 #include <linux/kmemleak.h>
 113 #endif
 114 #include <net/secure_seq.h>
 115
 116 #define RT_FL_TOS(oldflp4) \
 117         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 118
 119 #define IP_MAX_MTU      0xFFF0
 120
 121 #define RT_GC_TIMEOUT (300*HZ)
 122
 123 static int ip_rt_max_size;
 124 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 125 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
 126 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
 127 static int ip_rt_redirect_number __read_mostly  = 9;
 128 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 129 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 130 static int ip_rt_error_cost __read_mostly       = HZ;
 131 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 132 static int ip_rt_gc_elasticity __read_mostly    = 8;
 133 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 134 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 135 static int ip_rt_min_advmss __read_mostly       = 256;
 136 static int rt_chain_length_max __read_mostly    = 20;
 137
 138 static struct delayed_work expires_work;
 139 static unsigned long expires_ljiffies;
 140
 141 /*
 142  *      Interface to generic destination cache.
 143  */
 144
 145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 146 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 147 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 148 static void              ipv4_dst_destroy(struct dst_entry *dst);
 149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 150 static void              ipv4_link_failure(struct sk_buff *skb);
 151 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 152 static int rt_garbage_collect(struct dst_ops *ops);
 153
 154 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 155                             int how)
 156 {
 157 }
 158
 159 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 160 {
 161         struct rtable *rt = (struct rtable *) dst;
 162         struct inet_peer *peer;
 163         u32 *p = NULL;
 164
 165         peer = rt_get_peer_create(rt, rt->rt_dst);
 166         if (peer) {
 167                 u32 *old_p = __DST_METRICS_PTR(old);
 168                 unsigned long prev, new;
 169
 170                 p = peer->metrics;
 171                 if (inet_metrics_new(peer))
 172                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
 173
 174                 new = (unsigned long) p;
 175                 prev = cmpxchg(&dst->_metrics, old, new);
 176
 177                 if (prev != old) {
 178                         p = __DST_METRICS_PTR(prev);
 179                         if (prev & DST_METRICS_READ_ONLY)
 180                                 p = NULL;
 181                 } else {
 182                         if (rt->fi) {
 183                                 fib_info_put(rt->fi);
 184                                 rt->fi = NULL;
 185                         }
 186                 }
 187         }
 188         return p;
 189 }
 190
 191 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
 192
 193 static struct dst_ops ipv4_dst_ops = {
 194         .family =               AF_INET,
 195         .protocol =             cpu_to_be16(ETH_P_IP),
 196         .gc =                   rt_garbage_collect,
 197         .check =                ipv4_dst_check,
 198         .default_advmss =       ipv4_default_advmss,
 199         .mtu =                  ipv4_mtu,
 200         .cow_metrics =          ipv4_cow_metrics,
 201         .destroy =              ipv4_dst_destroy,
 202         .ifdown =               ipv4_dst_ifdown,
 203         .negative_advice =      ipv4_negative_advice,
 204         .link_failure =         ipv4_link_failure,
 205         .update_pmtu =          ip_rt_update_pmtu,
 206         .local_out =            __ip_local_out,
 207         .neigh_lookup =         ipv4_neigh_lookup,
 208 };
 209
 210 #define ECN_OR_COST(class)      TC_PRIO_##class
 211
 212 const __u8 ip_tos2prio[16] = {
 213         TC_PRIO_BESTEFFORT,
 214         ECN_OR_COST(BESTEFFORT),
 215         TC_PRIO_BESTEFFORT,
 216         ECN_OR_COST(BESTEFFORT),
 217         TC_PRIO_BULK,
 218         ECN_OR_COST(BULK),
 219         TC_PRIO_BULK,
 220         ECN_OR_COST(BULK),
 221         TC_PRIO_INTERACTIVE,
 222         ECN_OR_COST(INTERACTIVE),
 223         TC_PRIO_INTERACTIVE,
 224         ECN_OR_COST(INTERACTIVE),
 225         TC_PRIO_INTERACTIVE_BULK,
 226         ECN_OR_COST(INTERACTIVE_BULK),
 227         TC_PRIO_INTERACTIVE_BULK,
 228         ECN_OR_COST(INTERACTIVE_BULK)
 229 };
 230 EXPORT_SYMBOL(ip_tos2prio);
 231
 232 /*
 233  * Route cache.
 234  */
 235
 236 /* The locking scheme is rather straight forward:
 237  *
 238  * 1) Read-Copy Update protects the buckets of the central route hash.
 239  * 2) Only writers remove entries, and they hold the lock
 240  *    as they look at rtable reference counts.
 241  * 3) Only readers acquire references to rtable entries,
 242  *    they do so with atomic increments and with the
 243  *    lock held.
 244  */
 245
 246 struct rt_hash_bucket {
 247         struct rtable __rcu     *chain;
 248 };
 249
 250 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 251         defined(CONFIG_PROVE_LOCKING)
 252 /*
 253  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 254  * The size of this table is a power of two and depends on the number of CPUS.
 255  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 256  */
 257 #ifdef CONFIG_LOCKDEP
 258 # define RT_HASH_LOCK_SZ        256
 259 #else
 260 # if NR_CPUS >= 32
 261 #  define RT_HASH_LOCK_SZ       4096
 262 # elif NR_CPUS >= 16
 263 #  define RT_HASH_LOCK_SZ       2048
 264 # elif NR_CPUS >= 8
 265 #  define RT_HASH_LOCK_SZ       1024
 266 # elif NR_CPUS >= 4
 267 #  define RT_HASH_LOCK_SZ       512
 268 # else
 269 #  define RT_HASH_LOCK_SZ       256
 270 # endif
 271 #endif
 272
 273 static spinlock_t       *rt_hash_locks;
 274 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 275
 276 static __init void rt_hash_lock_init(void)
 277 {
 278         int i;
 279
 280         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 281                         GFP_KERNEL);
 282         if (!rt_hash_locks)
 283                 panic("IP: failed to allocate rt_hash_locks\n");
 284
 285         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 286                 spin_lock_init(&rt_hash_locks[i]);
 287 }
 288 #else
 289 # define rt_hash_lock_addr(slot) NULL
 290
 291 static inline void rt_hash_lock_init(void)
 292 {
 293 }
 294 #endif
 295
 296 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
 297 static unsigned int             rt_hash_mask __read_mostly;
 298 static unsigned int             rt_hash_log  __read_mostly;
 299
 300 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 301 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 302
 303 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 304                                    int genid)
 305 {
 306         return jhash_3words((__force u32)daddr, (__force u32)saddr,
 307                             idx, genid)
 308                 & rt_hash_mask;
 309 }
 310
 311 static inline int rt_genid(struct net *net)
 312 {
 313         return atomic_read(&net->ipv4.rt_genid);
 314 }
 315
 316 #ifdef CONFIG_PROC_FS
 317 struct rt_cache_iter_state {
 318         struct seq_net_private p;
 319         int bucket;
 320         int genid;
 321 };
 322
 323 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 324 {
 325         struct rt_cache_iter_state *st = seq->private;
 326         struct rtable *r = NULL;
 327
 328         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 329                 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
 330                         continue;
 331                 rcu_read_lock_bh();
 332                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 333                 while (r) {
 334                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
 335                             r->rt_genid == st->genid)
 336                                 return r;
 337                         r = rcu_dereference_bh(r->dst.rt_next);
 338                 }
 339                 rcu_read_unlock_bh();
 340         }
 341         return r;
 342 }
 343
 344 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 345                                           struct rtable *r)
 346 {
 347         struct rt_cache_iter_state *st = seq->private;
 348
 349         r = rcu_dereference_bh(r->dst.rt_next);
 350         while (!r) {
 351                 rcu_read_unlock_bh();
 352                 do {
 353                         if (--st->bucket < 0)
 354                                 return NULL;
 355                 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
 356                 rcu_read_lock_bh();
 357                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 358         }
 359         return r;
 360 }
 361
 362 static struct rtable *rt_cache_get_next(struct seq_file *seq,
 363                                         struct rtable *r)
 364 {
 365         struct rt_cache_iter_state *st = seq->private;
 366         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 367                 if (dev_net(r->dst.dev) != seq_file_net(seq))
 368                         continue;
 369                 if (r->rt_genid == st->genid)
 370                         break;
 371         }
 372         return r;
 373 }
 374
 375 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 376 {
 377         struct rtable *r = rt_cache_get_first(seq);
 378
 379         if (r)
 380                 while (pos && (r = rt_cache_get_next(seq, r)))
 381                         --pos;
 382         return pos ? NULL : r;
 383 }
 384
 385 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 386 {
 387         struct rt_cache_iter_state *st = seq->private;
 388         if (*pos)
 389                 return rt_cache_get_idx(seq, *pos - 1);
 390         st->genid = rt_genid(seq_file_net(seq));
 391         return SEQ_START_TOKEN;
 392 }
 393
 394 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 395 {
 396         struct rtable *r;
 397
 398         if (v == SEQ_START_TOKEN)
 399                 r = rt_cache_get_first(seq);
 400         else
 401                 r = rt_cache_get_next(seq, v);
 402         ++*pos;
 403         return r;
 404 }
 405
 406 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 407 {
 408         if (v && v != SEQ_START_TOKEN)
 409                 rcu_read_unlock_bh();
 410 }
 411
 412 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 413 {
 414         if (v == SEQ_START_TOKEN)
 415                 seq_printf(seq, "%-127s\n",
 416                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 417                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 418                            "HHUptod\tSpecDst");
 419         else {
 420                 struct rtable *r = v;
 421                 struct neighbour *n;
 422                 int len, HHUptod;
 423
 424                 rcu_read_lock();
 425                 n = dst_get_neighbour_noref(&r->dst);
 426                 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
 427                 rcu_read_unlock();
 428
 429                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
 430                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 431                         r->dst.dev ? r->dst.dev->name : "*",
 432                         (__force u32)r->rt_dst,
 433                         (__force u32)r->rt_gateway,
 434                         r->rt_flags, atomic_read(&r->dst.__refcnt),
 435                         r->dst.__use, 0, (__force u32)r->rt_src,
 436                         dst_metric_advmss(&r->dst) + 40,
 437                         dst_metric(&r->dst, RTAX_WINDOW),
 438                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
 439                               dst_metric(&r->dst, RTAX_RTTVAR)),
 440                         r->rt_key_tos,
 441                         -1,
 442                         HHUptod,
 443                         r->rt_spec_dst, &len);
 444
 445                 seq_printf(seq, "%*s\n", 127 - len, "");
 446         }
 447         return 0;
 448 }
 449
 450 static const struct seq_operations rt_cache_seq_ops = {
 451         .start  = rt_cache_seq_start,
 452         .next   = rt_cache_seq_next,
 453         .stop   = rt_cache_seq_stop,
 454         .show   = rt_cache_seq_show,
 455 };
 456
 457 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 458 {
 459         return seq_open_net(inode, file, &rt_cache_seq_ops,
 460                         sizeof(struct rt_cache_iter_state));
 461 }
 462
 463 static const struct file_operations rt_cache_seq_fops = {
 464         .owner   = THIS_MODULE,
 465         .open    = rt_cache_seq_open,
 466         .read    = seq_read,
 467         .llseek  = seq_lseek,
 468         .release = seq_release_net,
 469 };
 470
 471
 472 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 473 {
 474         int cpu;
 475
 476         if (*pos == 0)
 477                 return SEQ_START_TOKEN;
 478
 479         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 480                 if (!cpu_possible(cpu))
 481                         continue;
 482                 *pos = cpu+1;
 483                 return &per_cpu(rt_cache_stat, cpu);
 484         }
 485         return NULL;
 486 }
 487
 488 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 489 {
 490         int cpu;
 491
 492         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 493                 if (!cpu_possible(cpu))
 494                         continue;
 495                 *pos = cpu+1;
 496                 return &per_cpu(rt_cache_stat, cpu);
 497         }
 498         return NULL;
 499
 500 }
 501
 502 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 503 {
 504
 505 }
 506
 507 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 508 {
 509         struct rt_cache_stat *st = v;
 510
 511         if (v == SEQ_START_TOKEN) {
 512                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 513                 return 0;
 514         }
 515
 516         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 517                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 518                    dst_entries_get_slow(&ipv4_dst_ops),
 519                    st->in_hit,
 520                    st->in_slow_tot,
 521                    st->in_slow_mc,
 522                    st->in_no_route,
 523                    st->in_brd,
 524                    st->in_martian_dst,
 525                    st->in_martian_src,
 526
 527                    st->out_hit,
 528                    st->out_slow_tot,
 529                    st->out_slow_mc,
 530
 531                    st->gc_total,
 532                    st->gc_ignored,
 533                    st->gc_goal_miss,
 534                    st->gc_dst_overflow,
 535                    st->in_hlist_search,
 536                    st->out_hlist_search
 537                 );
 538         return 0;
 539 }
 540
 541 static const struct seq_operations rt_cpu_seq_ops = {
 542         .start  = rt_cpu_seq_start,
 543         .next   = rt_cpu_seq_next,
 544         .stop   = rt_cpu_seq_stop,
 545         .show   = rt_cpu_seq_show,
 546 };
 547
 548
 549 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 550 {
 551         return seq_open(file, &rt_cpu_seq_ops);
 552 }
 553
 554 static const struct file_operations rt_cpu_seq_fops = {
 555         .owner   = THIS_MODULE,
 556         .open    = rt_cpu_seq_open,
 557         .read    = seq_read,
 558         .llseek  = seq_lseek,
 559         .release = seq_release,
 560 };
 561
 562 #ifdef CONFIG_IP_ROUTE_CLASSID
 563 static int rt_acct_proc_show(struct seq_file *m, void *v)
 564 {
 565         struct ip_rt_acct *dst, *src;
 566         unsigned int i, j;
 567
 568         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 569         if (!dst)
 570                 return -ENOMEM;
 571
 572         for_each_possible_cpu(i) {
 573                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 574                 for (j = 0; j < 256; j++) {
 575                         dst[j].o_bytes   += src[j].o_bytes;
 576                         dst[j].o_packets += src[j].o_packets;
 577                         dst[j].i_bytes   += src[j].i_bytes;
 578                         dst[j].i_packets += src[j].i_packets;
 579                 }
 580         }
 581
 582         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 583         kfree(dst);
 584         return 0;
 585 }
 586
 587 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 588 {
 589         return single_open(file, rt_acct_proc_show, NULL);
 590 }
 591
 592 static const struct file_operations rt_acct_proc_fops = {
 593         .owner          = THIS_MODULE,
 594         .open           = rt_acct_proc_open,
 595         .read           = seq_read,
 596         .llseek         = seq_lseek,
 597         .release        = single_release,
 598 };
 599 #endif
 600
 601 static int __net_init ip_rt_do_proc_init(struct net *net)
 602 {
 603         struct proc_dir_entry *pde;
 604
 605         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 606                         &rt_cache_seq_fops);
 607         if (!pde)
 608                 goto err1;
 609
 610         pde = proc_create("rt_cache", S_IRUGO,
 611                           net->proc_net_stat, &rt_cpu_seq_fops);
 612         if (!pde)
 613                 goto err2;
 614
 615 #ifdef CONFIG_IP_ROUTE_CLASSID
 616         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 617         if (!pde)
 618                 goto err3;
 619 #endif
 620         return 0;
 621
 622 #ifdef CONFIG_IP_ROUTE_CLASSID
 623 err3:
 624         remove_proc_entry("rt_cache", net->proc_net_stat);
 625 #endif
 626 err2:
 627         remove_proc_entry("rt_cache", net->proc_net);
 628 err1:
 629         return -ENOMEM;
 630 }
 631
 632 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 633 {
 634         remove_proc_entry("rt_cache", net->proc_net_stat);
 635         remove_proc_entry("rt_cache", net->proc_net);
 636 #ifdef CONFIG_IP_ROUTE_CLASSID
 637         remove_proc_entry("rt_acct", net->proc_net);
 638 #endif
 639 }
 640
 641 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 642         .init = ip_rt_do_proc_init,
 643         .exit = ip_rt_do_proc_exit,
 644 };
 645
 646 static int __init ip_rt_proc_init(void)
 647 {
 648         return register_pernet_subsys(&ip_rt_proc_ops);
 649 }
 650
 651 #else
 652 static inline int ip_rt_proc_init(void)
 653 {
 654         return 0;
 655 }
 656 #endif /* CONFIG_PROC_FS */
 657
 658 static inline void rt_free(struct rtable *rt)
 659 {
 660         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 661 }
 662
 663 static inline void rt_drop(struct rtable *rt)
 664 {
 665         ip_rt_put(rt);
 666         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 667 }
 668
 669 static inline int rt_fast_clean(struct rtable *rth)
 670 {
 671         /* Kill broadcast/multicast entries very aggresively, if they
 672            collide in hash table with more useful entries */
 673         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 674                 rt_is_input_route(rth) && rth->dst.rt_next;
 675 }
 676
 677 static inline int rt_valuable(struct rtable *rth)
 678 {
 679         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 680                 (rt_has_peer(rth) && rt_peer_ptr(rth)->pmtu_expires);
 681 }
 682
 683 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 684 {
 685         unsigned long age;
 686         int ret = 0;
 687
 688         if (atomic_read(&rth->dst.__refcnt))
 689                 goto out;
 690
 691         age = jiffies - rth->dst.lastuse;
 692         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 693             (age <= tmo2 && rt_valuable(rth)))
 694                 goto out;
 695         ret = 1;
 696 out:    return ret;
 697 }
 698
 699 /* Bits of score are:
 700  * 31: very valuable
 701  * 30: not quite useless
 702  * 29..0: usage counter
 703  */
 704 static inline u32 rt_score(struct rtable *rt)
 705 {
 706         u32 score = jiffies - rt->dst.lastuse;
 707
 708         score = ~score & ~(3<<30);
 709
 710         if (rt_valuable(rt))
 711                 score |= (1<<31);
 712
 713         if (rt_is_output_route(rt) ||
 714             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 715                 score |= (1<<30);
 716
 717         return score;
 718 }
 719
 720 static inline bool rt_caching(const struct net *net)
 721 {
 722         return net->ipv4.current_rt_cache_rebuild_count <=
 723                 net->ipv4.sysctl_rt_cache_rebuild_count;
 724 }
 725
 726 static inline bool compare_hash_inputs(const struct rtable *rt1,
 727                                        const struct rtable *rt2)
 728 {
 729         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 730                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 731                 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
 732 }
 733
 734 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
 735 {
 736         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 737                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 738                 (rt1->rt_mark ^ rt2->rt_mark) |
 739                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
 740                 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
 741                 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
 742 }
 743
 744 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 745 {
 746         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
 747 }
 748
 749 static inline int rt_is_expired(struct rtable *rth)
 750 {
 751         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 752 }
 753
 754 /*
 755  * Perform a full scan of hash table and free all entries.
 756  * Can be called by a softirq or a process.
 757  * In the later case, we want to be reschedule if necessary
 758  */
 759 static void rt_do_flush(struct net *net, int process_context)
 760 {
 761         unsigned int i;
 762         struct rtable *rth, *next;
 763
 764         for (i = 0; i <= rt_hash_mask; i++) {
 765                 struct rtable __rcu **pprev;
 766                 struct rtable *list;
 767
 768                 if (process_context && need_resched())
 769                         cond_resched();
 770                 rth = rcu_access_pointer(rt_hash_table[i].chain);
 771                 if (!rth)
 772                         continue;
 773
 774                 spin_lock_bh(rt_hash_lock_addr(i));
 775
 776                 list = NULL;
 777                 pprev = &rt_hash_table[i].chain;
 778                 rth = rcu_dereference_protected(*pprev,
 779                         lockdep_is_held(rt_hash_lock_addr(i)));
 780
 781                 while (rth) {
 782                         next = rcu_dereference_protected(rth->dst.rt_next,
 783                                 lockdep_is_held(rt_hash_lock_addr(i)));
 784
 785                         if (!net ||
 786                             net_eq(dev_net(rth->dst.dev), net)) {
 787                                 rcu_assign_pointer(*pprev, next);
 788                                 rcu_assign_pointer(rth->dst.rt_next, list);
 789                                 list = rth;
 790                         } else {
 791                                 pprev = &rth->dst.rt_next;
 792                         }
 793                         rth = next;
 794                 }
 795
 796                 spin_unlock_bh(rt_hash_lock_addr(i));
 797
 798                 for (; list; list = next) {
 799                         next = rcu_dereference_protected(list->dst.rt_next, 1);
 800                         rt_free(list);
 801                 }
 802         }
 803 }
 804
 805 /*
 806  * While freeing expired entries, we compute average chain length
 807  * and standard deviation, using fixed-point arithmetic.
 808  * This to have an estimation of rt_chain_length_max
 809  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 810  * We use 3 bits for frational part, and 29 (or 61) for magnitude.
 811  */
 812
 813 #define FRACT_BITS 3
 814 #define ONE (1UL << FRACT_BITS)
 815
 816 /*
 817  * Given a hash chain and an item in this hash chain,
 818  * find if a previous entry has the same hash_inputs
 819  * (but differs on tos, mark or oif)
 820  * Returns 0 if an alias is found.
 821  * Returns ONE if rth has no alias before itself.
 822  */
 823 static int has_noalias(const struct rtable *head, const struct rtable *rth)
 824 {
 825         const struct rtable *aux = head;
 826
 827         while (aux != rth) {
 828                 if (compare_hash_inputs(aux, rth))
 829                         return 0;
 830                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
 831         }
 832         return ONE;
 833 }
 834
 835 static void rt_check_expire(void)
 836 {
 837         static unsigned int rover;
 838         unsigned int i = rover, goal;
 839         struct rtable *rth;
 840         struct rtable __rcu **rthp;
 841         unsigned long samples = 0;
 842         unsigned long sum = 0, sum2 = 0;
 843         unsigned long delta;
 844         u64 mult;
 845
 846         delta = jiffies - expires_ljiffies;
 847         expires_ljiffies = jiffies;
 848         mult = ((u64)delta) << rt_hash_log;
 849         if (ip_rt_gc_timeout > 1)
 850                 do_div(mult, ip_rt_gc_timeout);
 851         goal = (unsigned int)mult;
 852         if (goal > rt_hash_mask)
 853                 goal = rt_hash_mask + 1;
 854         for (; goal > 0; goal--) {
 855                 unsigned long tmo = ip_rt_gc_timeout;
 856                 unsigned long length;
 857
 858                 i = (i + 1) & rt_hash_mask;
 859                 rthp = &rt_hash_table[i].chain;
 860
 861                 if (need_resched())
 862                         cond_resched();
 863
 864                 samples++;
 865
 866                 if (rcu_dereference_raw(*rthp) == NULL)
 867                         continue;
 868                 length = 0;
 869                 spin_lock_bh(rt_hash_lock_addr(i));
 870                 while ((rth = rcu_dereference_protected(*rthp,
 871                                         lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
 872                         prefetch(rth->dst.rt_next);
 873                         if (rt_is_expired(rth)) {
 874                                 *rthp = rth->dst.rt_next;
 875                                 rt_free(rth);
 876                                 continue;
 877                         }
 878                         if (rth->dst.expires) {
 879                                 /* Entry is expired even if it is in use */
 880                                 if (time_before_eq(jiffies, rth->dst.expires)) {
 881 nofree:
 882                                         tmo >>= 1;
 883                                         rthp = &rth->dst.rt_next;
 884                                         /*
 885                                          * We only count entries on
 886                                          * a chain with equal hash inputs once
 887                                          * so that entries for different QOS
 888                                          * levels, and other non-hash input
 889                                          * attributes don't unfairly skew
 890                                          * the length computation
 891                                          */
 892                                         length += has_noalias(rt_hash_table[i].chain, rth);
 893                                         continue;
 894                                 }
 895                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
 896                                 goto nofree;
 897
 898                         /* Cleanup aged off entries. */
 899                         *rthp = rth->dst.rt_next;
 900                         rt_free(rth);
 901                 }
 902                 spin_unlock_bh(rt_hash_lock_addr(i));
 903                 sum += length;
 904                 sum2 += length*length;
 905         }
 906         if (samples) {
 907                 unsigned long avg = sum / samples;
 908                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
 909                 rt_chain_length_max = max_t(unsigned long,
 910                                         ip_rt_gc_elasticity,
 911                                         (avg + 4*sd) >> FRACT_BITS);
 912         }
 913         rover = i;
 914 }
 915
 916 /*
 917  * rt_worker_func() is run in process context.
 918  * we call rt_check_expire() to scan part of the hash table
 919  */
 920 static void rt_worker_func(struct work_struct *work)
 921 {
 922         rt_check_expire();
 923         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 924 }
 925
 926 /*
 927  * Perturbation of rt_genid by a small quantity [1..256]
 928  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
 929  * many times (2^24) without giving recent rt_genid.
 930  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
 931  */
 932 static void rt_cache_invalidate(struct net *net)
 933 {
 934         unsigned char shuffle;
 935
 936         get_random_bytes(&shuffle, sizeof(shuffle));
 937         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 938         inetpeer_invalidate_tree(net->ipv4.peers);
 939 }
 940
 941 /*
 942  * delay < 0  : invalidate cache (fast : entries will be deleted later)
 943  * delay >= 0 : invalidate & flush cache (can be long)
 944  */
 945 void rt_cache_flush(struct net *net, int delay)
 946 {
 947         rt_cache_invalidate(net);
 948         if (delay >= 0)
 949                 rt_do_flush(net, !in_softirq());
 950 }
 951
 952 /* Flush previous cache invalidated entries from the cache */
 953 void rt_cache_flush_batch(struct net *net)
 954 {
 955         rt_do_flush(net, !in_softirq());
 956 }
 957
 958 static void rt_emergency_hash_rebuild(struct net *net)
 959 {
 960         net_warn_ratelimited("Route hash chain too long!\n");
 961         rt_cache_invalidate(net);
 962 }
 963
 964 /*
 965    Short description of GC goals.
 966
 967    We want to build algorithm, which will keep routing cache
 968    at some equilibrium point, when number of aged off entries
 969    is kept approximately equal to newly generated ones.
 970
 971    Current expiration strength is variable "expire".
 972    We try to adjust it dynamically, so that if networking
 973    is idle expires is large enough to keep enough of warm entries,
 974    and when load increases it reduces to limit cache size.
 975  */
 976
 977 static int rt_garbage_collect(struct dst_ops *ops)
 978 {
 979         static unsigned long expire = RT_GC_TIMEOUT;
 980         static unsigned long last_gc;
 981         static int rover;
 982         static int equilibrium;
 983         struct rtable *rth;
 984         struct rtable __rcu **rthp;
 985         unsigned long now = jiffies;
 986         int goal;
 987         int entries = dst_entries_get_fast(&ipv4_dst_ops);
 988
 989         /*
 990          * Garbage collection is pretty expensive,
 991          * do not make it too frequently.
 992          */
 993
 994         RT_CACHE_STAT_INC(gc_total);
 995
 996         if (now - last_gc < ip_rt_gc_min_interval &&
 997             entries < ip_rt_max_size) {
 998                 RT_CACHE_STAT_INC(gc_ignored);
 999                 goto out;
1000         }
1001
1002         entries = dst_entries_get_slow(&ipv4_dst_ops);
1003         /* Calculate number of entries, which we want to expire now. */
1004         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1005         if (goal <= 0) {
1006                 if (equilibrium < ipv4_dst_ops.gc_thresh)
1007                         equilibrium = ipv4_dst_ops.gc_thresh;
1008                 goal = entries - equilibrium;
1009                 if (goal > 0) {
1010                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1011                         goal = entries - equilibrium;
1012                 }
1013         } else {
1014                 /* We are in dangerous area. Try to reduce cache really
1015                  * aggressively.
1016                  */
1017                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1018                 equilibrium = entries - goal;
1019         }
1020
1021         if (now - last_gc >= ip_rt_gc_min_interval)
1022                 last_gc = now;
1023
1024         if (goal <= 0) {
1025                 equilibrium += goal;
1026                 goto work_done;
1027         }
1028
1029         do {
1030                 int i, k;
1031
1032                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1033                         unsigned long tmo = expire;
1034
1035                         k = (k + 1) & rt_hash_mask;
1036                         rthp = &rt_hash_table[k].chain;
1037                         spin_lock_bh(rt_hash_lock_addr(k));
1038                         while ((rth = rcu_dereference_protected(*rthp,
1039                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1040                                 if (!rt_is_expired(rth) &&
1041                                         !rt_may_expire(rth, tmo, expire)) {
1042                                         tmo >>= 1;
1043                                         rthp = &rth->dst.rt_next;
1044                                         continue;
1045                                 }
1046                                 *rthp = rth->dst.rt_next;
1047                                 rt_free(rth);
1048                                 goal--;
1049                         }
1050                         spin_unlock_bh(rt_hash_lock_addr(k));
1051                         if (goal <= 0)
1052                                 break;
1053                 }
1054                 rover = k;
1055
1056                 if (goal <= 0)
1057                         goto work_done;
1058
1059                 /* Goal is not achieved. We stop process if:
1060
1061                    - if expire reduced to zero. Otherwise, expire is halfed.
1062                    - if table is not full.
1063                    - if we are called from interrupt.
1064                    - jiffies check is just fallback/debug loop breaker.
1065                      We will not spin here for long time in any case.
1066                  */
1067
1068                 RT_CACHE_STAT_INC(gc_goal_miss);
1069
1070                 if (expire == 0)
1071                         break;
1072
1073                 expire >>= 1;
1074
1075                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1076                         goto out;
1077         } while (!in_softirq() && time_before_eq(jiffies, now));
1078
1079         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1080                 goto out;
1081         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1082                 goto out;
1083         net_warn_ratelimited("dst cache overflow\n");
1084         RT_CACHE_STAT_INC(gc_dst_overflow);
1085         return 1;
1086
1087 work_done:
1088         expire += ip_rt_gc_min_interval;
1089         if (expire > ip_rt_gc_timeout ||
1090             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1091             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1092                 expire = ip_rt_gc_timeout;
1093 out:    return 0;
1094 }
1095
1096 /*
1097  * Returns number of entries in a hash chain that have different hash_inputs
1098  */
1099 static int slow_chain_length(const struct rtable *head)
1100 {
1101         int length = 0;
1102         const struct rtable *rth = head;
1103
1104         while (rth) {
1105                 length += has_noalias(head, rth);
1106                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1107         }
1108         return length >> FRACT_BITS;
1109 }
1110
1111 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1112 {
1113         static const __be32 inaddr_any = 0;
1114         struct net_device *dev = dst->dev;
1115         const __be32 *pkey = daddr;
1116         const struct rtable *rt;
1117         struct neighbour *n;
1118
1119         rt = (const struct rtable *) dst;
1120
1121         if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1122                 pkey = &inaddr_any;
1123         else if (rt->rt_gateway)
1124                 pkey = (const __be32 *) &rt->rt_gateway;
1125
1126         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
1127         if (n)
1128                 return n;
1129         return neigh_create(&arp_tbl, pkey, dev);
1130 }
1131
1132 static int rt_bind_neighbour(struct rtable *rt)
1133 {
1134         struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1135         if (IS_ERR(n))
1136                 return PTR_ERR(n);
1137         dst_set_neighbour(&rt->dst, n);
1138
1139         return 0;
1140 }
1141
1142 static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
1143                                      struct sk_buff *skb, int ifindex)
1144 {
1145         struct rtable   *rth, *cand;
1146         struct rtable __rcu **rthp, **candp;
1147         unsigned long   now;
1148         u32             min_score;
1149         int             chain_length;
1150         int attempts = !in_softirq();
1151
1152 restart:
1153         chain_length = 0;
1154         min_score = ~(u32)0;
1155         cand = NULL;
1156         candp = NULL;
1157         now = jiffies;
1158
1159         if (!rt_caching(dev_net(rt->dst.dev))) {
1160                 /*
1161                  * If we're not caching, just tell the caller we
1162                  * were successful and don't touch the route.  The
1163                  * caller hold the sole reference to the cache entry, and
1164                  * it will be released when the caller is done with it.
1165                  * If we drop it here, the callers have no way to resolve routes
1166                  * when we're not caching.  Instead, just point *rp at rt, so
1167                  * the caller gets a single use out of the route
1168                  * Note that we do rt_free on this new route entry, so that
1169                  * once its refcount hits zero, we are still able to reap it
1170                  * (Thanks Alexey)
1171                  * Note: To avoid expensive rcu stuff for this uncached dst,
1172                  * we set DST_NOCACHE so that dst_release() can free dst without
1173                  * waiting a grace period.
1174                  */
1175
1176                 rt->dst.flags |= DST_NOCACHE;
1177                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1178                         int err = rt_bind_neighbour(rt);
1179                         if (err) {
1180                                 net_warn_ratelimited("Neighbour table failure & not caching routes\n");
1181                                 ip_rt_put(rt);
1182                                 return ERR_PTR(err);
1183                         }
1184                 }
1185
1186                 goto skip_hashing;
1187         }
1188
1189         rthp = &rt_hash_table[hash].chain;
1190
1191         spin_lock_bh(rt_hash_lock_addr(hash));
1192         while ((rth = rcu_dereference_protected(*rthp,
1193                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1194                 if (rt_is_expired(rth)) {
1195                         *rthp = rth->dst.rt_next;
1196                         rt_free(rth);
1197                         continue;
1198                 }
1199                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1200                         /* Put it first */
1201                         *rthp = rth->dst.rt_next;
1202                         /*
1203                          * Since lookup is lockfree, the deletion
1204                          * must be visible to another weakly ordered CPU before
1205                          * the insertion at the start of the hash chain.
1206                          */
1207                         rcu_assign_pointer(rth->dst.rt_next,
1208                                            rt_hash_table[hash].chain);
1209                         /*
1210                          * Since lookup is lockfree, the update writes
1211                          * must be ordered for consistency on SMP.
1212                          */
1213                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1214
1215                         dst_use(&rth->dst, now);
1216                         spin_unlock_bh(rt_hash_lock_addr(hash));
1217
1218                         rt_drop(rt);
1219                         if (skb)
1220                                 skb_dst_set(skb, &rth->dst);
1221                         return rth;
1222                 }
1223
1224                 if (!atomic_read(&rth->dst.__refcnt)) {
1225                         u32 score = rt_score(rth);
1226
1227                         if (score <= min_score) {
1228                                 cand = rth;
1229                                 candp = rthp;
1230                                 min_score = score;
1231                         }
1232                 }
1233
1234                 chain_length++;
1235
1236                 rthp = &rth->dst.rt_next;
1237         }
1238
1239         if (cand) {
1240                 /* ip_rt_gc_elasticity used to be average length of chain
1241                  * length, when exceeded gc becomes really aggressive.
1242                  *
1243                  * The second limit is less certain. At the moment it allows
1244                  * only 2 entries per bucket. We will see.
1245                  */
1246                 if (chain_length > ip_rt_gc_elasticity) {
1247                         *candp = cand->dst.rt_next;
1248                         rt_free(cand);
1249                 }
1250         } else {
1251                 if (chain_length > rt_chain_length_max &&
1252                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1253                         struct net *net = dev_net(rt->dst.dev);
1254                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1255                         if (!rt_caching(net)) {
1256                                 pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
1257                                         rt->dst.dev->name, num);
1258                         }
1259                         rt_emergency_hash_rebuild(net);
1260                         spin_unlock_bh(rt_hash_lock_addr(hash));
1261
1262                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1263                                         ifindex, rt_genid(net));
1264                         goto restart;
1265                 }
1266         }
1267
1268         /* Try to bind route to arp only if it is output
1269            route or unicast forwarding path.
1270          */
1271         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1272                 int err = rt_bind_neighbour(rt);
1273                 if (err) {
1274                         spin_unlock_bh(rt_hash_lock_addr(hash));
1275
1276                         if (err != -ENOBUFS) {
1277                                 rt_drop(rt);
1278                                 return ERR_PTR(err);
1279                         }
1280
1281                         /* Neighbour tables are full and nothing
1282                            can be released. Try to shrink route cache,
1283                            it is most likely it holds some neighbour records.
1284                          */
1285                         if (attempts-- > 0) {
1286                                 int saved_elasticity = ip_rt_gc_elasticity;
1287                                 int saved_int = ip_rt_gc_min_interval;
1288                                 ip_rt_gc_elasticity     = 1;
1289                                 ip_rt_gc_min_interval   = 0;
1290                                 rt_garbage_collect(&ipv4_dst_ops);
1291                                 ip_rt_gc_min_interval   = saved_int;
1292                                 ip_rt_gc_elasticity     = saved_elasticity;
1293                                 goto restart;
1294                         }
1295
1296                         net_warn_ratelimited("Neighbour table overflow\n");
1297                         rt_drop(rt);
1298                         return ERR_PTR(-ENOBUFS);
1299                 }
1300         }
1301
1302         rt->dst.rt_next = rt_hash_table[hash].chain;
1303
1304         /*
1305          * Since lookup is lockfree, we must make sure
1306          * previous writes to rt are committed to memory
1307          * before making rt visible to other CPUS.
1308          */
1309         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1310
1311         spin_unlock_bh(rt_hash_lock_addr(hash));
1312
1313 skip_hashing:
1314         if (skb)
1315                 skb_dst_set(skb, &rt->dst);
1316         return rt;
1317 }
1318
1319 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1320
1321 static u32 rt_peer_genid(void)
1322 {
1323         return atomic_read(&__rt_peer_genid);
1324 }
1325
1326 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1327 {
1328         struct inet_peer_base *base;
1329         struct inet_peer *peer;
1330
1331         base = inetpeer_base_ptr(rt->_peer);
1332         if (!base)
1333                 return;
1334
1335         peer = inet_getpeer_v4(base, daddr, create);
1336
1337         if (!rt_set_peer(rt, peer))
1338                 inet_putpeer(peer);
1339         else
1340                 rt->rt_peer_genid = rt_peer_genid();
1341 }
1342
1343 /*
1344  * Peer allocation may fail only in serious out-of-memory conditions.  However
1345  * we still can generate some output.
1346  * Random ID selection looks a bit dangerous because we have no chances to
1347  * select ID being unique in a reasonable period of time.
1348  * But broken packet identifier may be better than no packet at all.
1349  */
1350 static void ip_select_fb_ident(struct iphdr *iph)
1351 {
1352         static DEFINE_SPINLOCK(ip_fb_id_lock);
1353         static u32 ip_fallback_id;
1354         u32 salt;
1355
1356         spin_lock_bh(&ip_fb_id_lock);
1357         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1358         iph->id = htons(salt & 0xFFFF);
1359         ip_fallback_id = salt;
1360         spin_unlock_bh(&ip_fb_id_lock);
1361 }
1362
1363 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1364 {
1365         struct rtable *rt = (struct rtable *) dst;
1366
1367         if (rt && !(rt->dst.flags & DST_NOPEER)) {
1368                 struct inet_peer *peer = rt_get_peer_create(rt, rt->rt_dst);
1369
1370                 /* If peer is attached to destination, it is never detached,
1371                    so that we need not to grab a lock to dereference it.
1372                  */
1373                 if (peer) {
1374                         iph->id = htons(inet_getid(peer, more));
1375                         return;
1376                 }
1377         } else if (!rt)
1378                 pr_debug("rt_bind_peer(0) @%p\n", __builtin_return_address(0));
1379
1380         ip_select_fb_ident(iph);
1381 }
1382 EXPORT_SYMBOL(__ip_select_ident);
1383
1384 static void rt_del(unsigned int hash, struct rtable *rt)
1385 {
1386         struct rtable __rcu **rthp;
1387         struct rtable *aux;
1388
1389         rthp = &rt_hash_table[hash].chain;
1390         spin_lock_bh(rt_hash_lock_addr(hash));
1391         ip_rt_put(rt);
1392         while ((aux = rcu_dereference_protected(*rthp,
1393                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1394                 if (aux == rt || rt_is_expired(aux)) {
1395                         *rthp = aux->dst.rt_next;
1396                         rt_free(aux);
1397                         continue;
1398                 }
1399                 rthp = &aux->dst.rt_next;
1400         }
1401         spin_unlock_bh(rt_hash_lock_addr(hash));
1402 }
1403
1404 static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1405 {
1406         struct rtable *rt = (struct rtable *) dst;
1407         __be32 orig_gw = rt->rt_gateway;
1408         struct neighbour *n, *old_n;
1409
1410         dst_confirm(&rt->dst);
1411
1412         rt->rt_gateway = peer->redirect_learned.a4;
1413
1414         n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1415         if (IS_ERR(n)) {
1416                 rt->rt_gateway = orig_gw;
1417                 return;
1418         }
1419         old_n = xchg(&rt->dst._neighbour, n);
1420         if (old_n)
1421                 neigh_release(old_n);
1422         if (!(n->nud_state & NUD_VALID)) {
1423                 neigh_event_send(n, NULL);
1424         } else {
1425                 rt->rt_flags |= RTCF_REDIRECTED;
1426                 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1427         }
1428 }
1429
1430 /* called in rcu_read_lock() section */
1431 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1432                     __be32 saddr, struct net_device *dev)
1433 {
1434         int s, i;
1435         struct in_device *in_dev = __in_dev_get_rcu(dev);
1436         __be32 skeys[2] = { saddr, 0 };
1437         int    ikeys[2] = { dev->ifindex, 0 };
1438         struct inet_peer *peer;
1439         struct net *net;
1440
1441         if (!in_dev)
1442                 return;
1443
1444         net = dev_net(dev);
1445         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1446             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1447             ipv4_is_zeronet(new_gw))
1448                 goto reject_redirect;
1449
1450         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1451                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1452                         goto reject_redirect;
1453                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1454                         goto reject_redirect;
1455         } else {
1456                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1457                         goto reject_redirect;
1458         }
1459
1460         for (s = 0; s < 2; s++) {
1461                 for (i = 0; i < 2; i++) {
1462                         unsigned int hash;
1463                         struct rtable __rcu **rthp;
1464                         struct rtable *rt;
1465
1466                         hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1467
1468                         rthp = &rt_hash_table[hash].chain;
1469
1470                         while ((rt = rcu_dereference(*rthp)) != NULL) {
1471                                 rthp = &rt->dst.rt_next;
1472
1473                                 if (rt->rt_key_dst != daddr ||
1474                                     rt->rt_key_src != skeys[s] ||
1475                                     rt->rt_oif != ikeys[i] ||
1476                                     rt_is_input_route(rt) ||
1477                                     rt_is_expired(rt) ||
1478                                     !net_eq(dev_net(rt->dst.dev), net) ||
1479                                     rt->dst.error ||
1480                                     rt->dst.dev != dev ||
1481                                     rt->rt_gateway != old_gw)
1482                                         continue;
1483
1484                                 peer = rt_get_peer_create(rt, rt->rt_dst);
1485                                 if (peer) {
1486                                         if (peer->redirect_learned.a4 != new_gw) {
1487                                                 peer->redirect_learned.a4 = new_gw;
1488                                                 atomic_inc(&__rt_peer_genid);
1489                                         }
1490                                         check_peer_redir(&rt->dst, peer);
1491                                 }
1492                         }
1493                 }
1494         }
1495         return;
1496
1497 reject_redirect:
1498 #ifdef CONFIG_IP_ROUTE_VERBOSE
1499         if (IN_DEV_LOG_MARTIANS(in_dev))
1500                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
1501                                      "  Advised path = %pI4 -> %pI4\n",
1502                                      &old_gw, dev->name, &new_gw,
1503                                      &saddr, &daddr);
1504 #endif
1505         ;
1506 }
1507
1508 static bool peer_pmtu_expired(struct inet_peer *peer)
1509 {
1510         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1511
1512         return orig &&
1513                time_after_eq(jiffies, orig) &&
1514                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1515 }
1516
1517 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1518 {
1519         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1520
1521         return orig &&
1522                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1523 }
1524
1525 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1526 {
1527         struct rtable *rt = (struct rtable *)dst;
1528         struct dst_entry *ret = dst;
1529
1530         if (rt) {
1531                 if (dst->obsolete > 0) {
1532                         ip_rt_put(rt);
1533                         ret = NULL;
1534                 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1535                         unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1536                                                 rt->rt_oif,
1537                                                 rt_genid(dev_net(dst->dev)));
1538                         rt_del(hash, rt);
1539                         ret = NULL;
1540                 } else if (rt_has_peer(rt)) {
1541                         struct inet_peer *peer = rt_peer_ptr(rt);
1542                         if (peer_pmtu_expired(peer))
1543                                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1544                 }
1545         }
1546         return ret;
1547 }
1548
1549 /*
1550  * Algorithm:
1551  *      1. The first ip_rt_redirect_number redirects are sent
1552  *         with exponential backoff, then we stop sending them at all,
1553  *         assuming that the host ignores our redirects.
1554  *      2. If we did not see packets requiring redirects
1555  *         during ip_rt_redirect_silence, we assume that the host
1556  *         forgot redirected route and start to send redirects again.
1557  *
1558  * This algorithm is much cheaper and more intelligent than dumb load limiting
1559  * in icmp.c.
1560  *
1561  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1562  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1563  */
1564
1565 void ip_rt_send_redirect(struct sk_buff *skb)
1566 {
1567         struct rtable *rt = skb_rtable(skb);
1568         struct in_device *in_dev;
1569         struct inet_peer *peer;
1570         int log_martians;
1571
1572         rcu_read_lock();
1573         in_dev = __in_dev_get_rcu(rt->dst.dev);
1574         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1575                 rcu_read_unlock();
1576                 return;
1577         }
1578         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1579         rcu_read_unlock();
1580
1581         peer = rt_get_peer_create(rt, rt->rt_dst);
1582         if (!peer) {
1583                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1584                 return;
1585         }
1586
1587         /* No redirected packets during ip_rt_redirect_silence;
1588          * reset the algorithm.
1589          */
1590         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1591                 peer->rate_tokens = 0;
1592
1593         /* Too many ignored redirects; do not send anything
1594          * set dst.rate_last to the last seen redirected packet.
1595          */
1596         if (peer->rate_tokens >= ip_rt_redirect_number) {
1597                 peer->rate_last = jiffies;
1598                 return;
1599         }
1600
1601         /* Check for load limit; set rate_last to the latest sent
1602          * redirect.
1603          */
1604         if (peer->rate_tokens == 0 ||
1605             time_after(jiffies,
1606                        (peer->rate_last +
1607                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1608                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1609                 peer->rate_last = jiffies;
1610                 ++peer->rate_tokens;
1611 #ifdef CONFIG_IP_ROUTE_VERBOSE
1612                 if (log_martians &&
1613                     peer->rate_tokens == ip_rt_redirect_number)
1614                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
1615                                              &ip_hdr(skb)->saddr, rt->rt_iif,
1616                                              &rt->rt_dst, &rt->rt_gateway);
1617 #endif
1618         }
1619 }
1620
1621 static int ip_error(struct sk_buff *skb)
1622 {
1623         struct rtable *rt = skb_rtable(skb);
1624         struct inet_peer *peer;
1625         unsigned long now;
1626         bool send;
1627         int code;
1628
1629         switch (rt->dst.error) {
1630         case EINVAL:
1631         default:
1632                 goto out;
1633         case EHOSTUNREACH:
1634                 code = ICMP_HOST_UNREACH;
1635                 break;
1636         case ENETUNREACH:
1637                 code = ICMP_NET_UNREACH;
1638                 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1639                                 IPSTATS_MIB_INNOROUTES);
1640                 break;
1641         case EACCES:
1642                 code = ICMP_PKT_FILTERED;
1643                 break;
1644         }
1645
1646         peer = rt_get_peer_create(rt, rt->rt_dst);
1647
1648         send = true;
1649         if (peer) {
1650                 now = jiffies;
1651                 peer->rate_tokens += now - peer->rate_last;
1652                 if (peer->rate_tokens > ip_rt_error_burst)
1653                         peer->rate_tokens = ip_rt_error_burst;
1654                 peer->rate_last = now;
1655                 if (peer->rate_tokens >= ip_rt_error_cost)
1656                         peer->rate_tokens -= ip_rt_error_cost;
1657                 else
1658                         send = false;
1659         }
1660         if (send)
1661                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1662
1663 out:    kfree_skb(skb);
1664         return 0;
1665 }
1666
1667 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1668 {
1669         unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1670
1671         if (!expires)
1672                 return;
1673         if (time_before(jiffies, expires)) {
1674                 u32 orig_dst_mtu = dst_mtu(dst);
1675                 if (peer->pmtu_learned < orig_dst_mtu) {
1676                         if (!peer->pmtu_orig)
1677                                 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1678                         dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1679                 }
1680         } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1681                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1682 }
1683
1684 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1685 {
1686         struct rtable *rt = (struct rtable *) dst;
1687         struct inet_peer *peer;
1688
1689         dst_confirm(dst);
1690
1691         peer = rt_get_peer_create(rt, rt->rt_dst);
1692         if (peer) {
1693                 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1694
1695                 if (mtu < ip_rt_min_pmtu)
1696                         mtu = ip_rt_min_pmtu;
1697                 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1698
1699                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1700                         if (!pmtu_expires)
1701                                 pmtu_expires = 1UL;
1702
1703                         peer->pmtu_learned = mtu;
1704                         peer->pmtu_expires = pmtu_expires;
1705
1706                         atomic_inc(&__rt_peer_genid);
1707                         rt->rt_peer_genid = rt_peer_genid();
1708                 }
1709                 check_peer_pmtu(dst, peer);
1710         }
1711 }
1712
1713
1714 static void ipv4_validate_peer(struct rtable *rt)
1715 {
1716         if (rt->rt_peer_genid != rt_peer_genid()) {
1717                 struct inet_peer *peer = rt_get_peer(rt, rt->rt_dst);
1718
1719                 if (peer) {
1720                         check_peer_pmtu(&rt->dst, peer);
1721
1722                         if (peer->redirect_learned.a4 &&
1723                             peer->redirect_learned.a4 != rt->rt_gateway)
1724                                 check_peer_redir(&rt->dst, peer);
1725                 }
1726
1727                 rt->rt_peer_genid = rt_peer_genid();
1728         }
1729 }
1730
1731 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1732 {
1733         struct rtable *rt = (struct rtable *) dst;
1734
1735         if (rt_is_expired(rt))
1736                 return NULL;
1737         ipv4_validate_peer(rt);
1738         return dst;
1739 }
1740
1741 static void ipv4_dst_destroy(struct dst_entry *dst)
1742 {
1743         struct rtable *rt = (struct rtable *) dst;
1744
1745         if (rt->fi) {
1746                 fib_info_put(rt->fi);
1747                 rt->fi = NULL;
1748         }
1749         if (rt_has_peer(rt)) {
1750                 struct inet_peer *peer = rt_peer_ptr(rt);
1751                 inet_putpeer(peer);
1752         }
1753 }
1754
1755
1756 static void ipv4_link_failure(struct sk_buff *skb)
1757 {
1758         struct rtable *rt;
1759
1760         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1761
1762         rt = skb_rtable(skb);
1763         if (rt && rt_has_peer(rt)) {
1764                 struct inet_peer *peer = rt_peer_ptr(rt);
1765                 if (peer_pmtu_cleaned(peer))
1766                         dst_metric_set(&rt->dst, RTAX_MTU, peer->pmtu_orig);
1767         }
1768 }
1769
1770 static int ip_rt_bug(struct sk_buff *skb)
1771 {
1772         pr_debug("%s: %pI4 -> %pI4, %s\n",
1773                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1774                  skb->dev ? skb->dev->name : "?");
1775         kfree_skb(skb);
1776         WARN_ON(1);
1777         return 0;
1778 }
1779
1780 /*
1781    We do not cache source address of outgoing interface,
1782    because it is used only by IP RR, TS and SRR options,
1783    so that it out of fast path.
1784
1785    BTW remember: "addr" is allowed to be not aligned
1786    in IP options!
1787  */
1788
1789 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1790 {
1791         __be32 src;
1792
1793         if (rt_is_output_route(rt))
1794                 src = ip_hdr(skb)->saddr;
1795         else {
1796                 struct fib_result res;
1797                 struct flowi4 fl4;
1798                 struct iphdr *iph;
1799
1800                 iph = ip_hdr(skb);
1801
1802                 memset(&fl4, 0, sizeof(fl4));
1803                 fl4.daddr = iph->daddr;
1804                 fl4.saddr = iph->saddr;
1805                 fl4.flowi4_tos = RT_TOS(iph->tos);
1806                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1807                 fl4.flowi4_iif = skb->dev->ifindex;
1808                 fl4.flowi4_mark = skb->mark;
1809
1810                 rcu_read_lock();
1811                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1812                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1813                 else
1814                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1815                                         RT_SCOPE_UNIVERSE);
1816                 rcu_read_unlock();
1817         }
1818         memcpy(addr, &src, 4);
1819 }
1820
1821 #ifdef CONFIG_IP_ROUTE_CLASSID
1822 static void set_class_tag(struct rtable *rt, u32 tag)
1823 {
1824         if (!(rt->dst.tclassid & 0xFFFF))
1825                 rt->dst.tclassid |= tag & 0xFFFF;
1826         if (!(rt->dst.tclassid & 0xFFFF0000))
1827                 rt->dst.tclassid |= tag & 0xFFFF0000;
1828 }
1829 #endif
1830
1831 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1832 {
1833         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1834
1835         if (advmss == 0) {
1836                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1837                                ip_rt_min_advmss);
1838                 if (advmss > 65535 - 40)
1839                         advmss = 65535 - 40;
1840         }
1841         return advmss;
1842 }
1843
1844 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1845 {
1846         const struct rtable *rt = (const struct rtable *) dst;
1847         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1848
1849         if (mtu && rt_is_output_route(rt))
1850                 return mtu;
1851
1852         mtu = dst->dev->mtu;
1853
1854         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1855
1856                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1857                         mtu = 576;
1858         }
1859
1860         if (mtu > IP_MAX_MTU)
1861                 mtu = IP_MAX_MTU;
1862
1863         return mtu;
1864 }
1865
1866 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1867                             struct fib_info *fi)
1868 {
1869         struct inet_peer_base *base;
1870         struct inet_peer *peer;
1871         int create = 0;
1872
1873         /* If a peer entry exists for this destination, we must hook
1874          * it up in order to get at cached metrics.
1875          */
1876         if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1877                 create = 1;
1878
1879         base = inetpeer_base_ptr(rt->_peer);
1880         BUG_ON(!base);
1881
1882         peer = inet_getpeer_v4(base, rt->rt_dst, create);
1883         if (peer) {
1884                 __rt_set_peer(rt, peer);
1885                 rt->rt_peer_genid = rt_peer_genid();
1886                 if (inet_metrics_new(peer))
1887                         memcpy(peer->metrics, fi->fib_metrics,
1888                                sizeof(u32) * RTAX_MAX);
1889                 dst_init_metrics(&rt->dst, peer->metrics, false);
1890
1891                 check_peer_pmtu(&rt->dst, peer);
1892
1893                 if (peer->redirect_learned.a4 &&
1894                     peer->redirect_learned.a4 != rt->rt_gateway) {
1895                         rt->rt_gateway = peer->redirect_learned.a4;
1896                         rt->rt_flags |= RTCF_REDIRECTED;
1897                 }
1898         } else {
1899                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1900                         rt->fi = fi;
1901                         atomic_inc(&fi->fib_clntref);
1902                 }
1903                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1904         }
1905 }
1906
1907 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1908                            const struct fib_result *res,
1909                            struct fib_info *fi, u16 type, u32 itag)
1910 {
1911         struct dst_entry *dst = &rt->dst;
1912
1913         if (fi) {
1914                 if (FIB_RES_GW(*res) &&
1915                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1916                         rt->rt_gateway = FIB_RES_GW(*res);
1917                 rt_init_metrics(rt, fl4, fi);
1918 #ifdef CONFIG_IP_ROUTE_CLASSID
1919                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1920 #endif
1921         }
1922
1923         if (dst_mtu(dst) > IP_MAX_MTU)
1924                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1925         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1926                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1927
1928 #ifdef CONFIG_IP_ROUTE_CLASSID
1929 #ifdef CONFIG_IP_MULTIPLE_TABLES
1930         set_class_tag(rt, fib_rules_tclass(res));
1931 #endif
1932         set_class_tag(rt, itag);
1933 #endif
1934 }
1935
1936 static struct rtable *rt_dst_alloc(struct net_device *dev,
1937                                    bool nopolicy, bool noxfrm)
1938 {
1939         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1940                          DST_HOST |
1941                          (nopolicy ? DST_NOPOLICY : 0) |
1942                          (noxfrm ? DST_NOXFRM : 0));
1943 }
1944
1945 /* called in rcu_read_lock() section */
1946 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1947                                 u8 tos, struct net_device *dev, int our)
1948 {
1949         unsigned int hash;
1950         struct rtable *rth;
1951         __be32 spec_dst;
1952         struct in_device *in_dev = __in_dev_get_rcu(dev);
1953         u32 itag = 0;
1954         int err;
1955
1956         /* Primary sanity checks. */
1957
1958         if (in_dev == NULL)
1959                 return -EINVAL;
1960
1961         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1962             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1963                 goto e_inval;
1964
1965         if (ipv4_is_zeronet(saddr)) {
1966                 if (!ipv4_is_local_multicast(daddr))
1967                         goto e_inval;
1968                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1969         } else {
1970                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1971                                           &itag);
1972                 if (err < 0)
1973                         goto e_err;
1974         }
1975         rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1976                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1977         if (!rth)
1978                 goto e_nobufs;
1979
1980 #ifdef CONFIG_IP_ROUTE_CLASSID
1981         rth->dst.tclassid = itag;
1982 #endif
1983         rth->dst.output = ip_rt_bug;
1984
1985         rth->rt_key_dst = daddr;
1986         rth->rt_key_src = saddr;
1987         rth->rt_genid   = rt_genid(dev_net(dev));
1988         rth->rt_flags   = RTCF_MULTICAST;
1989         rth->rt_type    = RTN_MULTICAST;
1990         rth->rt_key_tos = tos;
1991         rth->rt_dst     = daddr;
1992         rth->rt_src     = saddr;
1993         rth->rt_route_iif = dev->ifindex;
1994         rth->rt_iif     = dev->ifindex;
1995         rth->rt_oif     = 0;
1996         rth->rt_mark    = skb->mark;
1997         rth->rt_gateway = daddr;
1998         rth->rt_spec_dst= spec_dst;
1999         rth->rt_peer_genid = 0;
2000         rt_init_peer(rth, dev_net(dev)->ipv4.peers);
2001         rth->fi = NULL;
2002         if (our) {
2003                 rth->dst.input= ip_local_deliver;
2004                 rth->rt_flags |= RTCF_LOCAL;
2005         }
2006
2007 #ifdef CONFIG_IP_MROUTE
2008         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2009                 rth->dst.input = ip_mr_input;
2010 #endif
2011         RT_CACHE_STAT_INC(in_slow_mc);
2012
2013         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2014         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2015         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2016
2017 e_nobufs:
2018         return -ENOBUFS;
2019 e_inval:
2020         return -EINVAL;
2021 e_err:
2022         return err;
2023 }
2024
2025
2026 static void ip_handle_martian_source(struct net_device *dev,
2027                                      struct in_device *in_dev,
2028                                      struct sk_buff *skb,
2029                                      __be32 daddr,
2030                                      __be32 saddr)
2031 {
2032         RT_CACHE_STAT_INC(in_martian_src);
2033 #ifdef CONFIG_IP_ROUTE_VERBOSE
2034         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2035                 /*
2036                  *      RFC1812 recommendation, if source is martian,
2037                  *      the only hint is MAC header.
2038                  */
2039                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
2040                         &daddr, &saddr, dev->name);
2041                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2042                         print_hex_dump(KERN_WARNING, "ll header: ",
2043                                        DUMP_PREFIX_OFFSET, 16, 1,
2044                                        skb_mac_header(skb),
2045                                        dev->hard_header_len, true);
2046                 }
2047         }
2048 #endif
2049 }
2050
2051 /* called in rcu_read_lock() section */
2052 static int __mkroute_input(struct sk_buff *skb,
2053                            const struct fib_result *res,
2054                            struct in_device *in_dev,
2055                            __be32 daddr, __be32 saddr, u32 tos,
2056                            struct rtable **result)
2057 {
2058         struct rtable *rth;
2059         int err;
2060         struct in_device *out_dev;
2061         unsigned int flags = 0;
2062         __be32 spec_dst;
2063         u32 itag;
2064
2065         /* get a working reference to the output device */
2066         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2067         if (out_dev == NULL) {
2068                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
2069                 return -EINVAL;
2070         }
2071
2072
2073         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2074                                   in_dev->dev, &spec_dst, &itag);
2075         if (err < 0) {
2076                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2077                                          saddr);
2078
2079                 goto cleanup;
2080         }
2081
2082         if (err)
2083                 flags |= RTCF_DIRECTSRC;
2084
2085         if (out_dev == in_dev && err &&
2086             (IN_DEV_SHARED_MEDIA(out_dev) ||
2087              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2088                 flags |= RTCF_DOREDIRECT;
2089
2090         if (skb->protocol != htons(ETH_P_IP)) {
2091                 /* Not IP (i.e. ARP). Do not create route, if it is
2092                  * invalid for proxy arp. DNAT routes are always valid.
2093                  *
2094                  * Proxy arp feature have been extended to allow, ARP
2095                  * replies back to the same interface, to support
2096                  * Private VLAN switch technologies. See arp.c.
2097                  */
2098                 if (out_dev == in_dev &&
2099                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2100                         err = -EINVAL;
2101                         goto cleanup;
2102                 }
2103         }
2104
2105         rth = rt_dst_alloc(out_dev->dev,
2106                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2107                            IN_DEV_CONF_GET(out_dev, NOXFRM));
2108         if (!rth) {
2109                 err = -ENOBUFS;
2110                 goto cleanup;
2111         }
2112
2113         rth->rt_key_dst = daddr;
2114         rth->rt_key_src = saddr;
2115         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2116         rth->rt_flags = flags;
2117         rth->rt_type = res->type;
2118         rth->rt_key_tos = tos;
2119         rth->rt_dst     = daddr;
2120         rth->rt_src     = saddr;
2121         rth->rt_route_iif = in_dev->dev->ifindex;
2122         rth->rt_iif     = in_dev->dev->ifindex;
2123         rth->rt_oif     = 0;
2124         rth->rt_mark    = skb->mark;
2125         rth->rt_gateway = daddr;
2126         rth->rt_spec_dst= spec_dst;
2127         rth->rt_peer_genid = 0;
2128         rt_init_peer(rth, dev_net(rth->dst.dev)->ipv4.peers);
2129         rth->fi = NULL;
2130
2131         rth->dst.input = ip_forward;
2132         rth->dst.output = ip_output;
2133
2134         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2135
2136         *result = rth;
2137         err = 0;
2138  cleanup:
2139         return err;
2140 }
2141
2142 static int ip_mkroute_input(struct sk_buff *skb,
2143                             struct fib_result *res,
2144                             const struct flowi4 *fl4,
2145                             struct in_device *in_dev,
2146                             __be32 daddr, __be32 saddr, u32 tos)
2147 {
2148         struct rtable *rth = NULL;
2149         int err;
2150         unsigned int hash;
2151
2152 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2153         if (res->fi && res->fi->fib_nhs > 1)
2154                 fib_select_multipath(res);
2155 #endif
2156
2157         /* create a routing cache entry */
2158         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2159         if (err)
2160                 return err;
2161
2162         /* put it into the cache */
2163         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2164                        rt_genid(dev_net(rth->dst.dev)));
2165         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2166         if (IS_ERR(rth))
2167                 return PTR_ERR(rth);
2168         return 0;
2169 }
2170
2171 /*
2172  *      NOTE. We drop all the packets that has local source
2173  *      addresses, because every properly looped back packet
2174  *      must have correct destination already attached by output routine.
2175  *
2176  *      Such approach solves two big problems:
2177  *      1. Not simplex devices are handled properly.
2178  *      2. IP spoofing attempts are filtered with 100% of guarantee.
2179  *      called with rcu_read_lock()
2180  */
2181
2182 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2183                                u8 tos, struct net_device *dev)
2184 {
2185         struct fib_result res;
2186         struct in_device *in_dev = __in_dev_get_rcu(dev);
2187         struct flowi4   fl4;
2188         unsigned int    flags = 0;
2189         u32             itag = 0;
2190         struct rtable   *rth;
2191         unsigned int    hash;
2192         __be32          spec_dst;
2193         int             err = -EINVAL;
2194         struct net    *net = dev_net(dev);
2195
2196         /* IP on this device is disabled. */
2197
2198         if (!in_dev)
2199                 goto out;
2200
2201         /* Check for the most weird martians, which can be not detected
2202            by fib_lookup.
2203          */
2204
2205         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2206             ipv4_is_loopback(saddr))
2207                 goto martian_source;
2208
2209         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2210                 goto brd_input;
2211
2212         /* Accept zero addresses only to limited broadcast;
2213          * I even do not know to fix it or not. Waiting for complains :-)
2214          */
2215         if (ipv4_is_zeronet(saddr))
2216                 goto martian_source;
2217
2218         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2219                 goto martian_destination;
2220
2221         /*
2222          *      Now we are ready to route packet.
2223          */
2224         fl4.flowi4_oif = 0;
2225         fl4.flowi4_iif = dev->ifindex;
2226         fl4.flowi4_mark = skb->mark;
2227         fl4.flowi4_tos = tos;
2228         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2229         fl4.daddr = daddr;
2230         fl4.saddr = saddr;
2231         err = fib_lookup(net, &fl4, &res);
2232         if (err != 0) {
2233                 if (!IN_DEV_FORWARD(in_dev))
2234                         goto e_hostunreach;
2235                 goto no_route;
2236         }
2237
2238         RT_CACHE_STAT_INC(in_slow_tot);
2239
2240         if (res.type == RTN_BROADCAST)
2241                 goto brd_input;
2242
2243         if (res.type == RTN_LOCAL) {
2244                 err = fib_validate_source(skb, saddr, daddr, tos,
2245                                           net->loopback_dev->ifindex,
2246                                           dev, &spec_dst, &itag);
2247                 if (err < 0)
2248                         goto martian_source_keep_err;
2249                 if (err)
2250                         flags |= RTCF_DIRECTSRC;
2251                 spec_dst = daddr;
2252                 goto local_input;
2253         }
2254
2255         if (!IN_DEV_FORWARD(in_dev))
2256                 goto e_hostunreach;
2257         if (res.type != RTN_UNICAST)
2258                 goto martian_destination;
2259
2260         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2261 out:    return err;
2262
2263 brd_input:
2264         if (skb->protocol != htons(ETH_P_IP))
2265                 goto e_inval;
2266
2267         if (ipv4_is_zeronet(saddr))
2268                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2269         else {
2270                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2271                                           &itag);
2272                 if (err < 0)
2273                         goto martian_source_keep_err;
2274                 if (err)
2275                         flags |= RTCF_DIRECTSRC;
2276         }
2277         flags |= RTCF_BROADCAST;
2278         res.type = RTN_BROADCAST;
2279         RT_CACHE_STAT_INC(in_brd);
2280
2281 local_input:
2282         rth = rt_dst_alloc(net->loopback_dev,
2283                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2284         if (!rth)
2285                 goto e_nobufs;
2286
2287         rth->dst.input= ip_local_deliver;
2288         rth->dst.output= ip_rt_bug;
2289 #ifdef CONFIG_IP_ROUTE_CLASSID
2290         rth->dst.tclassid = itag;
2291 #endif
2292
2293         rth->rt_key_dst = daddr;
2294         rth->rt_key_src = saddr;
2295         rth->rt_genid = rt_genid(net);
2296         rth->rt_flags   = flags|RTCF_LOCAL;
2297         rth->rt_type    = res.type;
2298         rth->rt_key_tos = tos;
2299         rth->rt_dst     = daddr;
2300         rth->rt_src     = saddr;
2301 #ifdef CONFIG_IP_ROUTE_CLASSID
2302         rth->dst.tclassid = itag;
2303 #endif
2304         rth->rt_route_iif = dev->ifindex;
2305         rth->rt_iif     = dev->ifindex;
2306         rth->rt_oif     = 0;
2307         rth->rt_mark    = skb->mark;
2308         rth->rt_gateway = daddr;
2309         rth->rt_spec_dst= spec_dst;
2310         rth->rt_peer_genid = 0;
2311         rt_init_peer(rth, net->ipv4.peers);
2312         rth->fi = NULL;
2313         if (res.type == RTN_UNREACHABLE) {
2314                 rth->dst.input= ip_error;
2315                 rth->dst.error= -err;
2316                 rth->rt_flags   &= ~RTCF_LOCAL;
2317         }
2318         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2319         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2320         err = 0;
2321         if (IS_ERR(rth))
2322                 err = PTR_ERR(rth);
2323         goto out;
2324
2325 no_route:
2326         RT_CACHE_STAT_INC(in_no_route);
2327         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2328         res.type = RTN_UNREACHABLE;
2329         if (err == -ESRCH)
2330                 err = -ENETUNREACH;
2331         goto local_input;
2332
2333         /*
2334          *      Do not cache martian addresses: they should be logged (RFC1812)
2335          */
2336 martian_destination:
2337         RT_CACHE_STAT_INC(in_martian_dst);
2338 #ifdef CONFIG_IP_ROUTE_VERBOSE
2339         if (IN_DEV_LOG_MARTIANS(in_dev))
2340                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2341                                      &daddr, &saddr, dev->name);
2342 #endif
2343
2344 e_hostunreach:
2345         err = -EHOSTUNREACH;
2346         goto out;
2347
2348 e_inval:
2349         err = -EINVAL;
2350         goto out;
2351
2352 e_nobufs:
2353         err = -ENOBUFS;
2354         goto out;
2355
2356 martian_source:
2357         err = -EINVAL;
2358 martian_source_keep_err:
2359         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2360         goto out;
2361 }
2362
2363 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2364                            u8 tos, struct net_device *dev, bool noref)
2365 {
2366         struct rtable   *rth;
2367         unsigned int    hash;
2368         int iif = dev->ifindex;
2369         struct net *net;
2370         int res;
2371
2372         net = dev_net(dev);
2373
2374         rcu_read_lock();
2375
2376         if (!rt_caching(net))
2377                 goto skip_cache;
2378
2379         tos &= IPTOS_RT_MASK;
2380         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2381
2382         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2383              rth = rcu_dereference(rth->dst.rt_next)) {
2384                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2385                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2386                      (rth->rt_route_iif ^ iif) |
2387                      (rth->rt_key_tos ^ tos)) == 0 &&
2388                     rth->rt_mark == skb->mark &&
2389                     net_eq(dev_net(rth->dst.dev), net) &&
2390                     !rt_is_expired(rth)) {
2391                         ipv4_validate_peer(rth);
2392                         if (noref) {
2393                                 dst_use_noref(&rth->dst, jiffies);
2394                                 skb_dst_set_noref(skb, &rth->dst);
2395                         } else {
2396                                 dst_use(&rth->dst, jiffies);
2397                                 skb_dst_set(skb, &rth->dst);
2398                         }
2399                         RT_CACHE_STAT_INC(in_hit);
2400                         rcu_read_unlock();
2401                         return 0;
2402                 }
2403                 RT_CACHE_STAT_INC(in_hlist_search);
2404         }
2405
2406 skip_cache:
2407         /* Multicast recognition logic is moved from route cache to here.
2408            The problem was that too many Ethernet cards have broken/missing
2409            hardware multicast filters :-( As result the host on multicasting
2410            network acquires a lot of useless route cache entries, sort of
2411            SDR messages from all the world. Now we try to get rid of them.
2412            Really, provided software IP multicast filter is organized
2413            reasonably (at least, hashed), it does not result in a slowdown
2414            comparing with route cache reject entries.
2415            Note, that multicast routers are not affected, because
2416            route cache entry is created eventually.
2417          */
2418         if (ipv4_is_multicast(daddr)) {
2419                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2420
2421                 if (in_dev) {
2422                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2423                                                   ip_hdr(skb)->protocol);
2424                         if (our
2425 #ifdef CONFIG_IP_MROUTE
2426                                 ||
2427                             (!ipv4_is_local_multicast(daddr) &&
2428                              IN_DEV_MFORWARD(in_dev))
2429 #endif
2430                            ) {
2431                                 int res = ip_route_input_mc(skb, daddr, saddr,
2432                                                             tos, dev, our);
2433                                 rcu_read_unlock();
2434                                 return res;
2435                         }
2436                 }
2437                 rcu_read_unlock();
2438                 return -EINVAL;
2439         }
2440         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2441         rcu_read_unlock();
2442         return res;
2443 }
2444 EXPORT_SYMBOL(ip_route_input_common);
2445
2446 /* called with rcu_read_lock() */
2447 static struct rtable *__mkroute_output(const struct fib_result *res,
2448                                        const struct flowi4 *fl4,
2449                                        __be32 orig_daddr, __be32 orig_saddr,
2450                                        int orig_oif, __u8 orig_rtos,
2451                                        struct net_device *dev_out,
2452                                        unsigned int flags)
2453 {
2454         struct fib_info *fi = res->fi;
2455         struct in_device *in_dev;
2456         u16 type = res->type;
2457         struct rtable *rth;
2458
2459         if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2460                 return ERR_PTR(-EINVAL);
2461
2462         if (ipv4_is_lbcast(fl4->daddr))
2463                 type = RTN_BROADCAST;
2464         else if (ipv4_is_multicast(fl4->daddr))
2465                 type = RTN_MULTICAST;
2466         else if (ipv4_is_zeronet(fl4->daddr))
2467                 return ERR_PTR(-EINVAL);
2468
2469         if (dev_out->flags & IFF_LOOPBACK)
2470                 flags |= RTCF_LOCAL;
2471
2472         in_dev = __in_dev_get_rcu(dev_out);
2473         if (!in_dev)
2474                 return ERR_PTR(-EINVAL);
2475
2476         if (type == RTN_BROADCAST) {
2477                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2478                 fi = NULL;
2479         } else if (type == RTN_MULTICAST) {
2480                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2481                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2482                                      fl4->flowi4_proto))
2483                         flags &= ~RTCF_LOCAL;
2484                 /* If multicast route do not exist use
2485                  * default one, but do not gateway in this case.
2486                  * Yes, it is hack.
2487                  */
2488                 if (fi && res->prefixlen < 4)
2489                         fi = NULL;
2490         }
2491
2492         rth = rt_dst_alloc(dev_out,
2493                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2494                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2495         if (!rth)
2496                 return ERR_PTR(-ENOBUFS);
2497
2498         rth->dst.output = ip_output;
2499
2500         rth->rt_key_dst = orig_daddr;
2501         rth->rt_key_src = orig_saddr;
2502         rth->rt_genid = rt_genid(dev_net(dev_out));
2503         rth->rt_flags   = flags;
2504         rth->rt_type    = type;
2505         rth->rt_key_tos = orig_rtos;
2506         rth->rt_dst     = fl4->daddr;
2507         rth->rt_src     = fl4->saddr;
2508         rth->rt_route_iif = 0;
2509         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2510         rth->rt_oif     = orig_oif;
2511         rth->rt_mark    = fl4->flowi4_mark;
2512         rth->rt_gateway = fl4->daddr;
2513         rth->rt_spec_dst= fl4->saddr;
2514         rth->rt_peer_genid = 0;
2515         rt_init_peer(rth, dev_net(dev_out)->ipv4.peers);
2516         rth->fi = NULL;
2517
2518         RT_CACHE_STAT_INC(out_slow_tot);
2519
2520         if (flags & RTCF_LOCAL) {
2521                 rth->dst.input = ip_local_deliver;
2522                 rth->rt_spec_dst = fl4->daddr;
2523         }
2524         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2525                 rth->rt_spec_dst = fl4->saddr;
2526                 if (flags & RTCF_LOCAL &&
2527                     !(dev_out->flags & IFF_LOOPBACK)) {
2528                         rth->dst.output = ip_mc_output;
2529                         RT_CACHE_STAT_INC(out_slow_mc);
2530                 }
2531 #ifdef CONFIG_IP_MROUTE
2532                 if (type == RTN_MULTICAST) {
2533                         if (IN_DEV_MFORWARD(in_dev) &&
2534                             !ipv4_is_local_multicast(fl4->daddr)) {
2535                                 rth->dst.input = ip_mr_input;
2536                                 rth->dst.output = ip_mc_output;
2537                         }
2538                 }
2539 #endif
2540         }
2541
2542         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2543
2544         return rth;
2545 }
2546
2547 /*
2548  * Major route resolver routine.
2549  * called with rcu_read_lock();
2550  */
2551
2552 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2553 {
2554         struct net_device *dev_out = NULL;
2555         __u8 tos = RT_FL_TOS(fl4);
2556         unsigned int flags = 0;
2557         struct fib_result res;
2558         struct rtable *rth;
2559         __be32 orig_daddr;
2560         __be32 orig_saddr;
2561         int orig_oif;
2562
2563         res.fi          = NULL;
2564 #ifdef CONFIG_IP_MULTIPLE_TABLES
2565         res.r           = NULL;
2566 #endif
2567
2568         orig_daddr = fl4->daddr;
2569         orig_saddr = fl4->saddr;
2570         orig_oif = fl4->flowi4_oif;
2571
2572         fl4->flowi4_iif = net->loopback_dev->ifindex;
2573         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2574         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2575                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2576
2577         rcu_read_lock();
2578         if (fl4->saddr) {
2579                 rth = ERR_PTR(-EINVAL);
2580                 if (ipv4_is_multicast(fl4->saddr) ||
2581                     ipv4_is_lbcast(fl4->saddr) ||
2582                     ipv4_is_zeronet(fl4->saddr))
2583                         goto out;
2584
2585                 /* I removed check for oif == dev_out->oif here.
2586                    It was wrong for two reasons:
2587                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2588                       is assigned to multiple interfaces.
2589                    2. Moreover, we are allowed to send packets with saddr
2590                       of another iface. --ANK
2591                  */
2592
2593                 if (fl4->flowi4_oif == 0 &&
2594                     (ipv4_is_multicast(fl4->daddr) ||
2595                      ipv4_is_lbcast(fl4->daddr))) {
2596                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2597                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2598                         if (dev_out == NULL)
2599                                 goto out;
2600
2601                         /* Special hack: user can direct multicasts
2602                            and limited broadcast via necessary interface
2603                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2604                            This hack is not just for fun, it allows
2605                            vic,vat and friends to work.
2606                            They bind socket to loopback, set ttl to zero
2607                            and expect that it will work.
2608                            From the viewpoint of routing cache they are broken,
2609                            because we are not allowed to build multicast path
2610                            with loopback source addr (look, routing cache
2611                            cannot know, that ttl is zero, so that packet
2612                            will not leave this host and route is valid).
2613                            Luckily, this hack is good workaround.
2614                          */
2615
2616                         fl4->flowi4_oif = dev_out->ifindex;
2617                         goto make_route;
2618                 }
2619
2620                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2621                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2622                         if (!__ip_dev_find(net, fl4->saddr, false))
2623                                 goto out;
2624                 }
2625         }
2626
2627
2628         if (fl4->flowi4_oif) {
2629                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2630                 rth = ERR_PTR(-ENODEV);
2631                 if (dev_out == NULL)
2632                         goto out;
2633
2634                 /* RACE: Check return value of inet_select_addr instead. */
2635                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2636                         rth = ERR_PTR(-ENETUNREACH);
2637                         goto out;
2638                 }
2639                 if (ipv4_is_local_multicast(fl4->daddr) ||
2640                     ipv4_is_lbcast(fl4->daddr)) {
2641                         if (!fl4->saddr)
2642                                 fl4->saddr = inet_select_addr(dev_out, 0,
2643                                                               RT_SCOPE_LINK);
2644                         goto make_route;
2645                 }
2646                 if (fl4->saddr) {
2647                         if (ipv4_is_multicast(fl4->daddr))
2648                                 fl4->saddr = inet_select_addr(dev_out, 0,
2649                                                               fl4->flowi4_scope);
2650                         else if (!fl4->daddr)
2651                                 fl4->saddr = inet_select_addr(dev_out, 0,
2652                                                               RT_SCOPE_HOST);
2653                 }
2654         }
2655
2656         if (!fl4->daddr) {
2657                 fl4->daddr = fl4->saddr;
2658                 if (!fl4->daddr)
2659                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2660                 dev_out = net->loopback_dev;
2661                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2662                 res.type = RTN_LOCAL;
2663                 flags |= RTCF_LOCAL;
2664                 goto make_route;
2665         }
2666
2667         if (fib_lookup(net, fl4, &res)) {
2668                 res.fi = NULL;
2669                 if (fl4->flowi4_oif) {
2670                         /* Apparently, routing tables are wrong. Assume,
2671                            that the destination is on link.
2672
2673                            WHY? DW.
2674                            Because we are allowed to send to iface
2675                            even if it has NO routes and NO assigned
2676                            addresses. When oif is specified, routing
2677                            tables are looked up with only one purpose:
2678                            to catch if destination is gatewayed, rather than
2679                            direct. Moreover, if MSG_DONTROUTE is set,
2680                            we send packet, ignoring both routing tables
2681                            and ifaddr state. --ANK
2682
2683
2684                            We could make it even if oif is unknown,
2685                            likely IPv6, but we do not.
2686                          */
2687
2688                         if (fl4->saddr == 0)
2689                                 fl4->saddr = inet_select_addr(dev_out, 0,
2690                                                               RT_SCOPE_LINK);
2691                         res.type = RTN_UNICAST;
2692                         goto make_route;
2693                 }
2694                 rth = ERR_PTR(-ENETUNREACH);
2695                 goto out;
2696         }
2697
2698         if (res.type == RTN_LOCAL) {
2699                 if (!fl4->saddr) {
2700                         if (res.fi->fib_prefsrc)
2701                                 fl4->saddr = res.fi->fib_prefsrc;
2702                         else
2703                                 fl4->saddr = fl4->daddr;
2704                 }
2705                 dev_out = net->loopback_dev;
2706                 fl4->flowi4_oif = dev_out->ifindex;
2707                 res.fi = NULL;
2708                 flags |= RTCF_LOCAL;
2709                 goto make_route;
2710         }
2711
2712 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2713         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2714                 fib_select_multipath(&res);
2715         else
2716 #endif
2717         if (!res.prefixlen &&
2718             res.table->tb_num_default > 1 &&
2719             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2720                 fib_select_default(&res);
2721
2722         if (!fl4->saddr)
2723                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2724
2725         dev_out = FIB_RES_DEV(res);
2726         fl4->flowi4_oif = dev_out->ifindex;
2727
2728
2729 make_route:
2730         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2731                                tos, dev_out, flags);
2732         if (!IS_ERR(rth)) {
2733                 unsigned int hash;
2734
2735                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2736                                rt_genid(dev_net(dev_out)));
2737                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2738         }
2739
2740 out:
2741         rcu_read_unlock();
2742         return rth;
2743 }
2744
2745 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2746 {
2747         struct rtable *rth;
2748         unsigned int hash;
2749
2750         if (!rt_caching(net))
2751                 goto slow_output;
2752
2753         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2754
2755         rcu_read_lock_bh();
2756         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2757                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2758                 if (rth->rt_key_dst == flp4->daddr &&
2759                     rth->rt_key_src == flp4->saddr &&
2760                     rt_is_output_route(rth) &&
2761                     rth->rt_oif == flp4->flowi4_oif &&
2762                     rth->rt_mark == flp4->flowi4_mark &&
2763                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2764                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2765                     net_eq(dev_net(rth->dst.dev), net) &&
2766                     !rt_is_expired(rth)) {
2767                         ipv4_validate_peer(rth);
2768                         dst_use(&rth->dst, jiffies);
2769                         RT_CACHE_STAT_INC(out_hit);
2770                         rcu_read_unlock_bh();
2771                         if (!flp4->saddr)
2772                                 flp4->saddr = rth->rt_src;
2773                         if (!flp4->daddr)
2774                                 flp4->daddr = rth->rt_dst;
2775                         return rth;
2776                 }
2777                 RT_CACHE_STAT_INC(out_hlist_search);
2778         }
2779         rcu_read_unlock_bh();
2780
2781 slow_output:
2782         return ip_route_output_slow(net, flp4);
2783 }
2784 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2785
2786 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2787 {
2788         return NULL;
2789 }
2790
2791 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2792 {
2793         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2794
2795         return mtu ? : dst->dev->mtu;
2796 }
2797
2798 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2799 {
2800 }
2801
2802 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2803                                           unsigned long old)
2804 {
2805         return NULL;
2806 }
2807
2808 static struct dst_ops ipv4_dst_blackhole_ops = {
2809         .family                 =       AF_INET,
2810         .protocol               =       cpu_to_be16(ETH_P_IP),
2811         .destroy                =       ipv4_dst_destroy,
2812         .check                  =       ipv4_blackhole_dst_check,
2813         .mtu                    =       ipv4_blackhole_mtu,
2814         .default_advmss         =       ipv4_default_advmss,
2815         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2816         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2817         .neigh_lookup           =       ipv4_neigh_lookup,
2818 };
2819
2820 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2821 {
2822         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2823         struct rtable *ort = (struct rtable *) dst_orig;
2824
2825         if (rt) {
2826                 struct dst_entry *new = &rt->dst;
2827
2828                 new->__use = 1;
2829                 new->input = dst_discard;
2830                 new->output = dst_discard;
2831                 dst_copy_metrics(new, &ort->dst);
2832
2833                 new->dev = ort->dst.dev;
2834                 if (new->dev)
2835                         dev_hold(new->dev);
2836
2837                 rt->rt_key_dst = ort->rt_key_dst;
2838                 rt->rt_key_src = ort->rt_key_src;
2839                 rt->rt_key_tos = ort->rt_key_tos;
2840                 rt->rt_route_iif = ort->rt_route_iif;
2841                 rt->rt_iif = ort->rt_iif;
2842                 rt->rt_oif = ort->rt_oif;
2843                 rt->rt_mark = ort->rt_mark;
2844
2845                 rt->rt_genid = rt_genid(net);
2846                 rt->rt_flags = ort->rt_flags;
2847                 rt->rt_type = ort->rt_type;
2848                 rt->rt_dst = ort->rt_dst;
2849                 rt->rt_src = ort->rt_src;
2850                 rt->rt_gateway = ort->rt_gateway;
2851                 rt->rt_spec_dst = ort->rt_spec_dst;
2852                 rt_transfer_peer(rt, ort);
2853                 rt->fi = ort->fi;
2854                 if (rt->fi)
2855                         atomic_inc(&rt->fi->fib_clntref);
2856
2857                 dst_free(new);
2858         }
2859
2860         dst_release(dst_orig);
2861
2862         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2863 }
2864
2865 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2866                                     struct sock *sk)
2867 {
2868         struct rtable *rt = __ip_route_output_key(net, flp4);
2869
2870         if (IS_ERR(rt))
2871                 return rt;
2872
2873         if (flp4->flowi4_proto)
2874                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2875                                                    flowi4_to_flowi(flp4),
2876                                                    sk, 0);
2877
2878         return rt;
2879 }
2880 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2881
2882 static int rt_fill_info(struct net *net,
2883                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2884                         int nowait, unsigned int flags)
2885 {
2886         struct rtable *rt = skb_rtable(skb);
2887         struct rtmsg *r;
2888         struct nlmsghdr *nlh;
2889         unsigned long expires = 0;
2890         u32 id = 0, ts = 0, tsage = 0, error;
2891
2892         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2893         if (nlh == NULL)
2894                 return -EMSGSIZE;
2895
2896         r = nlmsg_data(nlh);
2897         r->rtm_family    = AF_INET;
2898         r->rtm_dst_len  = 32;
2899         r->rtm_src_len  = 0;
2900         r->rtm_tos      = rt->rt_key_tos;
2901         r->rtm_table    = RT_TABLE_MAIN;
2902         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2903                 goto nla_put_failure;
2904         r->rtm_type     = rt->rt_type;
2905         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2906         r->rtm_protocol = RTPROT_UNSPEC;
2907         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2908         if (rt->rt_flags & RTCF_NOTIFY)
2909                 r->rtm_flags |= RTM_F_NOTIFY;
2910
2911         if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
2912                 goto nla_put_failure;
2913         if (rt->rt_key_src) {
2914                 r->rtm_src_len = 32;
2915                 if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
2916                         goto nla_put_failure;
2917         }
2918         if (rt->dst.dev &&
2919             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2920                 goto nla_put_failure;
2921 #ifdef CONFIG_IP_ROUTE_CLASSID
2922         if (rt->dst.tclassid &&
2923             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2924                 goto nla_put_failure;
2925 #endif
2926         if (rt_is_input_route(rt)) {
2927                 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_spec_dst))
2928                         goto nla_put_failure;
2929         } else if (rt->rt_src != rt->rt_key_src) {
2930                 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
2931                         goto nla_put_failure;
2932         }
2933         if (rt->rt_dst != rt->rt_gateway &&
2934             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2935                 goto nla_put_failure;
2936
2937         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2938                 goto nla_put_failure;
2939
2940         if (rt->rt_mark &&
2941             nla_put_be32(skb, RTA_MARK, rt->rt_mark))
2942                 goto nla_put_failure;
2943
2944         error = rt->dst.error;
2945         if (rt_has_peer(rt)) {
2946                 const struct inet_peer *peer = rt_peer_ptr(rt);
2947                 inet_peer_refcheck(peer);
2948                 id = atomic_read(&peer->ip_id_count) & 0xffff;
2949                 if (peer->tcp_ts_stamp) {
2950                         ts = peer->tcp_ts;
2951                         tsage = get_seconds() - peer->tcp_ts_stamp;
2952                 }
2953                 expires = ACCESS_ONCE(peer->pmtu_expires);
2954                 if (expires) {
2955                         if (time_before(jiffies, expires))
2956                                 expires -= jiffies;
2957                         else
2958                                 expires = 0;
2959                 }
2960         }
2961
2962         if (rt_is_input_route(rt)) {
2963 #ifdef CONFIG_IP_MROUTE
2964                 __be32 dst = rt->rt_dst;
2965
2966                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2967                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2968                         int err = ipmr_get_route(net, skb,
2969                                                  rt->rt_src, rt->rt_dst,
2970                                                  r, nowait);
2971                         if (err <= 0) {
2972                                 if (!nowait) {
2973                                         if (err == 0)
2974                                                 return 0;
2975                                         goto nla_put_failure;
2976                                 } else {
2977                                         if (err == -EMSGSIZE)
2978                                                 goto nla_put_failure;
2979                                         error = err;
2980                                 }
2981                         }
2982                 } else
2983 #endif
2984                         if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2985                                 goto nla_put_failure;
2986         }
2987
2988         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2989                                expires, error) < 0)
2990                 goto nla_put_failure;
2991
2992         return nlmsg_end(skb, nlh);
2993
2994 nla_put_failure:
2995         nlmsg_cancel(skb, nlh);
2996         return -EMSGSIZE;
2997 }
2998
2999 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
3000 {
3001         struct net *net = sock_net(in_skb->sk);
3002         struct rtmsg *rtm;
3003         struct nlattr *tb[RTA_MAX+1];
3004         struct rtable *rt = NULL;
3005         __be32 dst = 0;
3006         __be32 src = 0;
3007         u32 iif;
3008         int err;
3009         int mark;
3010         struct sk_buff *skb;
3011
3012         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3013         if (err < 0)
3014                 goto errout;
3015
3016         rtm = nlmsg_data(nlh);
3017
3018         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3019         if (skb == NULL) {
3020                 err = -ENOBUFS;
3021                 goto errout;
3022         }
3023
3024         /* Reserve room for dummy headers, this skb can pass
3025            through good chunk of routing engine.
3026          */
3027         skb_reset_mac_header(skb);
3028         skb_reset_network_header(skb);
3029
3030         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3031         ip_hdr(skb)->protocol = IPPROTO_ICMP;
3032         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3033
3034         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3035         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3036         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3037         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3038
3039         if (iif) {
3040                 struct net_device *dev;
3041
3042                 dev = __dev_get_by_index(net, iif);
3043                 if (dev == NULL) {
3044                         err = -ENODEV;
3045                         goto errout_free;
3046                 }
3047
3048                 skb->protocol   = htons(ETH_P_IP);
3049                 skb->dev        = dev;
3050                 skb->mark       = mark;
3051                 local_bh_disable();
3052                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3053                 local_bh_enable();
3054
3055                 rt = skb_rtable(skb);
3056                 if (err == 0 && rt->dst.error)
3057                         err = -rt->dst.error;
3058         } else {
3059                 struct flowi4 fl4 = {
3060                         .daddr = dst,
3061                         .saddr = src,
3062                         .flowi4_tos = rtm->rtm_tos,
3063                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3064                         .flowi4_mark = mark,
3065                 };
3066                 rt = ip_route_output_key(net, &fl4);
3067
3068                 err = 0;
3069                 if (IS_ERR(rt))
3070                         err = PTR_ERR(rt);
3071         }
3072
3073         if (err)
3074                 goto errout_free;
3075
3076         skb_dst_set(skb, &rt->dst);
3077         if (rtm->rtm_flags & RTM_F_NOTIFY)
3078                 rt->rt_flags |= RTCF_NOTIFY;
3079
3080         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3081                            RTM_NEWROUTE, 0, 0);
3082         if (err <= 0)
3083                 goto errout_free;
3084
3085         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3086 errout:
3087         return err;
3088
3089 errout_free:
3090         kfree_skb(skb);
3091         goto errout;
3092 }
3093
3094 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3095 {
3096         struct rtable *rt;
3097         int h, s_h;
3098         int idx, s_idx;
3099         struct net *net;
3100
3101         net = sock_net(skb->sk);
3102
3103         s_h = cb->args[0];
3104         if (s_h < 0)
3105                 s_h = 0;
3106         s_idx = idx = cb->args[1];
3107         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3108                 if (!rt_hash_table[h].chain)
3109                         continue;
3110                 rcu_read_lock_bh();
3111                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3112                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3113                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3114                                 continue;
3115                         if (rt_is_expired(rt))
3116                                 continue;
3117                         skb_dst_set_noref(skb, &rt->dst);
3118                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3119                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3120                                          1, NLM_F_MULTI) <= 0) {
3121                                 skb_dst_drop(skb);
3122                                 rcu_read_unlock_bh();
3123                                 goto done;
3124                         }
3125                         skb_dst_drop(skb);
3126                 }
3127                 rcu_read_unlock_bh();
3128         }
3129
3130 done:
3131         cb->args[0] = h;
3132         cb->args[1] = idx;
3133         return skb->len;
3134 }
3135
3136 void ip_rt_multicast_event(struct in_device *in_dev)
3137 {
3138         rt_cache_flush(dev_net(in_dev->dev), 0);
3139 }
3140
3141 #ifdef CONFIG_SYSCTL
3142 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3143                                         void __user *buffer,
3144                                         size_t *lenp, loff_t *ppos)
3145 {
3146         if (write) {
3147                 int flush_delay;
3148                 ctl_table ctl;
3149                 struct net *net;
3150
3151                 memcpy(&ctl, __ctl, sizeof(ctl));
3152                 ctl.data = &flush_delay;
3153                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3154
3155                 net = (struct net *)__ctl->extra1;
3156                 rt_cache_flush(net, flush_delay);
3157                 return 0;
3158         }
3159
3160         return -EINVAL;
3161 }
3162
3163 static ctl_table ipv4_route_table[] = {
3164         {
3165                 .procname       = "gc_thresh",
3166                 .data           = &ipv4_dst_ops.gc_thresh,
3167                 .maxlen         = sizeof(int),
3168                 .mode           = 0644,
3169                 .proc_handler   = proc_dointvec,
3170         },
3171         {
3172                 .procname       = "max_size",
3173                 .data           = &ip_rt_max_size,
3174                 .maxlen         = sizeof(int),
3175                 .mode           = 0644,
3176                 .proc_handler   = proc_dointvec,
3177         },
3178         {
3179                 /*  Deprecated. Use gc_min_interval_ms */
3180
3181                 .procname       = "gc_min_interval",
3182                 .data           = &ip_rt_gc_min_interval,
3183                 .maxlen         = sizeof(int),
3184                 .mode           = 0644,
3185                 .proc_handler   = proc_dointvec_jiffies,
3186         },
3187         {
3188                 .procname       = "gc_min_interval_ms",
3189                 .data           = &ip_rt_gc_min_interval,
3190                 .maxlen         = sizeof(int),
3191                 .mode           = 0644,
3192                 .proc_handler   = proc_dointvec_ms_jiffies,
3193         },
3194         {
3195                 .procname       = "gc_timeout",
3196                 .data           = &ip_rt_gc_timeout,
3197                 .maxlen         = sizeof(int),
3198                 .mode           = 0644,
3199                 .proc_handler   = proc_dointvec_jiffies,
3200         },
3201         {
3202                 .procname       = "gc_interval",
3203                 .data           = &ip_rt_gc_interval,
3204                 .maxlen         = sizeof(int),
3205                 .mode           = 0644,
3206                 .proc_handler   = proc_dointvec_jiffies,
3207         },
3208         {
3209                 .procname       = "redirect_load",
3210                 .data           = &ip_rt_redirect_load,
3211                 .maxlen         = sizeof(int),
3212                 .mode           = 0644,
3213                 .proc_handler   = proc_dointvec,
3214         },
3215         {
3216                 .procname       = "redirect_number",
3217                 .data           = &ip_rt_redirect_number,
3218                 .maxlen         = sizeof(int),
3219                 .mode           = 0644,
3220                 .proc_handler   = proc_dointvec,
3221         },
3222         {
3223                 .procname       = "redirect_silence",
3224                 .data           = &ip_rt_redirect_silence,
3225                 .maxlen         = sizeof(int),
3226                 .mode           = 0644,
3227                 .proc_handler   = proc_dointvec,
3228         },
3229         {
3230                 .procname       = "error_cost",
3231                 .data           = &ip_rt_error_cost,
3232                 .maxlen         = sizeof(int),
3233                 .mode           = 0644,
3234                 .proc_handler   = proc_dointvec,
3235         },
3236         {
3237                 .procname       = "error_burst",
3238                 .data           = &ip_rt_error_burst,
3239                 .maxlen         = sizeof(int),
3240                 .mode           = 0644,
3241                 .proc_handler   = proc_dointvec,
3242         },
3243         {
3244                 .procname       = "gc_elasticity",
3245                 .data           = &ip_rt_gc_elasticity,
3246                 .maxlen         = sizeof(int),
3247                 .mode           = 0644,
3248                 .proc_handler   = proc_dointvec,
3249         },
3250         {
3251                 .procname       = "mtu_expires",
3252                 .data           = &ip_rt_mtu_expires,
3253                 .maxlen         = sizeof(int),
3254                 .mode           = 0644,
3255                 .proc_handler   = proc_dointvec_jiffies,
3256         },
3257         {
3258                 .procname       = "min_pmtu",
3259                 .data           = &ip_rt_min_pmtu,
3260                 .maxlen         = sizeof(int),
3261                 .mode           = 0644,
3262                 .proc_handler   = proc_dointvec,
3263         },
3264         {
3265                 .procname       = "min_adv_mss",
3266                 .data           = &ip_rt_min_advmss,
3267                 .maxlen         = sizeof(int),
3268                 .mode           = 0644,
3269                 .proc_handler   = proc_dointvec,
3270         },
3271         { }
3272 };
3273
3274 static struct ctl_table ipv4_route_flush_table[] = {
3275         {
3276                 .procname       = "flush",
3277                 .maxlen         = sizeof(int),
3278                 .mode           = 0200,
3279                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3280         },
3281         { },
3282 };
3283
3284 static __net_init int sysctl_route_net_init(struct net *net)
3285 {
3286         struct ctl_table *tbl;
3287
3288         tbl = ipv4_route_flush_table;
3289         if (!net_eq(net, &init_net)) {
3290                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3291                 if (tbl == NULL)
3292                         goto err_dup;
3293         }
3294         tbl[0].extra1 = net;
3295
3296         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3297         if (net->ipv4.route_hdr == NULL)
3298                 goto err_reg;
3299         return 0;
3300
3301 err_reg:
3302         if (tbl != ipv4_route_flush_table)
3303                 kfree(tbl);
3304 err_dup:
3305         return -ENOMEM;
3306 }
3307
3308 static __net_exit void sysctl_route_net_exit(struct net *net)
3309 {
3310         struct ctl_table *tbl;
3311
3312         tbl = net->ipv4.route_hdr->ctl_table_arg;
3313         unregister_net_sysctl_table(net->ipv4.route_hdr);
3314         BUG_ON(tbl == ipv4_route_flush_table);
3315         kfree(tbl);
3316 }
3317
3318 static __net_initdata struct pernet_operations sysctl_route_ops = {
3319         .init = sysctl_route_net_init,
3320         .exit = sysctl_route_net_exit,
3321 };
3322 #endif
3323
3324 static __net_init int rt_genid_init(struct net *net)
3325 {
3326         get_random_bytes(&net->ipv4.rt_genid,
3327                          sizeof(net->ipv4.rt_genid));
3328         get_random_bytes(&net->ipv4.dev_addr_genid,
3329                          sizeof(net->ipv4.dev_addr_genid));
3330         return 0;
3331 }
3332
3333 static __net_initdata struct pernet_operations rt_genid_ops = {
3334         .init = rt_genid_init,
3335 };
3336
3337 static int __net_init ipv4_inetpeer_init(struct net *net)
3338 {
3339         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3340
3341         if (!bp)
3342                 return -ENOMEM;
3343         inet_peer_base_init(bp);
3344         net->ipv4.peers = bp;
3345         return 0;
3346 }
3347
3348 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3349 {
3350         struct inet_peer_base *bp = net->ipv4.peers;
3351
3352         net->ipv4.peers = NULL;
3353         inetpeer_invalidate_tree(bp);
3354         kfree(bp);
3355 }
3356
3357 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3358         .init   =       ipv4_inetpeer_init,
3359         .exit   =       ipv4_inetpeer_exit,
3360 };
3361
3362 #ifdef CONFIG_IP_ROUTE_CLASSID
3363 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3364 #endif /* CONFIG_IP_ROUTE_CLASSID */
3365
3366 static __initdata unsigned long rhash_entries;
3367 static int __init set_rhash_entries(char *str)
3368 {
3369         ssize_t ret;
3370
3371         if (!str)
3372                 return 0;
3373
3374         ret = kstrtoul(str, 0, &rhash_entries);
3375         if (ret)
3376                 return 0;
3377
3378         return 1;
3379 }
3380 __setup("rhash_entries=", set_rhash_entries);
3381
3382 int __init ip_rt_init(void)
3383 {
3384         int rc = 0;
3385
3386 #ifdef CONFIG_IP_ROUTE_CLASSID
3387         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3388         if (!ip_rt_acct)
3389                 panic("IP: failed to allocate ip_rt_acct\n");
3390 #endif
3391
3392         ipv4_dst_ops.kmem_cachep =
3393                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3394                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3395
3396         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3397
3398         if (dst_entries_init(&ipv4_dst_ops) < 0)
3399                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3400
3401         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3402                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3403
3404         rt_hash_table = (struct rt_hash_bucket *)
3405                 alloc_large_system_hash("IP route cache",
3406                                         sizeof(struct rt_hash_bucket),
3407                                         rhash_entries,
3408                                         (totalram_pages >= 128 * 1024) ?
3409                                         15 : 17,
3410                                         0,
3411                                         &rt_hash_log,
3412                                         &rt_hash_mask,
3413                                         0,
3414                                         rhash_entries ? 0 : 512 * 1024);
3415         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3416         rt_hash_lock_init();
3417
3418         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3419         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3420
3421         devinet_init();
3422         ip_fib_init();
3423
3424         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3425         expires_ljiffies = jiffies;
3426         schedule_delayed_work(&expires_work,
3427                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3428
3429         if (ip_rt_proc_init())
3430                 pr_err("Unable to create route proc files\n");
3431 #ifdef CONFIG_XFRM
3432         xfrm_init();
3433         xfrm4_init(ip_rt_max_size);
3434 #endif
3435         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3436
3437 #ifdef CONFIG_SYSCTL
3438         register_pernet_subsys(&sysctl_route_ops);
3439 #endif
3440         register_pernet_subsys(&rt_genid_ops);
3441         register_pernet_subsys(&ipv4_inetpeer_ops);
3442         return rc;
3443 }
3444
3445 #ifdef CONFIG_SYSCTL
3446 /*
3447  * We really need to sanitize the damn ipv4 init order, then all
3448  * this nonsense will go away.
3449  */
3450 void __init ip_static_sysctl_init(void)
3451 {
3452         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3453 }
3454 #endif