net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #define pr_fmt(fmt) "IPv4: " fmt
  66
  67 #include <linux/module.h>
  68 #include <asm/uaccess.h>
  69 #include <linux/bitops.h>
  70 #include <linux/types.h>
  71 #include <linux/kernel.h>
  72 #include <linux/mm.h>
  73 #include <linux/bootmem.h>
  74 #include <linux/string.h>
  75 #include <linux/socket.h>
  76 #include <linux/sockios.h>
  77 #include <linux/errno.h>
  78 #include <linux/in.h>
  79 #include <linux/inet.h>
  80 #include <linux/netdevice.h>
  81 #include <linux/proc_fs.h>
  82 #include <linux/init.h>
  83 #include <linux/workqueue.h>
  84 #include <linux/skbuff.h>
  85 #include <linux/inetdevice.h>
  86 #include <linux/igmp.h>
  87 #include <linux/pkt_sched.h>
  88 #include <linux/mroute.h>
  89 #include <linux/netfilter_ipv4.h>
  90 #include <linux/random.h>
  91 #include <linux/jhash.h>
  92 #include <linux/rcupdate.h>
  93 #include <linux/times.h>
  94 #include <linux/slab.h>
  95 #include <linux/prefetch.h>
  96 #include <net/dst.h>
  97 #include <net/net_namespace.h>
  98 #include <net/protocol.h>
  99 #include <net/ip.h>
 100 #include <net/route.h>
 101 #include <net/inetpeer.h>
 102 #include <net/sock.h>
 103 #include <net/ip_fib.h>
 104 #include <net/arp.h>
 105 #include <net/tcp.h>
 106 #include <net/icmp.h>
 107 #include <net/xfrm.h>
 108 #include <net/netevent.h>
 109 #include <net/rtnetlink.h>
 110 #ifdef CONFIG_SYSCTL
 111 #include <linux/sysctl.h>
 112 #include <linux/kmemleak.h>
 113 #endif
 114 #include <net/secure_seq.h>
 115
 116 #define RT_FL_TOS(oldflp4) \
 117         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 118
 119 #define IP_MAX_MTU      0xFFF0
 120
 121 #define RT_GC_TIMEOUT (300*HZ)
 122
 123 static int ip_rt_max_size;
 124 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 125 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
 126 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
 127 static int ip_rt_redirect_number __read_mostly  = 9;
 128 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 129 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 130 static int ip_rt_error_cost __read_mostly       = HZ;
 131 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 132 static int ip_rt_gc_elasticity __read_mostly    = 8;
 133 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 134 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 135 static int ip_rt_min_advmss __read_mostly       = 256;
 136 static int rt_chain_length_max __read_mostly    = 20;
 137
 138 static struct delayed_work expires_work;
 139 static unsigned long expires_ljiffies;
 140
 141 /*
 142  *      Interface to generic destination cache.
 143  */
 144
 145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 146 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 147 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 148 static void              ipv4_dst_destroy(struct dst_entry *dst);
 149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 150 static void              ipv4_link_failure(struct sk_buff *skb);
 151 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 152 static int rt_garbage_collect(struct dst_ops *ops);
 153
 154 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 155                             int how)
 156 {
 157 }
 158
 159 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 160 {
 161         struct rtable *rt = (struct rtable *) dst;
 162         struct inet_peer *peer;
 163         u32 *p = NULL;
 164
 165         peer = rt_get_peer_create(rt, rt->rt_dst);
 166         if (peer) {
 167                 u32 *old_p = __DST_METRICS_PTR(old);
 168                 unsigned long prev, new;
 169
 170                 p = peer->metrics;
 171                 if (inet_metrics_new(peer))
 172                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
 173
 174                 new = (unsigned long) p;
 175                 prev = cmpxchg(&dst->_metrics, old, new);
 176
 177                 if (prev != old) {
 178                         p = __DST_METRICS_PTR(prev);
 179                         if (prev & DST_METRICS_READ_ONLY)
 180                                 p = NULL;
 181                 } else {
 182                         if (rt->fi) {
 183                                 fib_info_put(rt->fi);
 184                                 rt->fi = NULL;
 185                         }
 186                 }
 187         }
 188         return p;
 189 }
 190
 191 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
 192
 193 static struct dst_ops ipv4_dst_ops = {
 194         .family =               AF_INET,
 195         .protocol =             cpu_to_be16(ETH_P_IP),
 196         .gc =                   rt_garbage_collect,
 197         .check =                ipv4_dst_check,
 198         .default_advmss =       ipv4_default_advmss,
 199         .mtu =                  ipv4_mtu,
 200         .cow_metrics =          ipv4_cow_metrics,
 201         .destroy =              ipv4_dst_destroy,
 202         .ifdown =               ipv4_dst_ifdown,
 203         .negative_advice =      ipv4_negative_advice,
 204         .link_failure =         ipv4_link_failure,
 205         .update_pmtu =          ip_rt_update_pmtu,
 206         .local_out =            __ip_local_out,
 207         .neigh_lookup =         ipv4_neigh_lookup,
 208 };
 209
 210 #define ECN_OR_COST(class)      TC_PRIO_##class
 211
 212 const __u8 ip_tos2prio[16] = {
 213         TC_PRIO_BESTEFFORT,
 214         ECN_OR_COST(BESTEFFORT),
 215         TC_PRIO_BESTEFFORT,
 216         ECN_OR_COST(BESTEFFORT),
 217         TC_PRIO_BULK,
 218         ECN_OR_COST(BULK),
 219         TC_PRIO_BULK,
 220         ECN_OR_COST(BULK),
 221         TC_PRIO_INTERACTIVE,
 222         ECN_OR_COST(INTERACTIVE),
 223         TC_PRIO_INTERACTIVE,
 224         ECN_OR_COST(INTERACTIVE),
 225         TC_PRIO_INTERACTIVE_BULK,
 226         ECN_OR_COST(INTERACTIVE_BULK),
 227         TC_PRIO_INTERACTIVE_BULK,
 228         ECN_OR_COST(INTERACTIVE_BULK)
 229 };
 230 EXPORT_SYMBOL(ip_tos2prio);
 231
 232 /*
 233  * Route cache.
 234  */
 235
/* The locking scheme is rather straightforward:
 237  *
 238  * 1) Read-Copy Update protects the buckets of the central route hash.
 239  * 2) Only writers remove entries, and they hold the lock
 240  *    as they look at rtable reference counts.
 241  * 3) Only readers acquire references to rtable entries,
 242  *    they do so with atomic increments and with the
 243  *    lock held.
 244  */
 245
 246 struct rt_hash_bucket {
 247         struct rtable __rcu     *chain;
 248 };
 249
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
        defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 * The size of this table is a power of two and depends on the number of CPUS.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ        256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ       4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ       2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ       1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ       512
# else
#  define RT_HASH_LOCK_SZ       256
# endif
#endif

/* Lock table, allocated by rt_hash_lock_init(). */
static spinlock_t       *rt_hash_locks;
/*
 * Address of the spinlock covering hash slot @slot.  The slot number is
 * masked with RT_HASH_LOCK_SZ - 1, so several slots share one lock.
 */
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 275
 276 static __init void rt_hash_lock_init(void)
 277 {
 278         int i;
 279
 280         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 281                         GFP_KERNEL);
 282         if (!rt_hash_locks)
 283                 panic("IP: failed to allocate rt_hash_locks\n");
 284
 285         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 286                 spin_lock_init(&rt_hash_locks[i]);
 287 }
 288 #else
 289 # define rt_hash_lock_addr(slot) NULL
 290
 291 static inline void rt_hash_lock_init(void)
 292 {
 293 }
 294 #endif
 295
 296 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
 297 static unsigned int             rt_hash_mask __read_mostly;
 298 static unsigned int             rt_hash_log  __read_mostly;
 299
 300 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 301 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 302
 303 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 304                                    int genid)
 305 {
 306         return jhash_3words((__force u32)daddr, (__force u32)saddr,
 307                             idx, genid)
 308                 & rt_hash_mask;
 309 }
 310
 311 static inline int rt_genid(struct net *net)
 312 {
 313         return atomic_read(&net->ipv4.rt_genid);
 314 }
 315
 316 #ifdef CONFIG_PROC_FS
 317 struct rt_cache_iter_state {
 318         struct seq_net_private p;
 319         int bucket;
 320         int genid;
 321 };
 322
 323 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 324 {
 325         struct rt_cache_iter_state *st = seq->private;
 326         struct rtable *r = NULL;
 327
 328         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 329                 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
 330                         continue;
 331                 rcu_read_lock_bh();
 332                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 333                 while (r) {
 334                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
 335                             r->rt_genid == st->genid)
 336                                 return r;
 337                         r = rcu_dereference_bh(r->dst.rt_next);
 338                 }
 339                 rcu_read_unlock_bh();
 340         }
 341         return r;
 342 }
 343
 344 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 345                                           struct rtable *r)
 346 {
 347         struct rt_cache_iter_state *st = seq->private;
 348
 349         r = rcu_dereference_bh(r->dst.rt_next);
 350         while (!r) {
 351                 rcu_read_unlock_bh();
 352                 do {
 353                         if (--st->bucket < 0)
 354                                 return NULL;
 355                 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
 356                 rcu_read_lock_bh();
 357                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 358         }
 359         return r;
 360 }
 361
 362 static struct rtable *rt_cache_get_next(struct seq_file *seq,
 363                                         struct rtable *r)
 364 {
 365         struct rt_cache_iter_state *st = seq->private;
 366         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 367                 if (dev_net(r->dst.dev) != seq_file_net(seq))
 368                         continue;
 369                 if (r->rt_genid == st->genid)
 370                         break;
 371         }
 372         return r;
 373 }
 374
 375 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 376 {
 377         struct rtable *r = rt_cache_get_first(seq);
 378
 379         if (r)
 380                 while (pos && (r = rt_cache_get_next(seq, r)))
 381                         --pos;
 382         return pos ? NULL : r;
 383 }
 384
 385 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 386 {
 387         struct rt_cache_iter_state *st = seq->private;
 388         if (*pos)
 389                 return rt_cache_get_idx(seq, *pos - 1);
 390         st->genid = rt_genid(seq_file_net(seq));
 391         return SEQ_START_TOKEN;
 392 }
 393
 394 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 395 {
 396         struct rtable *r;
 397
 398         if (v == SEQ_START_TOKEN)
 399                 r = rt_cache_get_first(seq);
 400         else
 401                 r = rt_cache_get_next(seq, v);
 402         ++*pos;
 403         return r;
 404 }
 405
 406 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 407 {
 408         if (v && v != SEQ_START_TOKEN)
 409                 rcu_read_unlock_bh();
 410 }
 411
 412 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 413 {
 414         if (v == SEQ_START_TOKEN)
 415                 seq_printf(seq, "%-127s\n",
 416                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 417                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 418                            "HHUptod\tSpecDst");
 419         else {
 420                 struct rtable *r = v;
 421                 struct neighbour *n;
 422                 int len, HHUptod;
 423
 424                 rcu_read_lock();
 425                 n = dst_get_neighbour_noref(&r->dst);
 426                 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
 427                 rcu_read_unlock();
 428
 429                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
 430                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 431                         r->dst.dev ? r->dst.dev->name : "*",
 432                         (__force u32)r->rt_dst,
 433                         (__force u32)r->rt_gateway,
 434                         r->rt_flags, atomic_read(&r->dst.__refcnt),
 435                         r->dst.__use, 0, (__force u32)r->rt_src,
 436                         dst_metric_advmss(&r->dst) + 40,
 437                         dst_metric(&r->dst, RTAX_WINDOW),
 438                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
 439                               dst_metric(&r->dst, RTAX_RTTVAR)),
 440                         r->rt_key_tos,
 441                         -1,
 442                         HHUptod,
 443                         r->rt_spec_dst, &len);
 444
 445                 seq_printf(seq, "%*s\n", 127 - len, "");
 446         }
 447         return 0;
 448 }
 449
 450 static const struct seq_operations rt_cache_seq_ops = {
 451         .start  = rt_cache_seq_start,
 452         .next   = rt_cache_seq_next,
 453         .stop   = rt_cache_seq_stop,
 454         .show   = rt_cache_seq_show,
 455 };
 456
 457 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 458 {
 459         return seq_open_net(inode, file, &rt_cache_seq_ops,
 460                         sizeof(struct rt_cache_iter_state));
 461 }
 462
 463 static const struct file_operations rt_cache_seq_fops = {
 464         .owner   = THIS_MODULE,
 465         .open    = rt_cache_seq_open,
 466         .read    = seq_read,
 467         .llseek  = seq_lseek,
 468         .release = seq_release_net,
 469 };
 470
 471
 472 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 473 {
 474         int cpu;
 475
 476         if (*pos == 0)
 477                 return SEQ_START_TOKEN;
 478
 479         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 480                 if (!cpu_possible(cpu))
 481                         continue;
 482                 *pos = cpu+1;
 483                 return &per_cpu(rt_cache_stat, cpu);
 484         }
 485         return NULL;
 486 }
 487
 488 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 489 {
 490         int cpu;
 491
 492         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 493                 if (!cpu_possible(cpu))
 494                         continue;
 495                 *pos = cpu+1;
 496                 return &per_cpu(rt_cache_stat, cpu);
 497         }
 498         return NULL;
 499
 500 }
 501
 502 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 503 {
 504
 505 }
 506
 507 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 508 {
 509         struct rt_cache_stat *st = v;
 510
 511         if (v == SEQ_START_TOKEN) {
 512                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 513                 return 0;
 514         }
 515
 516         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 517                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 518                    dst_entries_get_slow(&ipv4_dst_ops),
 519                    st->in_hit,
 520                    st->in_slow_tot,
 521                    st->in_slow_mc,
 522                    st->in_no_route,
 523                    st->in_brd,
 524                    st->in_martian_dst,
 525                    st->in_martian_src,
 526
 527                    st->out_hit,
 528                    st->out_slow_tot,
 529                    st->out_slow_mc,
 530
 531                    st->gc_total,
 532                    st->gc_ignored,
 533                    st->gc_goal_miss,
 534                    st->gc_dst_overflow,
 535                    st->in_hlist_search,
 536                    st->out_hlist_search
 537                 );
 538         return 0;
 539 }
 540
 541 static const struct seq_operations rt_cpu_seq_ops = {
 542         .start  = rt_cpu_seq_start,
 543         .next   = rt_cpu_seq_next,
 544         .stop   = rt_cpu_seq_stop,
 545         .show   = rt_cpu_seq_show,
 546 };
 547
 548
 549 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 550 {
 551         return seq_open(file, &rt_cpu_seq_ops);
 552 }
 553
 554 static const struct file_operations rt_cpu_seq_fops = {
 555         .owner   = THIS_MODULE,
 556         .open    = rt_cpu_seq_open,
 557         .read    = seq_read,
 558         .llseek  = seq_lseek,
 559         .release = seq_release,
 560 };
 561
 562 #ifdef CONFIG_IP_ROUTE_CLASSID
 563 static int rt_acct_proc_show(struct seq_file *m, void *v)
 564 {
 565         struct ip_rt_acct *dst, *src;
 566         unsigned int i, j;
 567
 568         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 569         if (!dst)
 570                 return -ENOMEM;
 571
 572         for_each_possible_cpu(i) {
 573                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 574                 for (j = 0; j < 256; j++) {
 575                         dst[j].o_bytes   += src[j].o_bytes;
 576                         dst[j].o_packets += src[j].o_packets;
 577                         dst[j].i_bytes   += src[j].i_bytes;
 578                         dst[j].i_packets += src[j].i_packets;
 579                 }
 580         }
 581
 582         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 583         kfree(dst);
 584         return 0;
 585 }
 586
 587 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 588 {
 589         return single_open(file, rt_acct_proc_show, NULL);
 590 }
 591
 592 static const struct file_operations rt_acct_proc_fops = {
 593         .owner          = THIS_MODULE,
 594         .open           = rt_acct_proc_open,
 595         .read           = seq_read,
 596         .llseek         = seq_lseek,
 597         .release        = single_release,
 598 };
 599 #endif
 600
 601 static int __net_init ip_rt_do_proc_init(struct net *net)
 602 {
 603         struct proc_dir_entry *pde;
 604
 605         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 606                         &rt_cache_seq_fops);
 607         if (!pde)
 608                 goto err1;
 609
 610         pde = proc_create("rt_cache", S_IRUGO,
 611                           net->proc_net_stat, &rt_cpu_seq_fops);
 612         if (!pde)
 613                 goto err2;
 614
 615 #ifdef CONFIG_IP_ROUTE_CLASSID
 616         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 617         if (!pde)
 618                 goto err3;
 619 #endif
 620         return 0;
 621
 622 #ifdef CONFIG_IP_ROUTE_CLASSID
 623 err3:
 624         remove_proc_entry("rt_cache", net->proc_net_stat);
 625 #endif
 626 err2:
 627         remove_proc_entry("rt_cache", net->proc_net);
 628 err1:
 629         return -ENOMEM;
 630 }
 631
 632 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 633 {
 634         remove_proc_entry("rt_cache", net->proc_net_stat);
 635         remove_proc_entry("rt_cache", net->proc_net);
 636 #ifdef CONFIG_IP_ROUTE_CLASSID
 637         remove_proc_entry("rt_acct", net->proc_net);
 638 #endif
 639 }
 640
 641 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 642         .init = ip_rt_do_proc_init,
 643         .exit = ip_rt_do_proc_exit,
 644 };
 645
 646 static int __init ip_rt_proc_init(void)
 647 {
 648         return register_pernet_subsys(&ip_rt_proc_ops);
 649 }
 650
 651 #else
 652 static inline int ip_rt_proc_init(void)
 653 {
 654         return 0;
 655 }
 656 #endif /* CONFIG_PROC_FS */
 657
 658 static inline void rt_free(struct rtable *rt)
 659 {
 660         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 661 }
 662
 663 static inline void rt_drop(struct rtable *rt)
 664 {
 665         ip_rt_put(rt);
 666         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 667 }
 668
 669 static inline int rt_fast_clean(struct rtable *rth)
 670 {
 671         /* Kill broadcast/multicast entries very aggresively, if they
 672            collide in hash table with more useful entries */
 673         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 674                 rt_is_input_route(rth) && rth->dst.rt_next;
 675 }
 676
 677 static inline int rt_valuable(struct rtable *rth)
 678 {
 679         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 680                 (rt_has_peer(rth) && rt_peer_ptr(rth)->pmtu_expires);
 681 }
 682
 683 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 684 {
 685         unsigned long age;
 686         int ret = 0;
 687
 688         if (atomic_read(&rth->dst.__refcnt))
 689                 goto out;
 690
 691         age = jiffies - rth->dst.lastuse;
 692         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 693             (age <= tmo2 && rt_valuable(rth)))
 694                 goto out;
 695         ret = 1;
 696 out:    return ret;
 697 }
 698
 699 /* Bits of score are:
 700  * 31: very valuable
 701  * 30: not quite useless
 702  * 29..0: usage counter
 703  */
 704 static inline u32 rt_score(struct rtable *rt)
 705 {
 706         u32 score = jiffies - rt->dst.lastuse;
 707
 708         score = ~score & ~(3<<30);
 709
 710         if (rt_valuable(rt))
 711                 score |= (1<<31);
 712
 713         if (rt_is_output_route(rt) ||
 714             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 715                 score |= (1<<30);
 716
 717         return score;
 718 }
 719
 720 static inline bool rt_caching(const struct net *net)
 721 {
 722         return net->ipv4.current_rt_cache_rebuild_count <=
 723                 net->ipv4.sysctl_rt_cache_rebuild_count;
 724 }
 725
 726 static inline bool compare_hash_inputs(const struct rtable *rt1,
 727                                        const struct rtable *rt2)
 728 {
 729         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 730                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 731                 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
 732 }
 733
 734 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
 735 {
 736         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 737                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 738                 (rt1->rt_mark ^ rt2->rt_mark) |
 739                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
 740                 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
 741                 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
 742 }
 743
 744 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 745 {
 746         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
 747 }
 748
 749 static inline int rt_is_expired(struct rtable *rth)
 750 {
 751         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 752 }
 753
 754 /*
 755  * Perform a full scan of hash table and free all entries.
 756  * Can be called by a softirq or a process.
 757  * In the later case, we want to be reschedule if necessary
 758  */
 759 static void rt_do_flush(struct net *net, int process_context)
 760 {
 761         unsigned int i;
 762         struct rtable *rth, *next;
 763
 764         for (i = 0; i <= rt_hash_mask; i++) {
 765                 struct rtable __rcu **pprev;
 766                 struct rtable *list;
 767
 768                 if (process_context && need_resched())
 769                         cond_resched();
 770                 rth = rcu_access_pointer(rt_hash_table[i].chain);
 771                 if (!rth)
 772                         continue;
 773
 774                 spin_lock_bh(rt_hash_lock_addr(i));
 775
 776                 list = NULL;
 777                 pprev = &rt_hash_table[i].chain;
 778                 rth = rcu_dereference_protected(*pprev,
 779                         lockdep_is_held(rt_hash_lock_addr(i)));
 780
 781                 while (rth) {
 782                         next = rcu_dereference_protected(rth->dst.rt_next,
 783                                 lockdep_is_held(rt_hash_lock_addr(i)));
 784
 785                         if (!net ||
 786                             net_eq(dev_net(rth->dst.dev), net)) {
 787                                 rcu_assign_pointer(*pprev, next);
 788                                 rcu_assign_pointer(rth->dst.rt_next, list);
 789                                 list = rth;
 790                         } else {
 791                                 pprev = &rth->dst.rt_next;
 792                         }
 793                         rth = next;
 794                 }
 795
 796                 spin_unlock_bh(rt_hash_lock_addr(i));
 797
 798                 for (; list; list = next) {
 799                         next = rcu_dereference_protected(list->dst.rt_next, 1);
 800                         rt_free(list);
 801                 }
 802         }
 803 }
 804
 805 /*
 * While freeing expired entries, we compute the average chain length
 * and its standard deviation, using fixed-point arithmetic.
 * This gives an estimate for rt_chain_length_max:
 *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
 811  */
 812
/* Number of fractional bits in the fixed-point chain-length values. */
#define FRACT_BITS 3
/* The value 1 in that fixed-point representation. */
#define ONE (1UL << FRACT_BITS)
 815
 816 /*
 817  * Given a hash chain and an item in this hash chain,
 818  * find if a previous entry has the same hash_inputs
 819  * (but differs on tos, mark or oif)
 820  * Returns 0 if an alias is found.
 821  * Returns ONE if rth has no alias before itself.
 822  */
 823 static int has_noalias(const struct rtable *head, const struct rtable *rth)
 824 {
 825         const struct rtable *aux = head;
 826
 827         while (aux != rth) {
 828                 if (compare_hash_inputs(aux, rth))
 829                         return 0;
 830                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
 831         }
 832         return ONE;
 833 }
 834
 835 static void rt_check_expire(void)
 836 {
 837         static unsigned int rover;
 838         unsigned int i = rover, goal;
 839         struct rtable *rth;
 840         struct rtable __rcu **rthp;
 841         unsigned long samples = 0;
 842         unsigned long sum = 0, sum2 = 0;
 843         unsigned long delta;
 844         u64 mult;
 845
 846         delta = jiffies - expires_ljiffies;
 847         expires_ljiffies = jiffies;
 848         mult = ((u64)delta) << rt_hash_log;
 849         if (ip_rt_gc_timeout > 1)
 850                 do_div(mult, ip_rt_gc_timeout);
 851         goal = (unsigned int)mult;
 852         if (goal > rt_hash_mask)
 853                 goal = rt_hash_mask + 1;
 854         for (; goal > 0; goal--) {
 855                 unsigned long tmo = ip_rt_gc_timeout;
 856                 unsigned long length;
 857
 858                 i = (i + 1) & rt_hash_mask;
 859                 rthp = &rt_hash_table[i].chain;
 860
 861                 if (need_resched())
 862                         cond_resched();
 863
 864                 samples++;
 865
 866                 if (rcu_dereference_raw(*rthp) == NULL)
 867                         continue;
 868                 length = 0;
 869                 spin_lock_bh(rt_hash_lock_addr(i));
 870                 while ((rth = rcu_dereference_protected(*rthp,
 871                                         lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
 872                         prefetch(rth->dst.rt_next);
 873                         if (rt_is_expired(rth)) {
 874                                 *rthp = rth->dst.rt_next;
 875                                 rt_free(rth);
 876                                 continue;
 877                         }
 878                         if (rth->dst.expires) {
 879                                 /* Entry is expired even if it is in use */
 880                                 if (time_before_eq(jiffies, rth->dst.expires)) {
 881 nofree:
 882                                         tmo >>= 1;
 883                                         rthp = &rth->dst.rt_next;
 884                                         /*
 885                                          * We only count entries on
 886                                          * a chain with equal hash inputs once
 887                                          * so that entries for different QOS
 888                                          * levels, and other non-hash input
 889                                          * attributes don't unfairly skew
 890                                          * the length computation
 891                                          */
 892                                         length += has_noalias(rt_hash_table[i].chain, rth);
 893                                         continue;
 894                                 }
 895                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
 896                                 goto nofree;
 897
 898                         /* Cleanup aged off entries. */
 899                         *rthp = rth->dst.rt_next;
 900                         rt_free(rth);
 901                 }
 902                 spin_unlock_bh(rt_hash_lock_addr(i));
 903                 sum += length;
 904                 sum2 += length*length;
 905         }
 906         if (samples) {
 907                 unsigned long avg = sum / samples;
 908                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
 909                 rt_chain_length_max = max_t(unsigned long,
 910                                         ip_rt_gc_elasticity,
 911                                         (avg + 4*sd) >> FRACT_BITS);
 912         }
 913         rover = i;
 914 }
 915
 916 /*
 917  * rt_worker_func() is run in process context.
 918  * we call rt_check_expire() to scan part of the hash table
 919  */
 920 static void rt_worker_func(struct work_struct *work)
 921 {
 922         rt_check_expire();
 923         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 924 }
 925
 926 /*
 927  * Perturbation of rt_genid by a small quantity [1..256]
 928  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
 929  * many times (2^24) without giving recent rt_genid.
 930  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
 931  */
 932 static void rt_cache_invalidate(struct net *net)
 933 {
 934         unsigned char shuffle;
 935
 936         get_random_bytes(&shuffle, sizeof(shuffle));
 937         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 938         inetpeer_invalidate_family(AF_INET);
 939 }
 940
 941 /*
 942  * delay < 0  : invalidate cache (fast : entries will be deleted later)
 943  * delay >= 0 : invalidate & flush cache (can be long)
 944  */
 945 void rt_cache_flush(struct net *net, int delay)
 946 {
 947         rt_cache_invalidate(net);
 948         if (delay >= 0)
 949                 rt_do_flush(net, !in_softirq());
 950 }
 951
 952 /* Flush previous cache invalidated entries from the cache */
 953 void rt_cache_flush_batch(struct net *net)
 954 {
 955         rt_do_flush(net, !in_softirq());
 956 }
 957
 958 static void rt_emergency_hash_rebuild(struct net *net)
 959 {
 960         net_warn_ratelimited("Route hash chain too long!\n");
 961         rt_cache_invalidate(net);
 962 }
 963
 964 /*
 965    Short description of GC goals.
 966
 967    We want to build algorithm, which will keep routing cache
 968    at some equilibrium point, when number of aged off entries
 969    is kept approximately equal to newly generated ones.
 970
 971    Current expiration strength is variable "expire".
 972    We try to adjust it dynamically, so that if networking
 973    is idle expires is large enough to keep enough of warm entries,
 974    and when load increases it reduces to limit cache size.
 975  */
 976
 977 static int rt_garbage_collect(struct dst_ops *ops)
 978 {
 979         static unsigned long expire = RT_GC_TIMEOUT;
 980         static unsigned long last_gc;
 981         static int rover;
 982         static int equilibrium;
 983         struct rtable *rth;
 984         struct rtable __rcu **rthp;
 985         unsigned long now = jiffies;
 986         int goal;
 987         int entries = dst_entries_get_fast(&ipv4_dst_ops);
 988
 989         /*
 990          * Garbage collection is pretty expensive,
 991          * do not make it too frequently.
 992          */
 993
 994         RT_CACHE_STAT_INC(gc_total);
 995
 996         if (now - last_gc < ip_rt_gc_min_interval &&
 997             entries < ip_rt_max_size) {
 998                 RT_CACHE_STAT_INC(gc_ignored);
 999                 goto out;
1000         }
1001
1002         entries = dst_entries_get_slow(&ipv4_dst_ops);
1003         /* Calculate number of entries, which we want to expire now. */
1004         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1005         if (goal <= 0) {
1006                 if (equilibrium < ipv4_dst_ops.gc_thresh)
1007                         equilibrium = ipv4_dst_ops.gc_thresh;
1008                 goal = entries - equilibrium;
1009                 if (goal > 0) {
1010                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1011                         goal = entries - equilibrium;
1012                 }
1013         } else {
1014                 /* We are in dangerous area. Try to reduce cache really
1015                  * aggressively.
1016                  */
1017                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1018                 equilibrium = entries - goal;
1019         }
1020
1021         if (now - last_gc >= ip_rt_gc_min_interval)
1022                 last_gc = now;
1023
1024         if (goal <= 0) {
1025                 equilibrium += goal;
1026                 goto work_done;
1027         }
1028
1029         do {
1030                 int i, k;
1031
1032                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1033                         unsigned long tmo = expire;
1034
1035                         k = (k + 1) & rt_hash_mask;
1036                         rthp = &rt_hash_table[k].chain;
1037                         spin_lock_bh(rt_hash_lock_addr(k));
1038                         while ((rth = rcu_dereference_protected(*rthp,
1039                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1040                                 if (!rt_is_expired(rth) &&
1041                                         !rt_may_expire(rth, tmo, expire)) {
1042                                         tmo >>= 1;
1043                                         rthp = &rth->dst.rt_next;
1044                                         continue;
1045                                 }
1046                                 *rthp = rth->dst.rt_next;
1047                                 rt_free(rth);
1048                                 goal--;
1049                         }
1050                         spin_unlock_bh(rt_hash_lock_addr(k));
1051                         if (goal <= 0)
1052                                 break;
1053                 }
1054                 rover = k;
1055
1056                 if (goal <= 0)
1057                         goto work_done;
1058
1059                 /* Goal is not achieved. We stop process if:
1060
1061                    - if expire reduced to zero. Otherwise, expire is halfed.
1062                    - if table is not full.
1063                    - if we are called from interrupt.
1064                    - jiffies check is just fallback/debug loop breaker.
1065                      We will not spin here for long time in any case.
1066                  */
1067
1068                 RT_CACHE_STAT_INC(gc_goal_miss);
1069
1070                 if (expire == 0)
1071                         break;
1072
1073                 expire >>= 1;
1074
1075                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1076                         goto out;
1077         } while (!in_softirq() && time_before_eq(jiffies, now));
1078
1079         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1080                 goto out;
1081         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1082                 goto out;
1083         net_warn_ratelimited("dst cache overflow\n");
1084         RT_CACHE_STAT_INC(gc_dst_overflow);
1085         return 1;
1086
1087 work_done:
1088         expire += ip_rt_gc_min_interval;
1089         if (expire > ip_rt_gc_timeout ||
1090             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1091             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1092                 expire = ip_rt_gc_timeout;
1093 out:    return 0;
1094 }
1095
1096 /*
1097  * Returns number of entries in a hash chain that have different hash_inputs
1098  */
1099 static int slow_chain_length(const struct rtable *head)
1100 {
1101         int length = 0;
1102         const struct rtable *rth = head;
1103
1104         while (rth) {
1105                 length += has_noalias(head, rth);
1106                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1107         }
1108         return length >> FRACT_BITS;
1109 }
1110
1111 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1112 {
1113         static const __be32 inaddr_any = 0;
1114         struct net_device *dev = dst->dev;
1115         const __be32 *pkey = daddr;
1116         const struct rtable *rt;
1117         struct neighbour *n;
1118
1119         rt = (const struct rtable *) dst;
1120
1121         if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1122                 pkey = &inaddr_any;
1123         else if (rt->rt_gateway)
1124                 pkey = (const __be32 *) &rt->rt_gateway;
1125
1126         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
1127         if (n)
1128                 return n;
1129         return neigh_create(&arp_tbl, pkey, dev);
1130 }
1131
1132 static int rt_bind_neighbour(struct rtable *rt)
1133 {
1134         struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1135         if (IS_ERR(n))
1136                 return PTR_ERR(n);
1137         dst_set_neighbour(&rt->dst, n);
1138
1139         return 0;
1140 }
1141
1142 static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
1143                                      struct sk_buff *skb, int ifindex)
1144 {
1145         struct rtable   *rth, *cand;
1146         struct rtable __rcu **rthp, **candp;
1147         unsigned long   now;
1148         u32             min_score;
1149         int             chain_length;
1150         int attempts = !in_softirq();
1151
1152 restart:
1153         chain_length = 0;
1154         min_score = ~(u32)0;
1155         cand = NULL;
1156         candp = NULL;
1157         now = jiffies;
1158
1159         if (!rt_caching(dev_net(rt->dst.dev))) {
1160                 /*
1161                  * If we're not caching, just tell the caller we
1162                  * were successful and don't touch the route.  The
1163                  * caller hold the sole reference to the cache entry, and
1164                  * it will be released when the caller is done with it.
1165                  * If we drop it here, the callers have no way to resolve routes
1166                  * when we're not caching.  Instead, just point *rp at rt, so
1167                  * the caller gets a single use out of the route
1168                  * Note that we do rt_free on this new route entry, so that
1169                  * once its refcount hits zero, we are still able to reap it
1170                  * (Thanks Alexey)
1171                  * Note: To avoid expensive rcu stuff for this uncached dst,
1172                  * we set DST_NOCACHE so that dst_release() can free dst without
1173                  * waiting a grace period.
1174                  */
1175
1176                 rt->dst.flags |= DST_NOCACHE;
1177                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1178                         int err = rt_bind_neighbour(rt);
1179                         if (err) {
1180                                 net_warn_ratelimited("Neighbour table failure & not caching routes\n");
1181                                 ip_rt_put(rt);
1182                                 return ERR_PTR(err);
1183                         }
1184                 }
1185
1186                 goto skip_hashing;
1187         }
1188
1189         rthp = &rt_hash_table[hash].chain;
1190
1191         spin_lock_bh(rt_hash_lock_addr(hash));
1192         while ((rth = rcu_dereference_protected(*rthp,
1193                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1194                 if (rt_is_expired(rth)) {
1195                         *rthp = rth->dst.rt_next;
1196                         rt_free(rth);
1197                         continue;
1198                 }
1199                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1200                         /* Put it first */
1201                         *rthp = rth->dst.rt_next;
1202                         /*
1203                          * Since lookup is lockfree, the deletion
1204                          * must be visible to another weakly ordered CPU before
1205                          * the insertion at the start of the hash chain.
1206                          */
1207                         rcu_assign_pointer(rth->dst.rt_next,
1208                                            rt_hash_table[hash].chain);
1209                         /*
1210                          * Since lookup is lockfree, the update writes
1211                          * must be ordered for consistency on SMP.
1212                          */
1213                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1214
1215                         dst_use(&rth->dst, now);
1216                         spin_unlock_bh(rt_hash_lock_addr(hash));
1217
1218                         rt_drop(rt);
1219                         if (skb)
1220                                 skb_dst_set(skb, &rth->dst);
1221                         return rth;
1222                 }
1223
1224                 if (!atomic_read(&rth->dst.__refcnt)) {
1225                         u32 score = rt_score(rth);
1226
1227                         if (score <= min_score) {
1228                                 cand = rth;
1229                                 candp = rthp;
1230                                 min_score = score;
1231                         }
1232                 }
1233
1234                 chain_length++;
1235
1236                 rthp = &rth->dst.rt_next;
1237         }
1238
1239         if (cand) {
1240                 /* ip_rt_gc_elasticity used to be average length of chain
1241                  * length, when exceeded gc becomes really aggressive.
1242                  *
1243                  * The second limit is less certain. At the moment it allows
1244                  * only 2 entries per bucket. We will see.
1245                  */
1246                 if (chain_length > ip_rt_gc_elasticity) {
1247                         *candp = cand->dst.rt_next;
1248                         rt_free(cand);
1249                 }
1250         } else {
1251                 if (chain_length > rt_chain_length_max &&
1252                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1253                         struct net *net = dev_net(rt->dst.dev);
1254                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1255                         if (!rt_caching(net)) {
1256                                 pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
1257                                         rt->dst.dev->name, num);
1258                         }
1259                         rt_emergency_hash_rebuild(net);
1260                         spin_unlock_bh(rt_hash_lock_addr(hash));
1261
1262                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1263                                         ifindex, rt_genid(net));
1264                         goto restart;
1265                 }
1266         }
1267
1268         /* Try to bind route to arp only if it is output
1269            route or unicast forwarding path.
1270          */
1271         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1272                 int err = rt_bind_neighbour(rt);
1273                 if (err) {
1274                         spin_unlock_bh(rt_hash_lock_addr(hash));
1275
1276                         if (err != -ENOBUFS) {
1277                                 rt_drop(rt);
1278                                 return ERR_PTR(err);
1279                         }
1280
1281                         /* Neighbour tables are full and nothing
1282                            can be released. Try to shrink route cache,
1283                            it is most likely it holds some neighbour records.
1284                          */
1285                         if (attempts-- > 0) {
1286                                 int saved_elasticity = ip_rt_gc_elasticity;
1287                                 int saved_int = ip_rt_gc_min_interval;
1288                                 ip_rt_gc_elasticity     = 1;
1289                                 ip_rt_gc_min_interval   = 0;
1290                                 rt_garbage_collect(&ipv4_dst_ops);
1291                                 ip_rt_gc_min_interval   = saved_int;
1292                                 ip_rt_gc_elasticity     = saved_elasticity;
1293                                 goto restart;
1294                         }
1295
1296                         net_warn_ratelimited("Neighbour table overflow\n");
1297                         rt_drop(rt);
1298                         return ERR_PTR(-ENOBUFS);
1299                 }
1300         }
1301
1302         rt->dst.rt_next = rt_hash_table[hash].chain;
1303
1304         /*
1305          * Since lookup is lockfree, we must make sure
1306          * previous writes to rt are committed to memory
1307          * before making rt visible to other CPUS.
1308          */
1309         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1310
1311         spin_unlock_bh(rt_hash_lock_addr(hash));
1312
1313 skip_hashing:
1314         if (skb)
1315                 skb_dst_set(skb, &rt->dst);
1316         return rt;
1317 }
1318
1319 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1320
1321 static u32 rt_peer_genid(void)
1322 {
1323         return atomic_read(&__rt_peer_genid);
1324 }
1325
1326 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1327 {
1328         struct inet_peer_base *base;
1329         struct inet_peer *peer;
1330
1331         base = inetpeer_base_ptr(rt->_peer);
1332         if (!base)
1333                 return;
1334
1335         peer = inet_getpeer_v4(base, daddr, create);
1336         if (peer) {
1337                 if (!rt_set_peer(rt, peer))
1338                         inet_putpeer(peer);
1339                 else
1340                         rt->rt_peer_genid = rt_peer_genid();
1341         }
1342 }
1343
1344 /*
1345  * Peer allocation may fail only in serious out-of-memory conditions.  However
1346  * we still can generate some output.
1347  * Random ID selection looks a bit dangerous because we have no chances to
1348  * select ID being unique in a reasonable period of time.
1349  * But broken packet identifier may be better than no packet at all.
1350  */
1351 static void ip_select_fb_ident(struct iphdr *iph)
1352 {
1353         static DEFINE_SPINLOCK(ip_fb_id_lock);
1354         static u32 ip_fallback_id;
1355         u32 salt;
1356
1357         spin_lock_bh(&ip_fb_id_lock);
1358         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1359         iph->id = htons(salt & 0xFFFF);
1360         ip_fallback_id = salt;
1361         spin_unlock_bh(&ip_fb_id_lock);
1362 }
1363
1364 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1365 {
1366         struct rtable *rt = (struct rtable *) dst;
1367
1368         if (rt && !(rt->dst.flags & DST_NOPEER)) {
1369                 struct inet_peer *peer = rt_get_peer_create(rt, rt->rt_dst);
1370
1371                 /* If peer is attached to destination, it is never detached,
1372                    so that we need not to grab a lock to dereference it.
1373                  */
1374                 if (peer) {
1375                         iph->id = htons(inet_getid(peer, more));
1376                         return;
1377                 }
1378         } else if (!rt)
1379                 pr_debug("rt_bind_peer(0) @%p\n", __builtin_return_address(0));
1380
1381         ip_select_fb_ident(iph);
1382 }
1383 EXPORT_SYMBOL(__ip_select_ident);
1384
1385 static void rt_del(unsigned int hash, struct rtable *rt)
1386 {
1387         struct rtable __rcu **rthp;
1388         struct rtable *aux;
1389
1390         rthp = &rt_hash_table[hash].chain;
1391         spin_lock_bh(rt_hash_lock_addr(hash));
1392         ip_rt_put(rt);
1393         while ((aux = rcu_dereference_protected(*rthp,
1394                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1395                 if (aux == rt || rt_is_expired(aux)) {
1396                         *rthp = aux->dst.rt_next;
1397                         rt_free(aux);
1398                         continue;
1399                 }
1400                 rthp = &aux->dst.rt_next;
1401         }
1402         spin_unlock_bh(rt_hash_lock_addr(hash));
1403 }
1404
1405 static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1406 {
1407         struct rtable *rt = (struct rtable *) dst;
1408         __be32 orig_gw = rt->rt_gateway;
1409         struct neighbour *n, *old_n;
1410
1411         dst_confirm(&rt->dst);
1412
1413         rt->rt_gateway = peer->redirect_learned.a4;
1414
1415         n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1416         if (IS_ERR(n)) {
1417                 rt->rt_gateway = orig_gw;
1418                 return;
1419         }
1420         old_n = xchg(&rt->dst._neighbour, n);
1421         if (old_n)
1422                 neigh_release(old_n);
1423         if (!(n->nud_state & NUD_VALID)) {
1424                 neigh_event_send(n, NULL);
1425         } else {
1426                 rt->rt_flags |= RTCF_REDIRECTED;
1427                 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1428         }
1429 }
1430
1431 /* called in rcu_read_lock() section */
1432 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1433                     __be32 saddr, struct net_device *dev)
1434 {
1435         int s, i;
1436         struct in_device *in_dev = __in_dev_get_rcu(dev);
1437         __be32 skeys[2] = { saddr, 0 };
1438         int    ikeys[2] = { dev->ifindex, 0 };
1439         struct inet_peer *peer;
1440         struct net *net;
1441
1442         if (!in_dev)
1443                 return;
1444
1445         net = dev_net(dev);
1446         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1447             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1448             ipv4_is_zeronet(new_gw))
1449                 goto reject_redirect;
1450
1451         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1452                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1453                         goto reject_redirect;
1454                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1455                         goto reject_redirect;
1456         } else {
1457                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1458                         goto reject_redirect;
1459         }
1460
1461         for (s = 0; s < 2; s++) {
1462                 for (i = 0; i < 2; i++) {
1463                         unsigned int hash;
1464                         struct rtable __rcu **rthp;
1465                         struct rtable *rt;
1466
1467                         hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1468
1469                         rthp = &rt_hash_table[hash].chain;
1470
1471                         while ((rt = rcu_dereference(*rthp)) != NULL) {
1472                                 rthp = &rt->dst.rt_next;
1473
1474                                 if (rt->rt_key_dst != daddr ||
1475                                     rt->rt_key_src != skeys[s] ||
1476                                     rt->rt_oif != ikeys[i] ||
1477                                     rt_is_input_route(rt) ||
1478                                     rt_is_expired(rt) ||
1479                                     !net_eq(dev_net(rt->dst.dev), net) ||
1480                                     rt->dst.error ||
1481                                     rt->dst.dev != dev ||
1482                                     rt->rt_gateway != old_gw)
1483                                         continue;
1484
1485                                 peer = rt_get_peer_create(rt, rt->rt_dst);
1486                                 if (peer) {
1487                                         if (peer->redirect_learned.a4 != new_gw) {
1488                                                 peer->redirect_learned.a4 = new_gw;
1489                                                 atomic_inc(&__rt_peer_genid);
1490                                         }
1491                                         check_peer_redir(&rt->dst, peer);
1492                                 }
1493                         }
1494                 }
1495         }
1496         return;
1497
1498 reject_redirect:
1499 #ifdef CONFIG_IP_ROUTE_VERBOSE
1500         if (IN_DEV_LOG_MARTIANS(in_dev))
1501                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
1502                                      "  Advised path = %pI4 -> %pI4\n",
1503                                      &old_gw, dev->name, &new_gw,
1504                                      &saddr, &daddr);
1505 #endif
1506         ;
1507 }
1508
1509 static bool peer_pmtu_expired(struct inet_peer *peer)
1510 {
1511         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1512
1513         return orig &&
1514                time_after_eq(jiffies, orig) &&
1515                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1516 }
1517
1518 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1519 {
1520         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1521
1522         return orig &&
1523                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1524 }
1525
1526 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1527 {
1528         struct rtable *rt = (struct rtable *)dst;
1529         struct dst_entry *ret = dst;
1530
1531         if (rt) {
1532                 if (dst->obsolete > 0) {
1533                         ip_rt_put(rt);
1534                         ret = NULL;
1535                 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1536                         unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1537                                                 rt->rt_oif,
1538                                                 rt_genid(dev_net(dst->dev)));
1539                         rt_del(hash, rt);
1540                         ret = NULL;
1541                 } else if (rt_has_peer(rt)) {
1542                         struct inet_peer *peer = rt_peer_ptr(rt);
1543                         if (peer_pmtu_expired(peer))
1544                                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1545                 }
1546         }
1547         return ret;
1548 }
1549
1550 /*
1551  * Algorithm:
1552  *      1. The first ip_rt_redirect_number redirects are sent
1553  *         with exponential backoff, then we stop sending them at all,
1554  *         assuming that the host ignores our redirects.
1555  *      2. If we did not see packets requiring redirects
1556  *         during ip_rt_redirect_silence, we assume that the host
1557  *         forgot redirected route and start to send redirects again.
1558  *
1559  * This algorithm is much cheaper and more intelligent than dumb load limiting
1560  * in icmp.c.
1561  *
1562  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1563  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1564  */
1565
1566 void ip_rt_send_redirect(struct sk_buff *skb)
1567 {
1568         struct rtable *rt = skb_rtable(skb);
1569         struct in_device *in_dev;
1570         struct inet_peer *peer;
1571         int log_martians;
1572
1573         rcu_read_lock();
1574         in_dev = __in_dev_get_rcu(rt->dst.dev);
1575         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1576                 rcu_read_unlock();
1577                 return;
1578         }
1579         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1580         rcu_read_unlock();
1581
1582         peer = rt_get_peer_create(rt, rt->rt_dst);
1583         if (!peer) {
1584                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1585                 return;
1586         }
1587
1588         /* No redirected packets during ip_rt_redirect_silence;
1589          * reset the algorithm.
1590          */
1591         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1592                 peer->rate_tokens = 0;
1593
1594         /* Too many ignored redirects; do not send anything
1595          * set dst.rate_last to the last seen redirected packet.
1596          */
1597         if (peer->rate_tokens >= ip_rt_redirect_number) {
1598                 peer->rate_last = jiffies;
1599                 return;
1600         }
1601
1602         /* Check for load limit; set rate_last to the latest sent
1603          * redirect.
1604          */
1605         if (peer->rate_tokens == 0 ||
1606             time_after(jiffies,
1607                        (peer->rate_last +
1608                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1609                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1610                 peer->rate_last = jiffies;
1611                 ++peer->rate_tokens;
1612 #ifdef CONFIG_IP_ROUTE_VERBOSE
1613                 if (log_martians &&
1614                     peer->rate_tokens == ip_rt_redirect_number)
1615                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
1616                                              &ip_hdr(skb)->saddr, rt->rt_iif,
1617                                              &rt->rt_dst, &rt->rt_gateway);
1618 #endif
1619         }
1620 }
1621
1622 static int ip_error(struct sk_buff *skb)
1623 {
1624         struct rtable *rt = skb_rtable(skb);
1625         struct inet_peer *peer;
1626         unsigned long now;
1627         bool send;
1628         int code;
1629
1630         switch (rt->dst.error) {
1631         case EINVAL:
1632         default:
1633                 goto out;
1634         case EHOSTUNREACH:
1635                 code = ICMP_HOST_UNREACH;
1636                 break;
1637         case ENETUNREACH:
1638                 code = ICMP_NET_UNREACH;
1639                 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1640                                 IPSTATS_MIB_INNOROUTES);
1641                 break;
1642         case EACCES:
1643                 code = ICMP_PKT_FILTERED;
1644                 break;
1645         }
1646
1647         peer = rt_get_peer_create(rt, rt->rt_dst);
1648
1649         send = true;
1650         if (peer) {
1651                 now = jiffies;
1652                 peer->rate_tokens += now - peer->rate_last;
1653                 if (peer->rate_tokens > ip_rt_error_burst)
1654                         peer->rate_tokens = ip_rt_error_burst;
1655                 peer->rate_last = now;
1656                 if (peer->rate_tokens >= ip_rt_error_cost)
1657                         peer->rate_tokens -= ip_rt_error_cost;
1658                 else
1659                         send = false;
1660         }
1661         if (send)
1662                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1663
1664 out:    kfree_skb(skb);
1665         return 0;
1666 }
1667
1668 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1669 {
1670         unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1671
1672         if (!expires)
1673                 return;
1674         if (time_before(jiffies, expires)) {
1675                 u32 orig_dst_mtu = dst_mtu(dst);
1676                 if (peer->pmtu_learned < orig_dst_mtu) {
1677                         if (!peer->pmtu_orig)
1678                                 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1679                         dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1680                 }
1681         } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1682                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1683 }
1684
1685 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1686 {
1687         struct rtable *rt = (struct rtable *) dst;
1688         struct inet_peer *peer;
1689
1690         dst_confirm(dst);
1691
1692         peer = rt_get_peer_create(rt, rt->rt_dst);
1693         if (peer) {
1694                 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1695
1696                 if (mtu < ip_rt_min_pmtu)
1697                         mtu = ip_rt_min_pmtu;
1698                 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1699
1700                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1701                         if (!pmtu_expires)
1702                                 pmtu_expires = 1UL;
1703
1704                         peer->pmtu_learned = mtu;
1705                         peer->pmtu_expires = pmtu_expires;
1706
1707                         atomic_inc(&__rt_peer_genid);
1708                         rt->rt_peer_genid = rt_peer_genid();
1709                 }
1710                 check_peer_pmtu(dst, peer);
1711         }
1712 }
1713
1714
1715 static void ipv4_validate_peer(struct rtable *rt)
1716 {
1717         if (rt->rt_peer_genid != rt_peer_genid()) {
1718                 struct inet_peer *peer = rt_get_peer(rt, rt->rt_dst);
1719
1720                 if (peer) {
1721                         check_peer_pmtu(&rt->dst, peer);
1722
1723                         if (peer->redirect_learned.a4 &&
1724                             peer->redirect_learned.a4 != rt->rt_gateway)
1725                                 check_peer_redir(&rt->dst, peer);
1726                 }
1727
1728                 rt->rt_peer_genid = rt_peer_genid();
1729         }
1730 }
1731
1732 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1733 {
1734         struct rtable *rt = (struct rtable *) dst;
1735
1736         if (rt_is_expired(rt))
1737                 return NULL;
1738         ipv4_validate_peer(rt);
1739         return dst;
1740 }
1741
1742 static void ipv4_dst_destroy(struct dst_entry *dst)
1743 {
1744         struct rtable *rt = (struct rtable *) dst;
1745
1746         if (rt->fi) {
1747                 fib_info_put(rt->fi);
1748                 rt->fi = NULL;
1749         }
1750         if (rt_has_peer(rt)) {
1751                 struct inet_peer *peer = rt_peer_ptr(rt);
1752                 inet_putpeer(peer);
1753         }
1754 }
1755
1756
1757 static void ipv4_link_failure(struct sk_buff *skb)
1758 {
1759         struct rtable *rt;
1760
1761         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1762
1763         rt = skb_rtable(skb);
1764         if (rt && rt_has_peer(rt)) {
1765                 struct inet_peer *peer = rt_peer_ptr(rt);
1766                 if (peer_pmtu_cleaned(peer))
1767                         dst_metric_set(&rt->dst, RTAX_MTU, peer->pmtu_orig);
1768         }
1769 }
1770
1771 static int ip_rt_bug(struct sk_buff *skb)
1772 {
1773         pr_debug("%s: %pI4 -> %pI4, %s\n",
1774                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1775                  skb->dev ? skb->dev->name : "?");
1776         kfree_skb(skb);
1777         WARN_ON(1);
1778         return 0;
1779 }
1780
1781 /*
1782    We do not cache source address of outgoing interface,
1783    because it is used only by IP RR, TS and SRR options,
1784    so that it out of fast path.
1785
1786    BTW remember: "addr" is allowed to be not aligned
1787    in IP options!
1788  */
1789
1790 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1791 {
1792         __be32 src;
1793
1794         if (rt_is_output_route(rt))
1795                 src = ip_hdr(skb)->saddr;
1796         else {
1797                 struct fib_result res;
1798                 struct flowi4 fl4;
1799                 struct iphdr *iph;
1800
1801                 iph = ip_hdr(skb);
1802
1803                 memset(&fl4, 0, sizeof(fl4));
1804                 fl4.daddr = iph->daddr;
1805                 fl4.saddr = iph->saddr;
1806                 fl4.flowi4_tos = RT_TOS(iph->tos);
1807                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1808                 fl4.flowi4_iif = skb->dev->ifindex;
1809                 fl4.flowi4_mark = skb->mark;
1810
1811                 rcu_read_lock();
1812                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1813                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1814                 else
1815                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1816                                         RT_SCOPE_UNIVERSE);
1817                 rcu_read_unlock();
1818         }
1819         memcpy(addr, &src, 4);
1820 }
1821
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Fill in the halves of rt->dst.tclassid that are still zero from @tag.
 * The low and high 16 bits are handled independently; a half that is
 * already non-zero is left untouched.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	const u32 low_half = 0xFFFF;
	const u32 high_half = 0xFFFF0000;

	if ((rt->dst.tclassid & low_half) == 0)
		rt->dst.tclassid |= tag & low_half;
	if ((rt->dst.tclassid & high_half) == 0)
		rt->dst.tclassid |= tag & high_half;
}
#endif
1831
1832 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1833 {
1834         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1835
1836         if (advmss == 0) {
1837                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1838                                ip_rt_min_advmss);
1839                 if (advmss > 65535 - 40)
1840                         advmss = 65535 - 40;
1841         }
1842         return advmss;
1843 }
1844
1845 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1846 {
1847         const struct rtable *rt = (const struct rtable *) dst;
1848         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1849
1850         if (mtu && rt_is_output_route(rt))
1851                 return mtu;
1852
1853         mtu = dst->dev->mtu;
1854
1855         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1856
1857                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1858                         mtu = 576;
1859         }
1860
1861         if (mtu > IP_MAX_MTU)
1862                 mtu = IP_MAX_MTU;
1863
1864         return mtu;
1865 }
1866
1867 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1868                             struct fib_info *fi)
1869 {
1870         struct inet_peer_base *base;
1871         struct inet_peer *peer;
1872         int create = 0;
1873
1874         /* If a peer entry exists for this destination, we must hook
1875          * it up in order to get at cached metrics.
1876          */
1877         if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1878                 create = 1;
1879
1880         base = inetpeer_base_ptr(rt->_peer);
1881         BUG_ON(!base);
1882
1883         peer = inet_getpeer_v4(base, rt->rt_dst, create);
1884         if (peer) {
1885                 __rt_set_peer(rt, peer);
1886                 rt->rt_peer_genid = rt_peer_genid();
1887                 if (inet_metrics_new(peer))
1888                         memcpy(peer->metrics, fi->fib_metrics,
1889                                sizeof(u32) * RTAX_MAX);
1890                 dst_init_metrics(&rt->dst, peer->metrics, false);
1891
1892                 check_peer_pmtu(&rt->dst, peer);
1893
1894                 if (peer->redirect_learned.a4 &&
1895                     peer->redirect_learned.a4 != rt->rt_gateway) {
1896                         rt->rt_gateway = peer->redirect_learned.a4;
1897                         rt->rt_flags |= RTCF_REDIRECTED;
1898                 }
1899         } else {
1900                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1901                         rt->fi = fi;
1902                         atomic_inc(&fi->fib_clntref);
1903                 }
1904                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1905         }
1906 }
1907
1908 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1909                            const struct fib_result *res,
1910                            struct fib_info *fi, u16 type, u32 itag)
1911 {
1912         struct dst_entry *dst = &rt->dst;
1913
1914         if (fi) {
1915                 if (FIB_RES_GW(*res) &&
1916                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1917                         rt->rt_gateway = FIB_RES_GW(*res);
1918                 rt_init_metrics(rt, fl4, fi);
1919 #ifdef CONFIG_IP_ROUTE_CLASSID
1920                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1921 #endif
1922         }
1923
1924         if (dst_mtu(dst) > IP_MAX_MTU)
1925                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1926         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1927                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1928
1929 #ifdef CONFIG_IP_ROUTE_CLASSID
1930 #ifdef CONFIG_IP_MULTIPLE_TABLES
1931         set_class_tag(rt, fib_rules_tclass(res));
1932 #endif
1933         set_class_tag(rt, itag);
1934 #endif
1935 }
1936
1937 static struct rtable *rt_dst_alloc(struct net_device *dev,
1938                                    bool nopolicy, bool noxfrm)
1939 {
1940         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1941                          DST_HOST |
1942                          (nopolicy ? DST_NOPOLICY : 0) |
1943                          (noxfrm ? DST_NOXFRM : 0));
1944 }
1945
1946 /* called in rcu_read_lock() section */
1947 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1948                                 u8 tos, struct net_device *dev, int our)
1949 {
1950         unsigned int hash;
1951         struct rtable *rth;
1952         __be32 spec_dst;
1953         struct in_device *in_dev = __in_dev_get_rcu(dev);
1954         u32 itag = 0;
1955         int err;
1956
1957         /* Primary sanity checks. */
1958
1959         if (in_dev == NULL)
1960                 return -EINVAL;
1961
1962         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1963             skb->protocol != htons(ETH_P_IP))
1964                 goto e_inval;
1965
1966         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1967                 if (ipv4_is_loopback(saddr))
1968                         goto e_inval;
1969
1970         if (ipv4_is_zeronet(saddr)) {
1971                 if (!ipv4_is_local_multicast(daddr))
1972                         goto e_inval;
1973                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1974         } else {
1975                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1976                                           &itag);
1977                 if (err < 0)
1978                         goto e_err;
1979         }
1980         rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1981                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1982         if (!rth)
1983                 goto e_nobufs;
1984
1985 #ifdef CONFIG_IP_ROUTE_CLASSID
1986         rth->dst.tclassid = itag;
1987 #endif
1988         rth->dst.output = ip_rt_bug;
1989
1990         rth->rt_key_dst = daddr;
1991         rth->rt_key_src = saddr;
1992         rth->rt_genid   = rt_genid(dev_net(dev));
1993         rth->rt_flags   = RTCF_MULTICAST;
1994         rth->rt_type    = RTN_MULTICAST;
1995         rth->rt_key_tos = tos;
1996         rth->rt_dst     = daddr;
1997         rth->rt_src     = saddr;
1998         rth->rt_route_iif = dev->ifindex;
1999         rth->rt_iif     = dev->ifindex;
2000         rth->rt_oif     = 0;
2001         rth->rt_mark    = skb->mark;
2002         rth->rt_gateway = daddr;
2003         rth->rt_spec_dst= spec_dst;
2004         rth->rt_peer_genid = 0;
2005         rt_init_peer(rth, dev_net(dev)->ipv4.peers);
2006         rth->fi = NULL;
2007         if (our) {
2008                 rth->dst.input= ip_local_deliver;
2009                 rth->rt_flags |= RTCF_LOCAL;
2010         }
2011
2012 #ifdef CONFIG_IP_MROUTE
2013         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2014                 rth->dst.input = ip_mr_input;
2015 #endif
2016         RT_CACHE_STAT_INC(in_slow_mc);
2017
2018         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2019         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2020         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2021
2022 e_nobufs:
2023         return -ENOBUFS;
2024 e_inval:
2025         return -EINVAL;
2026 e_err:
2027         return err;
2028 }
2029
2030
2031 static void ip_handle_martian_source(struct net_device *dev,
2032                                      struct in_device *in_dev,
2033                                      struct sk_buff *skb,
2034                                      __be32 daddr,
2035                                      __be32 saddr)
2036 {
2037         RT_CACHE_STAT_INC(in_martian_src);
2038 #ifdef CONFIG_IP_ROUTE_VERBOSE
2039         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2040                 /*
2041                  *      RFC1812 recommendation, if source is martian,
2042                  *      the only hint is MAC header.
2043                  */
2044                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
2045                         &daddr, &saddr, dev->name);
2046                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2047                         print_hex_dump(KERN_WARNING, "ll header: ",
2048                                        DUMP_PREFIX_OFFSET, 16, 1,
2049                                        skb_mac_header(skb),
2050                                        dev->hard_header_len, true);
2051                 }
2052         }
2053 #endif
2054 }
2055
2056 /* called in rcu_read_lock() section */
2057 static int __mkroute_input(struct sk_buff *skb,
2058                            const struct fib_result *res,
2059                            struct in_device *in_dev,
2060                            __be32 daddr, __be32 saddr, u32 tos,
2061                            struct rtable **result)
2062 {
2063         struct rtable *rth;
2064         int err;
2065         struct in_device *out_dev;
2066         unsigned int flags = 0;
2067         __be32 spec_dst;
2068         u32 itag;
2069
2070         /* get a working reference to the output device */
2071         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2072         if (out_dev == NULL) {
2073                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
2074                 return -EINVAL;
2075         }
2076
2077
2078         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2079                                   in_dev->dev, &spec_dst, &itag);
2080         if (err < 0) {
2081                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2082                                          saddr);
2083
2084                 goto cleanup;
2085         }
2086
2087         if (err)
2088                 flags |= RTCF_DIRECTSRC;
2089
2090         if (out_dev == in_dev && err &&
2091             (IN_DEV_SHARED_MEDIA(out_dev) ||
2092              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2093                 flags |= RTCF_DOREDIRECT;
2094
2095         if (skb->protocol != htons(ETH_P_IP)) {
2096                 /* Not IP (i.e. ARP). Do not create route, if it is
2097                  * invalid for proxy arp. DNAT routes are always valid.
2098                  *
2099                  * Proxy arp feature have been extended to allow, ARP
2100                  * replies back to the same interface, to support
2101                  * Private VLAN switch technologies. See arp.c.
2102                  */
2103                 if (out_dev == in_dev &&
2104                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2105                         err = -EINVAL;
2106                         goto cleanup;
2107                 }
2108         }
2109
2110         rth = rt_dst_alloc(out_dev->dev,
2111                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2112                            IN_DEV_CONF_GET(out_dev, NOXFRM));
2113         if (!rth) {
2114                 err = -ENOBUFS;
2115                 goto cleanup;
2116         }
2117
2118         rth->rt_key_dst = daddr;
2119         rth->rt_key_src = saddr;
2120         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2121         rth->rt_flags = flags;
2122         rth->rt_type = res->type;
2123         rth->rt_key_tos = tos;
2124         rth->rt_dst     = daddr;
2125         rth->rt_src     = saddr;
2126         rth->rt_route_iif = in_dev->dev->ifindex;
2127         rth->rt_iif     = in_dev->dev->ifindex;
2128         rth->rt_oif     = 0;
2129         rth->rt_mark    = skb->mark;
2130         rth->rt_gateway = daddr;
2131         rth->rt_spec_dst= spec_dst;
2132         rth->rt_peer_genid = 0;
2133         rt_init_peer(rth, &res->table->tb_peers);
2134         rth->fi = NULL;
2135
2136         rth->dst.input = ip_forward;
2137         rth->dst.output = ip_output;
2138
2139         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2140
2141         *result = rth;
2142         err = 0;
2143  cleanup:
2144         return err;
2145 }
2146
2147 static int ip_mkroute_input(struct sk_buff *skb,
2148                             struct fib_result *res,
2149                             const struct flowi4 *fl4,
2150                             struct in_device *in_dev,
2151                             __be32 daddr, __be32 saddr, u32 tos)
2152 {
2153         struct rtable *rth = NULL;
2154         int err;
2155         unsigned int hash;
2156
2157 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2158         if (res->fi && res->fi->fib_nhs > 1)
2159                 fib_select_multipath(res);
2160 #endif
2161
2162         /* create a routing cache entry */
2163         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2164         if (err)
2165                 return err;
2166
2167         /* put it into the cache */
2168         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2169                        rt_genid(dev_net(rth->dst.dev)));
2170         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2171         if (IS_ERR(rth))
2172                 return PTR_ERR(rth);
2173         return 0;
2174 }
2175
2176 /*
2177  *      NOTE. We drop all the packets that has local source
2178  *      addresses, because every properly looped back packet
2179  *      must have correct destination already attached by output routine.
2180  *
2181  *      Such approach solves two big problems:
2182  *      1. Not simplex devices are handled properly.
2183  *      2. IP spoofing attempts are filtered with 100% of guarantee.
2184  *      called with rcu_read_lock()
2185  */
2186
2187 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2188                                u8 tos, struct net_device *dev)
2189 {
2190         struct fib_result res;
2191         struct in_device *in_dev = __in_dev_get_rcu(dev);
2192         struct flowi4   fl4;
2193         unsigned int    flags = 0;
2194         u32             itag = 0;
2195         struct rtable   *rth;
2196         unsigned int    hash;
2197         __be32          spec_dst;
2198         int             err = -EINVAL;
2199         struct net    *net = dev_net(dev);
2200
2201         /* IP on this device is disabled. */
2202
2203         if (!in_dev)
2204                 goto out;
2205
2206         /* Check for the most weird martians, which can be not detected
2207            by fib_lookup.
2208          */
2209
2210         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2211                 goto martian_source;
2212
2213         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2214                 goto brd_input;
2215
2216         /* Accept zero addresses only to limited broadcast;
2217          * I even do not know to fix it or not. Waiting for complains :-)
2218          */
2219         if (ipv4_is_zeronet(saddr))
2220                 goto martian_source;
2221
2222         if (ipv4_is_zeronet(daddr))
2223                 goto martian_destination;
2224
2225         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
2226                 if (ipv4_is_loopback(daddr))
2227                         goto martian_destination;
2228
2229                 if (ipv4_is_loopback(saddr))
2230                         goto martian_source;
2231         }
2232
2233         /*
2234          *      Now we are ready to route packet.
2235          */
2236         fl4.flowi4_oif = 0;
2237         fl4.flowi4_iif = dev->ifindex;
2238         fl4.flowi4_mark = skb->mark;
2239         fl4.flowi4_tos = tos;
2240         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2241         fl4.daddr = daddr;
2242         fl4.saddr = saddr;
2243         err = fib_lookup(net, &fl4, &res);
2244         if (err != 0) {
2245                 if (!IN_DEV_FORWARD(in_dev))
2246                         goto e_hostunreach;
2247                 goto no_route;
2248         }
2249
2250         RT_CACHE_STAT_INC(in_slow_tot);
2251
2252         if (res.type == RTN_BROADCAST)
2253                 goto brd_input;
2254
2255         if (res.type == RTN_LOCAL) {
2256                 err = fib_validate_source(skb, saddr, daddr, tos,
2257                                           net->loopback_dev->ifindex,
2258                                           dev, &spec_dst, &itag);
2259                 if (err < 0)
2260                         goto martian_source_keep_err;
2261                 if (err)
2262                         flags |= RTCF_DIRECTSRC;
2263                 spec_dst = daddr;
2264                 goto local_input;
2265         }
2266
2267         if (!IN_DEV_FORWARD(in_dev))
2268                 goto e_hostunreach;
2269         if (res.type != RTN_UNICAST)
2270                 goto martian_destination;
2271
2272         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2273 out:    return err;
2274
2275 brd_input:
2276         if (skb->protocol != htons(ETH_P_IP))
2277                 goto e_inval;
2278
2279         if (ipv4_is_zeronet(saddr))
2280                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2281         else {
2282                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2283                                           &itag);
2284                 if (err < 0)
2285                         goto martian_source_keep_err;
2286                 if (err)
2287                         flags |= RTCF_DIRECTSRC;
2288         }
2289         flags |= RTCF_BROADCAST;
2290         res.type = RTN_BROADCAST;
2291         RT_CACHE_STAT_INC(in_brd);
2292
2293 local_input:
2294         rth = rt_dst_alloc(net->loopback_dev,
2295                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2296         if (!rth)
2297                 goto e_nobufs;
2298
2299         rth->dst.input= ip_local_deliver;
2300         rth->dst.output= ip_rt_bug;
2301 #ifdef CONFIG_IP_ROUTE_CLASSID
2302         rth->dst.tclassid = itag;
2303 #endif
2304
2305         rth->rt_key_dst = daddr;
2306         rth->rt_key_src = saddr;
2307         rth->rt_genid = rt_genid(net);
2308         rth->rt_flags   = flags|RTCF_LOCAL;
2309         rth->rt_type    = res.type;
2310         rth->rt_key_tos = tos;
2311         rth->rt_dst     = daddr;
2312         rth->rt_src     = saddr;
2313 #ifdef CONFIG_IP_ROUTE_CLASSID
2314         rth->dst.tclassid = itag;
2315 #endif
2316         rth->rt_route_iif = dev->ifindex;
2317         rth->rt_iif     = dev->ifindex;
2318         rth->rt_oif     = 0;
2319         rth->rt_mark    = skb->mark;
2320         rth->rt_gateway = daddr;
2321         rth->rt_spec_dst= spec_dst;
2322         rth->rt_peer_genid = 0;
2323         rt_init_peer(rth, net->ipv4.peers);
2324         rth->fi = NULL;
2325         if (res.type == RTN_UNREACHABLE) {
2326                 rth->dst.input= ip_error;
2327                 rth->dst.error= -err;
2328                 rth->rt_flags   &= ~RTCF_LOCAL;
2329         }
2330         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2331         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2332         err = 0;
2333         if (IS_ERR(rth))
2334                 err = PTR_ERR(rth);
2335         goto out;
2336
2337 no_route:
2338         RT_CACHE_STAT_INC(in_no_route);
2339         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2340         res.type = RTN_UNREACHABLE;
2341         if (err == -ESRCH)
2342                 err = -ENETUNREACH;
2343         goto local_input;
2344
2345         /*
2346          *      Do not cache martian addresses: they should be logged (RFC1812)
2347          */
2348 martian_destination:
2349         RT_CACHE_STAT_INC(in_martian_dst);
2350 #ifdef CONFIG_IP_ROUTE_VERBOSE
2351         if (IN_DEV_LOG_MARTIANS(in_dev))
2352                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2353                                      &daddr, &saddr, dev->name);
2354 #endif
2355
2356 e_hostunreach:
2357         err = -EHOSTUNREACH;
2358         goto out;
2359
2360 e_inval:
2361         err = -EINVAL;
2362         goto out;
2363
2364 e_nobufs:
2365         err = -ENOBUFS;
2366         goto out;
2367
2368 martian_source:
2369         err = -EINVAL;
2370 martian_source_keep_err:
2371         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2372         goto out;
2373 }
2374
2375 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2376                            u8 tos, struct net_device *dev, bool noref)
2377 {
2378         struct rtable   *rth;
2379         unsigned int    hash;
2380         int iif = dev->ifindex;
2381         struct net *net;
2382         int res;
2383
2384         net = dev_net(dev);
2385
2386         rcu_read_lock();
2387
2388         if (!rt_caching(net))
2389                 goto skip_cache;
2390
2391         tos &= IPTOS_RT_MASK;
2392         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2393
2394         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2395              rth = rcu_dereference(rth->dst.rt_next)) {
2396                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2397                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2398                      (rth->rt_route_iif ^ iif) |
2399                      (rth->rt_key_tos ^ tos)) == 0 &&
2400                     rth->rt_mark == skb->mark &&
2401                     net_eq(dev_net(rth->dst.dev), net) &&
2402                     !rt_is_expired(rth)) {
2403                         ipv4_validate_peer(rth);
2404                         if (noref) {
2405                                 dst_use_noref(&rth->dst, jiffies);
2406                                 skb_dst_set_noref(skb, &rth->dst);
2407                         } else {
2408                                 dst_use(&rth->dst, jiffies);
2409                                 skb_dst_set(skb, &rth->dst);
2410                         }
2411                         RT_CACHE_STAT_INC(in_hit);
2412                         rcu_read_unlock();
2413                         return 0;
2414                 }
2415                 RT_CACHE_STAT_INC(in_hlist_search);
2416         }
2417
2418 skip_cache:
2419         /* Multicast recognition logic is moved from route cache to here.
2420            The problem was that too many Ethernet cards have broken/missing
2421            hardware multicast filters :-( As result the host on multicasting
2422            network acquires a lot of useless route cache entries, sort of
2423            SDR messages from all the world. Now we try to get rid of them.
2424            Really, provided software IP multicast filter is organized
2425            reasonably (at least, hashed), it does not result in a slowdown
2426            comparing with route cache reject entries.
2427            Note, that multicast routers are not affected, because
2428            route cache entry is created eventually.
2429          */
2430         if (ipv4_is_multicast(daddr)) {
2431                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2432
2433                 if (in_dev) {
2434                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2435                                                   ip_hdr(skb)->protocol);
2436                         if (our
2437 #ifdef CONFIG_IP_MROUTE
2438                                 ||
2439                             (!ipv4_is_local_multicast(daddr) &&
2440                              IN_DEV_MFORWARD(in_dev))
2441 #endif
2442                            ) {
2443                                 int res = ip_route_input_mc(skb, daddr, saddr,
2444                                                             tos, dev, our);
2445                                 rcu_read_unlock();
2446                                 return res;
2447                         }
2448                 }
2449                 rcu_read_unlock();
2450                 return -EINVAL;
2451         }
2452         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2453         rcu_read_unlock();
2454         return res;
2455 }
2456 EXPORT_SYMBOL(ip_route_input_common);
2457
2458 /*
      * Build an output route cache entry for flow @fl4 leaving via @dev_out.
      * The entry is only allocated and filled in here; it is returned to the
      * caller without being inserted into the route hash.
      *
      * Called with rcu_read_lock() held.
      *
      * @res:        FIB lookup result; supplies the fib_info, route type,
      *              prefix length and table used below.
      * @fl4:        resolved flow; its daddr/saddr become rt_dst/rt_src.
      * @orig_daddr, @orig_saddr, @orig_oif, @orig_rtos:
      *              values stored in the entry's key fields (rt_key_dst,
      *              rt_key_src, rt_oif, rt_key_tos).
      * @dev_out:    output device.
      * @flags:      initial RTCF_* flags; further flags are OR-ed in here.
      *
      * Returns the new rtable, or ERR_PTR(-EINVAL) when @dev_out has no
      * in_device, when the source is a loopback address on a non-loopback
      * device (unless IN_DEV_ROUTE_LOCALNET() is true for the device), or when
      * the destination is in the zero network; ERR_PTR(-ENOBUFS) when
      * rt_dst_alloc() fails.
      */
2459 static struct rtable *__mkroute_output(const struct fib_result *res,
2460                                        const struct flowi4 *fl4,
2461                                        __be32 orig_daddr, __be32 orig_saddr,
2462                                        int orig_oif, __u8 orig_rtos,
2463                                        struct net_device *dev_out,
2464                                        unsigned int flags)
2465 {
2466         struct fib_info *fi = res->fi;
2467         struct in_device *in_dev;
2468         u16 type = res->type;
2469         struct rtable *rth;
2470
2471         in_dev = __in_dev_get_rcu(dev_out);
2472         if (!in_dev)
2473                 return ERR_PTR(-EINVAL);
2474
2475         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2476                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2477                         return ERR_PTR(-EINVAL);
2478
             /* The destination address class overrides the FIB result type. */
2479         if (ipv4_is_lbcast(fl4->daddr))
2480                 type = RTN_BROADCAST;
2481         else if (ipv4_is_multicast(fl4->daddr))
2482                 type = RTN_MULTICAST;
2483         else if (ipv4_is_zeronet(fl4->daddr))
2484                 return ERR_PTR(-EINVAL);
2485
2486         if (dev_out->flags & IFF_LOOPBACK)
2487                 flags |= RTCF_LOCAL;
2488
2489         if (type == RTN_BROADCAST) {
2490                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2491                 fi = NULL;
2492         } else if (type == RTN_MULTICAST) {
2493                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
                     /* RTCF_LOCAL is kept only if ip_check_mc_rcu() accepts
                      * this group/source/protocol on the output device.
                      */
2494                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2495                                      fl4->flowi4_proto))
2496                         flags &= ~RTCF_LOCAL;
2497                 /* If multicast route do not exist use
2498                  * default one, but do not gateway in this case.
2499                  * Yes, it is hack.
2500                  */
2501                 if (fi && res->prefixlen < 4)
2502                         fi = NULL;
2503         }
2504
2505         rth = rt_dst_alloc(dev_out,
2506                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2507                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2508         if (!rth)
2509                 return ERR_PTR(-ENOBUFS);
2510
2511         rth->dst.output = ip_output;
2512
             /* Key fields come from the caller's original request, the
              * rt_dst/rt_src fields from the resolved flow.
              */
2513         rth->rt_key_dst = orig_daddr;
2514         rth->rt_key_src = orig_saddr;
2515         rth->rt_genid = rt_genid(dev_net(dev_out));
2516         rth->rt_flags   = flags;
2517         rth->rt_type    = type;
2518         rth->rt_key_tos = orig_rtos;
2519         rth->rt_dst     = fl4->daddr;
2520         rth->rt_src     = fl4->saddr;
2521         rth->rt_route_iif = 0;
2522         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2523         rth->rt_oif     = orig_oif;
2524         rth->rt_mark    = fl4->flowi4_mark;
2525         rth->rt_gateway = fl4->daddr;
2526         rth->rt_spec_dst= fl4->saddr;
2527         rth->rt_peer_genid = 0;
2528         rt_init_peer(rth, (res->table ?
2529                            &res->table->tb_peers :
2530                            dev_net(dev_out)->ipv4.peers));
2531         rth->fi = NULL;
2532
2533         RT_CACHE_STAT_INC(out_slow_tot);
2534
2535         if (flags & RTCF_LOCAL) {
2536                 rth->dst.input = ip_local_deliver;
2537                 rth->rt_spec_dst = fl4->daddr;
2538         }
2539         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
                     /* For broadcast/multicast rt_spec_dst goes back to the
                      * source address, overriding the RTCF_LOCAL case above.
                      */
2540                 rth->rt_spec_dst = fl4->saddr;
2541                 if (flags & RTCF_LOCAL &&
2542                     !(dev_out->flags & IFF_LOOPBACK)) {
2543                         rth->dst.output = ip_mc_output;
2544                         RT_CACHE_STAT_INC(out_slow_mc);
2545                 }
2546 #ifdef CONFIG_IP_MROUTE
2547                 if (type == RTN_MULTICAST) {
2548                         if (IN_DEV_MFORWARD(in_dev) &&
2549                             !ipv4_is_local_multicast(fl4->daddr)) {
2550                                 rth->dst.input = ip_mr_input;
2551                                 rth->dst.output = ip_mc_output;
2552                         }
2553                 }
2554 #endif
2555         }
2556
             /* Note: the local 'fi' (possibly cleared above) is passed, not
              * res->fi.
              */
2557         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2558
2559         return rth;
2560 }
2561
2562 /*
2563  * Major route resolver routine.
2564  * called with rcu_read_lock();
2565  */
2566
2567 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2568 {
2569         struct net_device *dev_out = NULL;
2570         __u8 tos = RT_FL_TOS(fl4);
2571         unsigned int flags = 0;
2572         struct fib_result res;
2573         struct rtable *rth;
2574         __be32 orig_daddr;
2575         __be32 orig_saddr;
2576         int orig_oif;
2577
2578         res.fi          = NULL;
2579         res.table       = NULL;
2580 #ifdef CONFIG_IP_MULTIPLE_TABLES
2581         res.r           = NULL;
2582 #endif
2583
2584         orig_daddr = fl4->daddr;
2585         orig_saddr = fl4->saddr;
2586         orig_oif = fl4->flowi4_oif;
2587
2588         fl4->flowi4_iif = net->loopback_dev->ifindex;
2589         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2590         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2591                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2592
2593         rcu_read_lock();
2594         if (fl4->saddr) {
2595                 rth = ERR_PTR(-EINVAL);
2596                 if (ipv4_is_multicast(fl4->saddr) ||
2597                     ipv4_is_lbcast(fl4->saddr) ||
2598                     ipv4_is_zeronet(fl4->saddr))
2599                         goto out;
2600
2601                 /* I removed check for oif == dev_out->oif here.
2602                    It was wrong for two reasons:
2603                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2604                       is assigned to multiple interfaces.
2605                    2. Moreover, we are allowed to send packets with saddr
2606                       of another iface. --ANK
2607                  */
2608
2609                 if (fl4->flowi4_oif == 0 &&
2610                     (ipv4_is_multicast(fl4->daddr) ||
2611                      ipv4_is_lbcast(fl4->daddr))) {
2612                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2613                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2614                         if (dev_out == NULL)
2615                                 goto out;
2616
2617                         /* Special hack: user can direct multicasts
2618                            and limited broadcast via necessary interface
2619                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2620                            This hack is not just for fun, it allows
2621                            vic,vat and friends to work.
2622                            They bind socket to loopback, set ttl to zero
2623                            and expect that it will work.
2624                            From the viewpoint of routing cache they are broken,
2625                            because we are not allowed to build multicast path
2626                            with loopback source addr (look, routing cache
2627                            cannot know, that ttl is zero, so that packet
2628                            will not leave this host and route is valid).
2629                            Luckily, this hack is good workaround.
2630                          */
2631
2632                         fl4->flowi4_oif = dev_out->ifindex;
2633                         goto make_route;
2634                 }
2635
2636                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2637                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2638                         if (!__ip_dev_find(net, fl4->saddr, false))
2639                                 goto out;
2640                 }
2641         }
2642
2643
2644         if (fl4->flowi4_oif) {
2645                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2646                 rth = ERR_PTR(-ENODEV);
2647                 if (dev_out == NULL)
2648                         goto out;
2649
2650                 /* RACE: Check return value of inet_select_addr instead. */
2651                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2652                         rth = ERR_PTR(-ENETUNREACH);
2653                         goto out;
2654                 }
2655                 if (ipv4_is_local_multicast(fl4->daddr) ||
2656                     ipv4_is_lbcast(fl4->daddr)) {
2657                         if (!fl4->saddr)
2658                                 fl4->saddr = inet_select_addr(dev_out, 0,
2659                                                               RT_SCOPE_LINK);
2660                         goto make_route;
2661                 }
2662                 if (fl4->saddr) {
2663                         if (ipv4_is_multicast(fl4->daddr))
2664                                 fl4->saddr = inet_select_addr(dev_out, 0,
2665                                                               fl4->flowi4_scope);
2666                         else if (!fl4->daddr)
2667                                 fl4->saddr = inet_select_addr(dev_out, 0,
2668                                                               RT_SCOPE_HOST);
2669                 }
2670         }
2671
2672         if (!fl4->daddr) {
2673                 fl4->daddr = fl4->saddr;
2674                 if (!fl4->daddr)
2675                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2676                 dev_out = net->loopback_dev;
2677                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2678                 res.type = RTN_LOCAL;
2679                 flags |= RTCF_LOCAL;
2680                 goto make_route;
2681         }
2682
2683         if (fib_lookup(net, fl4, &res)) {
2684                 res.fi = NULL;
2685                 res.table = NULL;
2686                 if (fl4->flowi4_oif) {
2687                         /* Apparently, routing tables are wrong. Assume,
2688                            that the destination is on link.
2689
2690                            WHY? DW.
2691                            Because we are allowed to send to iface
2692                            even if it has NO routes and NO assigned
2693                            addresses. When oif is specified, routing
2694                            tables are looked up with only one purpose:
2695                            to catch if destination is gatewayed, rather than
2696                            direct. Moreover, if MSG_DONTROUTE is set,
2697                            we send packet, ignoring both routing tables
2698                            and ifaddr state. --ANK
2699
2700
2701                            We could make it even if oif is unknown,
2702                            likely IPv6, but we do not.
2703                          */
2704
2705                         if (fl4->saddr == 0)
2706                                 fl4->saddr = inet_select_addr(dev_out, 0,
2707                                                               RT_SCOPE_LINK);
2708                         res.type = RTN_UNICAST;
2709                         goto make_route;
2710                 }
2711                 rth = ERR_PTR(-ENETUNREACH);
2712                 goto out;
2713         }
2714
2715         if (res.type == RTN_LOCAL) {
2716                 if (!fl4->saddr) {
2717                         if (res.fi->fib_prefsrc)
2718                                 fl4->saddr = res.fi->fib_prefsrc;
2719                         else
2720                                 fl4->saddr = fl4->daddr;
2721                 }
2722                 dev_out = net->loopback_dev;
2723                 fl4->flowi4_oif = dev_out->ifindex;
2724                 res.fi = NULL;
2725                 flags |= RTCF_LOCAL;
2726                 goto make_route;
2727         }
2728
2729 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2730         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2731                 fib_select_multipath(&res);
2732         else
2733 #endif
2734         if (!res.prefixlen &&
2735             res.table->tb_num_default > 1 &&
2736             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2737                 fib_select_default(&res);
2738
2739         if (!fl4->saddr)
2740                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2741
2742         dev_out = FIB_RES_DEV(res);
2743         fl4->flowi4_oif = dev_out->ifindex;
2744
2745
2746 make_route:
2747         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2748                                tos, dev_out, flags);
2749         if (!IS_ERR(rth)) {
2750                 unsigned int hash;
2751
2752                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2753                                rt_genid(dev_net(dev_out)));
2754                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2755         }
2756
2757 out:
2758         rcu_read_unlock();
2759         return rth;
2760 }
2761
     /*
      * Output route lookup for the flow @flp4 in namespace @net.
      *
      * When rt_caching(net) is true the route cache hash chain for the
      * (daddr, saddr, oif, genid) key is searched first; a matching,
      * non-expired output entry is returned directly and any unset
      * saddr/daddr in @flp4 is filled in from it.  Otherwise, or on a cache
      * miss, the request is handed to ip_route_output_slow().
      *
      * Returns the route or the ERR_PTR() produced by the slow path.
      */
2762 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2763 {
2764         struct rtable *rth;
2765         unsigned int hash;
2766
2767         if (!rt_caching(net))
2768                 goto slow_output;
2769
2770         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2771
2772         rcu_read_lock_bh();
2773         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2774                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
                     /* Match on the original key fields, mark, the TOS bits
                      * that take part in routing, namespace and freshness.
                      */
2775                 if (rth->rt_key_dst == flp4->daddr &&
2776                     rth->rt_key_src == flp4->saddr &&
2777                     rt_is_output_route(rth) &&
2778                     rth->rt_oif == flp4->flowi4_oif &&
2779                     rth->rt_mark == flp4->flowi4_mark &&
2780                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2781                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2782                     net_eq(dev_net(rth->dst.dev), net) &&
2783                     !rt_is_expired(rth)) {
2784                         ipv4_validate_peer(rth);
2785                         dst_use(&rth->dst, jiffies);
2786                         RT_CACHE_STAT_INC(out_hit);
2787                         rcu_read_unlock_bh();
2788                         if (!flp4->saddr)
2789                                 flp4->saddr = rth->rt_src;
2790                         if (!flp4->daddr)
2791                                 flp4->daddr = rth->rt_dst;
2792                         return rth;
2793                 }
2794                 RT_CACHE_STAT_INC(out_hlist_search);
2795         }
2796         rcu_read_unlock_bh();
2797
2798 slow_output:
2799         return ip_route_output_slow(net, flp4);
2800 }
2801 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2802
     /*
      * .check callback of ipv4_dst_blackhole_ops: ignores both arguments and
      * always returns NULL.
      * NOTE(review): presumably this makes callers treat a blackhole entry as
      * never valid for reuse -- confirm against the dst_ops.check contract.
      */
2803 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2804 {
2805         return NULL;
2806 }
2807
     /*
      * .mtu callback of ipv4_dst_blackhole_ops: returns the raw RTAX_MTU
      * metric of @dst when it is non-zero, otherwise the MTU of the device
      * the entry is attached to.
      */
2808 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2809 {
2810         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2811
             /* GNU "?:" shorthand: use mtu if non-zero, else the device MTU. */
2812         return mtu ? : dst->dev->mtu;
2813 }
2814
     /*
      * .update_pmtu callback of ipv4_dst_blackhole_ops: intentionally empty,
      * so the requested @mtu is ignored for blackhole entries.
      */
2815 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2816 {
2817 }
2818
     /*
      * .cow_metrics callback of ipv4_dst_blackhole_ops: ignores its arguments
      * and always returns NULL.
      * NOTE(review): presumably NULL means "no writable metrics copy" to the
      * dst layer -- confirm against the dst_ops.cow_metrics contract.
      */
2819 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2820                                           unsigned long old)
2821 {
2822         return NULL;
2823 }
2824
     /*
      * dst_ops used for the entries created by ipv4_blackhole_route().
      * check, mtu, update_pmtu and cow_metrics point at the blackhole stubs
      * defined above; destroy, default_advmss and neigh_lookup reuse the
      * ipv4_* handlers defined elsewhere in this file.
      */
2825 static struct dst_ops ipv4_dst_blackhole_ops = {
2826         .family                 =       AF_INET,
2827         .protocol               =       cpu_to_be16(ETH_P_IP),
2828         .destroy                =       ipv4_dst_destroy,
2829         .check                  =       ipv4_blackhole_dst_check,
2830         .mtu                    =       ipv4_blackhole_mtu,
2831         .default_advmss         =       ipv4_default_advmss,
2832         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2833         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2834         .neigh_lookup           =       ipv4_neigh_lookup,
2835 };
2836
     /*
      * Create a "blackhole" copy of the route @dst_orig: a new entry using
      * ipv4_dst_blackhole_ops whose input and output handlers are both
      * dst_discard, with metrics, device, key fields, addresses, peer and
      * fib_info taken over from the original.
      *
      * The reference on @dst_orig is released in all cases, including
      * allocation failure.
      *
      * Returns the new dst_entry, or ERR_PTR(-ENOMEM) if dst_alloc() fails.
      */
2837 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2838 {
2839         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2840         struct rtable *ort = (struct rtable *) dst_orig;
2841
2842         if (rt) {
2843                 struct dst_entry *new = &rt->dst;
2844
2845                 new->__use = 1;
2846                 new->input = dst_discard;
2847                 new->output = dst_discard;
2848                 dst_copy_metrics(new, &ort->dst);
2849
                     /* The copy holds its own reference on the device. */
2850                 new->dev = ort->dst.dev;
2851                 if (new->dev)
2852                         dev_hold(new->dev);
2853
2854                 rt->rt_key_dst = ort->rt_key_dst;
2855                 rt->rt_key_src = ort->rt_key_src;
2856                 rt->rt_key_tos = ort->rt_key_tos;
2857                 rt->rt_route_iif = ort->rt_route_iif;
2858                 rt->rt_iif = ort->rt_iif;
2859                 rt->rt_oif = ort->rt_oif;
2860                 rt->rt_mark = ort->rt_mark;
2861
                     /* The generation id is taken from @net, not copied. */
2862                 rt->rt_genid = rt_genid(net);
2863                 rt->rt_flags = ort->rt_flags;
2864                 rt->rt_type = ort->rt_type;
2865                 rt->rt_dst = ort->rt_dst;
2866                 rt->rt_src = ort->rt_src;
2867                 rt->rt_gateway = ort->rt_gateway;
2868                 rt->rt_spec_dst = ort->rt_spec_dst;
2869                 rt_transfer_peer(rt, ort);
2870                 rt->fi = ort->fi;
2871                 if (rt->fi)
2872                         atomic_inc(&rt->fi->fib_clntref);
2873
                     /* NOTE(review): dst_free() is called on the entry that is
                      * returned below; presumably it only schedules the entry
                      * for release once its reference count drops -- confirm
                      * against the dst_free() implementation.
                      */
2874                 dst_free(new);
2875         }
2876
2877         dst_release(dst_orig);
2878
2879         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2880 }
2881
     /*
      * Output route lookup followed by an optional xfrm lookup.
      *
      * Resolves @flp4 with __ip_route_output_key(); if that succeeds and the
      * flow carries a non-zero flowi4_proto, the result is passed through
      * xfrm_lookup() together with @sk and whatever that returns is handed
      * back (cast to struct rtable *).
      *
      * Returns the route, or the error pointer from either lookup.
      */
2882 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2883                                     struct sock *sk)
2884 {
2885         struct rtable *rt = __ip_route_output_key(net, flp4);
2886
2887         if (IS_ERR(rt))
2888                 return rt;
2889
2890         if (flp4->flowi4_proto)
2891                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2892                                                    flowi4_to_flowi(flp4),
2893                                                    sk, 0);
2894
2895         return rt;
2896 }
2897 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2898
     /*
      * Append a netlink route message describing the cached route attached
      * to @skb (skb_rtable(skb)) to that same @skb.
      *
      * @net:    namespace, used for the MC_FORWARDING devconf check.
      * @skb:    message buffer; also carries the route to describe.
      * @pid, @seq, @event, @flags: passed through to nlmsg_put().
      * @nowait: passed to ipmr_get_route(); also selects how a failure of
      *          that call is reported (see the input-route branch below).
      *
      * Returns the value of nlmsg_end() on success; -EMSGSIZE when the header
      * cannot be added, or when adding an attribute or ipmr_get_route()
      * fails (in those cases the partial message is cancelled first); or 0
      * when ipmr_get_route() returned 0 and @nowait is not set.
      */
2899 static int rt_fill_info(struct net *net,
2900                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2901                         int nowait, unsigned int flags)
2902 {
2903         struct rtable *rt = skb_rtable(skb);
2904         struct rtmsg *r;
2905         struct nlmsghdr *nlh;
2906         unsigned long expires = 0;
2907         u32 id = 0, ts = 0, tsage = 0, error;
2908
2909         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2910         if (nlh == NULL)
2911                 return -EMSGSIZE;
2912
             /* Fixed rtmsg header: a /32 destination reported in the main table. */
2913         r = nlmsg_data(nlh);
2914         r->rtm_family    = AF_INET;
2915         r->rtm_dst_len  = 32;
2916         r->rtm_src_len  = 0;
2917         r->rtm_tos      = rt->rt_key_tos;
2918         r->rtm_table    = RT_TABLE_MAIN;
2919         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2920                 goto nla_put_failure;
2921         r->rtm_type     = rt->rt_type;
2922         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2923         r->rtm_protocol = RTPROT_UNSPEC;
             /* Only the upper 16 bits of rt_flags are exported. */
2924         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2925         if (rt->rt_flags & RTCF_NOTIFY)
2926                 r->rtm_flags |= RTM_F_NOTIFY;
2927
2928         if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
2929                 goto nla_put_failure;
2930         if (rt->rt_key_src) {
2931                 r->rtm_src_len = 32;
2932                 if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
2933                         goto nla_put_failure;
2934         }
2935         if (rt->dst.dev &&
2936             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2937                 goto nla_put_failure;
2938 #ifdef CONFIG_IP_ROUTE_CLASSID
2939         if (rt->dst.tclassid &&
2940             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2941                 goto nla_put_failure;
2942 #endif
             /* Input routes report rt_spec_dst as preferred source; output
              * routes report rt_src only when it differs from the key.
              */
2943         if (rt_is_input_route(rt)) {
2944                 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_spec_dst))
2945                         goto nla_put_failure;
2946         } else if (rt->rt_src != rt->rt_key_src) {
2947                 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
2948                         goto nla_put_failure;
2949         }
2950         if (rt->rt_dst != rt->rt_gateway &&
2951             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2952                 goto nla_put_failure;
2953
2954         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2955                 goto nla_put_failure;
2956
2957         if (rt->rt_mark &&
2958             nla_put_be32(skb, RTA_MARK, rt->rt_mark))
2959                 goto nla_put_failure;
2960
             /* Collect the values for the cacheinfo attribute; the peer-derived
              * ones stay 0 when the route has no inet_peer.
              */
2961         error = rt->dst.error;
2962         if (rt_has_peer(rt)) {
2963                 const struct inet_peer *peer = rt_peer_ptr(rt);
2964                 inet_peer_refcheck(peer);
2965                 id = atomic_read(&peer->ip_id_count) & 0xffff;
2966                 if (peer->tcp_ts_stamp) {
2967                         ts = peer->tcp_ts;
2968                         tsage = get_seconds() - peer->tcp_ts_stamp;
2969                 }
                     /* Convert the absolute expiry into remaining jiffies,
                      * or 0 if it has already passed.
                      */
2970                 expires = ACCESS_ONCE(peer->pmtu_expires);
2971                 if (expires) {
2972                         if (time_before(jiffies, expires))
2973                                 expires -= jiffies;
2974                         else
2975                                 expires = 0;
2976                 }
2977         }
2978
             /* With CONFIG_IP_MROUTE, forwarded (non link-local) multicast input
              * routes are described by ipmr_get_route(); every other input
              * route gets a plain RTA_IIF attribute.
              */
2979         if (rt_is_input_route(rt)) {
2980 #ifdef CONFIG_IP_MROUTE
2981                 __be32 dst = rt->rt_dst;
2982
2983                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2984                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2985                         int err = ipmr_get_route(net, skb,
2986                                                  rt->rt_src, rt->rt_dst,
2987                                                  r, nowait);
2988                         if (err <= 0) {
2989                                 if (!nowait) {
2990                                         if (err == 0)
2991                                                 return 0;
2992                                         goto nla_put_failure;
2993                                 } else {
2994                                         if (err == -EMSGSIZE)
2995                                                 goto nla_put_failure;
                                             /* Other results are reported to
                                              * user space via cacheinfo.
                                              */
2996                                         error = err;
2997                                 }
2998                         }
2999                 } else
3000 #endif
3001                         if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
3002                                 goto nla_put_failure;
3003         }
3004
3005         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
3006                                expires, error) < 0)
3007                 goto nla_put_failure;
3008
3009         return nlmsg_end(skb, nlh);
3010
3011 nla_put_failure:
3012         nlmsg_cancel(skb, nlh);
3013         return -EMSGSIZE;
3014 }
3015
     /*
      * Netlink handler answering a route query carried in @nlh.
      *
      * Parses the RTA_SRC/RTA_DST/RTA_IIF/RTA_OIF/RTA_MARK attributes, then
      * resolves the route on a freshly allocated dummy skb: with RTA_IIF set
      * an input lookup (ip_route_input()) is simulated on that device,
      * otherwise an output lookup (ip_route_output_key()) is done.  The
      * result is formatted with rt_fill_info() as an RTM_NEWROUTE message and
      * unicast back to the requesting pid.
      *
      * @in_skb: request skb; supplies the namespace and the requester's pid.
      * @nlh:    request header.
      * @arg:    not used in this function.
      *
      * Returns the result of rtnl_unicast() on success, otherwise an error:
      * the nlmsg_parse() error, -ENOBUFS if the reply skb cannot be allocated,
      * -ENODEV for an unknown input interface, the route lookup error, or the
      * non-positive result of rt_fill_info().
      */
3016 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
3017 {
3018         struct net *net = sock_net(in_skb->sk);
3019         struct rtmsg *rtm;
3020         struct nlattr *tb[RTA_MAX+1];
3021         struct rtable *rt = NULL;
3022         __be32 dst = 0;
3023         __be32 src = 0;
3024         u32 iif;
3025         int err;
3026         int mark;
3027         struct sk_buff *skb;
3028
3029         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3030         if (err < 0)
3031                 goto errout;
3032
3033         rtm = nlmsg_data(nlh);
3034
3035         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3036         if (skb == NULL) {
3037                 err = -ENOBUFS;
3038                 goto errout;
3039         }
3040
3041         /* Reserve room for dummy headers, this skb can pass
3042            through good chunk of routing engine.
3043          */
3044         skb_reset_mac_header(skb);
3045         skb_reset_network_header(skb);
3046
3047         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3048         ip_hdr(skb)->protocol = IPPROTO_ICMP;
3049         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3050
             /* Absent attributes default to 0. */
3051         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3052         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3053         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3054         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3055
3056         if (iif) {
3057                 struct net_device *dev;
3058
3059                 dev = __dev_get_by_index(net, iif);
3060                 if (dev == NULL) {
3061                         err = -ENODEV;
3062                         goto errout_free;
3063                 }
3064
3065                 skb->protocol   = htons(ETH_P_IP);
3066                 skb->dev        = dev;
3067                 skb->mark       = mark;
                     /* The input lookup is run with bottom halves disabled. */
3068                 local_bh_disable();
3069                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3070                 local_bh_enable();
3071
                     /* A lookup that succeeded but produced an error route is
                      * reported as that route's (negated) error.
                      */
3072                 rt = skb_rtable(skb);
3073                 if (err == 0 && rt->dst.error)
3074                         err = -rt->dst.error;
3075         } else {
3076                 struct flowi4 fl4 = {
3077                         .daddr = dst,
3078                         .saddr = src,
3079                         .flowi4_tos = rtm->rtm_tos,
3080                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3081                         .flowi4_mark = mark,
3082                 };
3083                 rt = ip_route_output_key(net, &fl4);
3084
3085                 err = 0;
3086                 if (IS_ERR(rt))
3087                         err = PTR_ERR(rt);
3088         }
3089
3090         if (err)
3091                 goto errout_free;
3092
3093         skb_dst_set(skb, &rt->dst);
3094         if (rtm->rtm_flags & RTM_F_NOTIFY)
3095                 rt->rt_flags |= RTCF_NOTIFY;
3096
3097         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3098                            RTM_NEWROUTE, 0, 0);
3099         if (err <= 0)
3100                 goto errout_free;
3101
3102         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3103 errout:
3104         return err;
3105
3106 errout_free:
3107         kfree_skb(skb);
3108         goto errout;
3109 }
3110
     /*
      * Netlink dump callback for the route cache.
      *
      * Walks rt_hash_table starting at bucket cb->args[0] and, within that
      * first bucket, at chain position cb->args[1]; later buckets start from
      * position 0.  Every non-expired entry belonging to the requester's
      * namespace is appended to @skb via rt_fill_info().  When an entry does
      * not fit (rt_fill_info() <= 0) the walk stops and the current
      * bucket/position are stored back in cb->args[] for the next call.
      *
      * Returns skb->len.
      */
3111 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3112 {
3113         struct rtable *rt;
3114         int h, s_h;
3115         int idx, s_idx;
3116         struct net *net;
3117
3118         net = sock_net(skb->sk);
3119
3120         s_h = cb->args[0];
3121         if (s_h < 0)
3122                 s_h = 0;
3123         s_idx = idx = cb->args[1];
3124         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
                     /* Skip empty buckets without entering the RCU section. */
3125                 if (!rt_hash_table[h].chain)
3126                         continue;
3127                 rcu_read_lock_bh();
3128                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3129                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3130                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3131                                 continue;
3132                         if (rt_is_expired(rt))
3133                                 continue;
                             /* Attach the route to the skb only for the duration
                              * of rt_fill_info(); no reference is taken.
                              */
3134                         skb_dst_set_noref(skb, &rt->dst);
3135                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3136                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3137                                          1, NLM_F_MULTI) <= 0) {
3138                                 skb_dst_drop(skb);
3139                                 rcu_read_unlock_bh();
3140                                 goto done;
3141                         }
3142                         skb_dst_drop(skb);
3143                 }
3144                 rcu_read_unlock_bh();
3145         }
3146
3147 done:
             /* Save the resume point for the next invocation. */
3148         cb->args[0] = h;
3149         cb->args[1] = idx;
3150         return skb->len;
3151 }
3152
     /*
      * Flush the route cache of the namespace that owns @in_dev's device,
      * passing 0 as the delay argument of rt_cache_flush().
      */
3153 void ip_rt_multicast_event(struct in_device *in_dev)
3154 {
3155         rt_cache_flush(dev_net(in_dev->dev), 0);
3156 }
3157
3158 #ifdef CONFIG_SYSCTL
     /*
      * sysctl handler that flushes the route cache on write.
      *
      * On a write, the integer written by the user is parsed into a local
      * variable (through a stack copy of the ctl_table whose .data points at
      * it) and passed as the delay to rt_cache_flush() for the namespace
      * stored in __ctl->extra1.
      *
      * Returns 0 for writes and -EINVAL for reads.
      */
3159 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3160                                         void __user *buffer,
3161                                         size_t *lenp, loff_t *ppos)
3162 {
3163         if (write) {
3164                 int flush_delay;
3165                 ctl_table ctl;
3166                 struct net *net;
3167
                     /* Work on a copy so the shared table entry is not modified. */
3168                 memcpy(&ctl, __ctl, sizeof(ctl));
3169                 ctl.data = &flush_delay;
                     /* NOTE(review): the result of proc_dointvec() is ignored
                      * and flush_delay has no initialiser, so a write that
                      * stores no value would flush with an indeterminate
                      * delay -- confirm whether that can happen.
                      */
3170                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3171
3172                 net = (struct net *)__ctl->extra1;
3173                 rt_cache_flush(net, flush_delay);
3174                 return 0;
3175         }
3176
3177         return -EINVAL;
3178 }
3179
     /*
      * sysctl entries for the IPv4 routing tunables.  Every entry exposes one
      * int-sized variable with mode 0644; the handler decides how the value
      * is converted (plain integer, or via the *_jiffies helpers).  The table
      * ends with an empty sentinel entry.
      */
3180 static ctl_table ipv4_route_table[] = {
3181         {
3182                 .procname       = "gc_thresh",
3183                 .data           = &ipv4_dst_ops.gc_thresh,
3184                 .maxlen         = sizeof(int),
3185                 .mode           = 0644,
3186                 .proc_handler   = proc_dointvec,
3187         },
3188         {
3189                 .procname       = "max_size",
3190                 .data           = &ip_rt_max_size,
3191                 .maxlen         = sizeof(int),
3192                 .mode           = 0644,
3193                 .proc_handler   = proc_dointvec,
3194         },
3195         {
3196                 /*  Deprecated. Use gc_min_interval_ms */
3197
3198                 .procname       = "gc_min_interval",
3199                 .data           = &ip_rt_gc_min_interval,
3200                 .maxlen         = sizeof(int),
3201                 .mode           = 0644,
3202                 .proc_handler   = proc_dointvec_jiffies,
3203         },
3204         {
                     /* Same variable as "gc_min_interval" above, exposed through
                      * the ms_jiffies handler (presumably in milliseconds).
                      */
3205                 .procname       = "gc_min_interval_ms",
3206                 .data           = &ip_rt_gc_min_interval,
3207                 .maxlen         = sizeof(int),
3208                 .mode           = 0644,
3209                 .proc_handler   = proc_dointvec_ms_jiffies,
3210         },
3211         {
3212                 .procname       = "gc_timeout",
3213                 .data           = &ip_rt_gc_timeout,
3214                 .maxlen         = sizeof(int),
3215                 .mode           = 0644,
3216                 .proc_handler   = proc_dointvec_jiffies,
3217         },
3218         {
3219                 .procname       = "gc_interval",
3220                 .data           = &ip_rt_gc_interval,
3221                 .maxlen         = sizeof(int),
3222                 .mode           = 0644,
3223                 .proc_handler   = proc_dointvec_jiffies,
3224         },
3225         {
3226                 .procname       = "redirect_load",
3227                 .data           = &ip_rt_redirect_load,
3228                 .maxlen         = sizeof(int),
3229                 .mode           = 0644,
3230                 .proc_handler   = proc_dointvec,
3231         },
3232         {
3233                 .procname       = "redirect_number",
3234                 .data           = &ip_rt_redirect_number,
3235                 .maxlen         = sizeof(int),
3236                 .mode           = 0644,
3237                 .proc_handler   = proc_dointvec,
3238         },
3239         {
3240                 .procname       = "redirect_silence",
3241                 .data           = &ip_rt_redirect_silence,
3242                 .maxlen         = sizeof(int),
3243                 .mode           = 0644,
3244                 .proc_handler   = proc_dointvec,
3245         },
3246         {
3247                 .procname       = "error_cost",
3248                 .data           = &ip_rt_error_cost,
3249                 .maxlen         = sizeof(int),
3250                 .mode           = 0644,
3251                 .proc_handler   = proc_dointvec,
3252         },
3253         {
3254                 .procname       = "error_burst",
3255                 .data           = &ip_rt_error_burst,
3256                 .maxlen         = sizeof(int),
3257                 .mode           = 0644,
3258                 .proc_handler   = proc_dointvec,
3259         },
3260         {
3261                 .procname       = "gc_elasticity",
3262                 .data           = &ip_rt_gc_elasticity,
3263                 .maxlen         = sizeof(int),
3264                 .mode           = 0644,
3265                 .proc_handler   = proc_dointvec,
3266         },
3267         {
3268                 .procname       = "mtu_expires",
3269                 .data           = &ip_rt_mtu_expires,
3270                 .maxlen         = sizeof(int),
3271                 .mode           = 0644,
3272                 .proc_handler   = proc_dointvec_jiffies,
3273         },
3274         {
3275                 .procname       = "min_pmtu",
3276                 .data           = &ip_rt_min_pmtu,
3277                 .maxlen         = sizeof(int),
3278                 .mode           = 0644,
3279                 .proc_handler   = proc_dointvec,
3280         },
3281         {
3282                 .procname       = "min_adv_mss",
3283                 .data           = &ip_rt_min_advmss,
3284                 .maxlen         = sizeof(int),
3285                 .mode           = 0644,
3286                 .proc_handler   = proc_dointvec,
3287         },
3288         { }
3289 };
3290
3291 static struct ctl_table ipv4_route_flush_table[] = {
3292         {
3293                 .procname       = "flush",
3294                 .maxlen         = sizeof(int),
3295                 .mode           = 0200,
3296                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3297         },
3298         { },
3299 };
3300
3301 static __net_init int sysctl_route_net_init(struct net *net)
3302 {
3303         struct ctl_table *tbl;
3304
3305         tbl = ipv4_route_flush_table;
3306         if (!net_eq(net, &init_net)) {
3307                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3308                 if (tbl == NULL)
3309                         goto err_dup;
3310         }
3311         tbl[0].extra1 = net;
3312
3313         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3314         if (net->ipv4.route_hdr == NULL)
3315                 goto err_reg;
3316         return 0;
3317
3318 err_reg:
3319         if (tbl != ipv4_route_flush_table)
3320                 kfree(tbl);
3321 err_dup:
3322         return -ENOMEM;
3323 }
3324
3325 static __net_exit void sysctl_route_net_exit(struct net *net)
3326 {
3327         struct ctl_table *tbl;
3328
3329         tbl = net->ipv4.route_hdr->ctl_table_arg;
3330         unregister_net_sysctl_table(net->ipv4.route_hdr);
3331         BUG_ON(tbl == ipv4_route_flush_table);
3332         kfree(tbl);
3333 }
3334
3335 static __net_initdata struct pernet_operations sysctl_route_ops = {
3336         .init = sysctl_route_net_init,
3337         .exit = sysctl_route_net_exit,
3338 };
3339 #endif
3340
3341 static __net_init int rt_genid_init(struct net *net)
3342 {
3343         get_random_bytes(&net->ipv4.rt_genid,
3344                          sizeof(net->ipv4.rt_genid));
3345         get_random_bytes(&net->ipv4.dev_addr_genid,
3346                          sizeof(net->ipv4.dev_addr_genid));
3347         return 0;
3348 }
3349
3350 static __net_initdata struct pernet_operations rt_genid_ops = {
3351         .init = rt_genid_init,
3352 };
3353
3354 static int __net_init ipv4_inetpeer_init(struct net *net)
3355 {
3356         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3357
3358         if (!bp)
3359                 return -ENOMEM;
3360         inet_peer_base_init(bp);
3361         net->ipv4.peers = bp;
3362         return 0;
3363 }
3364
3365 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3366 {
3367         struct inet_peer_base *bp = net->ipv4.peers;
3368
3369         net->ipv4.peers = NULL;
3370         inetpeer_invalidate_tree(bp);
3371         kfree(bp);
3372 }
3373
3374 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3375         .init   =       ipv4_inetpeer_init,
3376         .exit   =       ipv4_inetpeer_exit,
3377 };
3378
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-cpu area; ip_rt_init() allocates room for 256 struct ip_rt_acct. */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
3382
3383 static __initdata unsigned long rhash_entries;
3384 static int __init set_rhash_entries(char *str)
3385 {
3386         ssize_t ret;
3387
3388         if (!str)
3389                 return 0;
3390
3391         ret = kstrtoul(str, 0, &rhash_entries);
3392         if (ret)
3393                 return 0;
3394
3395         return 1;
3396 }
3397 __setup("rhash_entries=", set_rhash_entries);
3398
3399 int __init ip_rt_init(void)
3400 {
3401         int rc = 0;
3402
3403 #ifdef CONFIG_IP_ROUTE_CLASSID
3404         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3405         if (!ip_rt_acct)
3406                 panic("IP: failed to allocate ip_rt_acct\n");
3407 #endif
3408
3409         ipv4_dst_ops.kmem_cachep =
3410                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3411                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3412
3413         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3414
3415         if (dst_entries_init(&ipv4_dst_ops) < 0)
3416                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3417
3418         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3419                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3420
3421         rt_hash_table = (struct rt_hash_bucket *)
3422                 alloc_large_system_hash("IP route cache",
3423                                         sizeof(struct rt_hash_bucket),
3424                                         rhash_entries,
3425                                         (totalram_pages >= 128 * 1024) ?
3426                                         15 : 17,
3427                                         0,
3428                                         &rt_hash_log,
3429                                         &rt_hash_mask,
3430                                         0,
3431                                         rhash_entries ? 0 : 512 * 1024);
3432         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3433         rt_hash_lock_init();
3434
3435         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3436         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3437
3438         devinet_init();
3439         ip_fib_init();
3440
3441         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3442         expires_ljiffies = jiffies;
3443         schedule_delayed_work(&expires_work,
3444                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3445
3446         if (ip_rt_proc_init())
3447                 pr_err("Unable to create route proc files\n");
3448 #ifdef CONFIG_XFRM
3449         xfrm_init();
3450         xfrm4_init(ip_rt_max_size);
3451 #endif
3452         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3453
3454 #ifdef CONFIG_SYSCTL
3455         register_pernet_subsys(&sysctl_route_ops);
3456 #endif
3457         register_pernet_subsys(&rt_genid_ops);
3458         register_pernet_subsys(&ipv4_inetpeer_ops);
3459         return rc;
3460 }
3461
3462 #ifdef CONFIG_SYSCTL
3463 /*
3464  * We really need to sanitize the damn ipv4 init order, then all
3465  * this nonsense will go away.
3466  */
3467 void __init ip_static_sysctl_init(void)
3468 {
3469         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3470 }
3471 #endif