net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #define pr_fmt(fmt) "IPv4: " fmt
  66
  67 #include <linux/module.h>
  68 #include <asm/uaccess.h>
  69 #include <linux/bitops.h>
  70 #include <linux/types.h>
  71 #include <linux/kernel.h>
  72 #include <linux/mm.h>
  73 #include <linux/bootmem.h>
  74 #include <linux/string.h>
  75 #include <linux/socket.h>
  76 #include <linux/sockios.h>
  77 #include <linux/errno.h>
  78 #include <linux/in.h>
  79 #include <linux/inet.h>
  80 #include <linux/netdevice.h>
  81 #include <linux/proc_fs.h>
  82 #include <linux/init.h>
  83 #include <linux/workqueue.h>
  84 #include <linux/skbuff.h>
  85 #include <linux/inetdevice.h>
  86 #include <linux/igmp.h>
  87 #include <linux/pkt_sched.h>
  88 #include <linux/mroute.h>
  89 #include <linux/netfilter_ipv4.h>
  90 #include <linux/random.h>
  91 #include <linux/jhash.h>
  92 #include <linux/rcupdate.h>
  93 #include <linux/times.h>
  94 #include <linux/slab.h>
  95 #include <linux/prefetch.h>
  96 #include <net/dst.h>
  97 #include <net/net_namespace.h>
  98 #include <net/protocol.h>
  99 #include <net/ip.h>
 100 #include <net/route.h>
 101 #include <net/inetpeer.h>
 102 #include <net/sock.h>
 103 #include <net/ip_fib.h>
 104 #include <net/arp.h>
 105 #include <net/tcp.h>
 106 #include <net/icmp.h>
 107 #include <net/xfrm.h>
 108 #include <net/netevent.h>
 109 #include <net/rtnetlink.h>
 110 #ifdef CONFIG_SYSCTL
 111 #include <linux/sysctl.h>
 112 #include <linux/kmemleak.h>
 113 #endif
 114 #include <net/secure_seq.h>
 115
 116 #define RT_FL_TOS(oldflp4) \
 117         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 118
 119 #define IP_MAX_MTU      0xFFF0
 120
 121 #define RT_GC_TIMEOUT (300*HZ)
 122
 123 static int ip_rt_max_size;
 124 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 125 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
 126 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
 127 static int ip_rt_redirect_number __read_mostly  = 9;
 128 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 129 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 130 static int ip_rt_error_cost __read_mostly       = HZ;
 131 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 132 static int ip_rt_gc_elasticity __read_mostly    = 8;
 133 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 134 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 135 static int ip_rt_min_advmss __read_mostly       = 256;
 136 static int rt_chain_length_max __read_mostly    = 20;
 137
 138 static struct delayed_work expires_work;
 139 static unsigned long expires_ljiffies;
 140
 141 /*
 142  *      Interface to generic destination cache.
 143  */
 144
 145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 146 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 147 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 148 static void              ipv4_dst_destroy(struct dst_entry *dst);
 149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 150 static void              ipv4_link_failure(struct sk_buff *skb);
 151 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 152 static int rt_garbage_collect(struct dst_ops *ops);
 153
 154 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 155                             int how)
 156 {
 157 }
 158
 159 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 160 {
 161         struct rtable *rt = (struct rtable *) dst;
 162         struct inet_peer *peer;
 163         u32 *p = NULL;
 164
 165         if (!rt->peer)
 166                 rt_bind_peer(rt, rt->rt_dst, 1);
 167
 168         peer = rt->peer;
 169         if (peer) {
 170                 u32 *old_p = __DST_METRICS_PTR(old);
 171                 unsigned long prev, new;
 172
 173                 p = peer->metrics;
 174                 if (inet_metrics_new(peer))
 175                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
 176
 177                 new = (unsigned long) p;
 178                 prev = cmpxchg(&dst->_metrics, old, new);
 179
 180                 if (prev != old) {
 181                         p = __DST_METRICS_PTR(prev);
 182                         if (prev & DST_METRICS_READ_ONLY)
 183                                 p = NULL;
 184                 } else {
 185                         if (rt->fi) {
 186                                 fib_info_put(rt->fi);
 187                                 rt->fi = NULL;
 188                         }
 189                 }
 190         }
 191         return p;
 192 }
 193
 194 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
 195
 196 static struct dst_ops ipv4_dst_ops = {
 197         .family =               AF_INET,
 198         .protocol =             cpu_to_be16(ETH_P_IP),
 199         .gc =                   rt_garbage_collect,
 200         .check =                ipv4_dst_check,
 201         .default_advmss =       ipv4_default_advmss,
 202         .mtu =                  ipv4_mtu,
 203         .cow_metrics =          ipv4_cow_metrics,
 204         .destroy =              ipv4_dst_destroy,
 205         .ifdown =               ipv4_dst_ifdown,
 206         .negative_advice =      ipv4_negative_advice,
 207         .link_failure =         ipv4_link_failure,
 208         .update_pmtu =          ip_rt_update_pmtu,
 209         .local_out =            __ip_local_out,
 210         .neigh_lookup =         ipv4_neigh_lookup,
 211 };
 212
 213 #define ECN_OR_COST(class)      TC_PRIO_##class
 214
 215 const __u8 ip_tos2prio[16] = {
 216         TC_PRIO_BESTEFFORT,
 217         ECN_OR_COST(BESTEFFORT),
 218         TC_PRIO_BESTEFFORT,
 219         ECN_OR_COST(BESTEFFORT),
 220         TC_PRIO_BULK,
 221         ECN_OR_COST(BULK),
 222         TC_PRIO_BULK,
 223         ECN_OR_COST(BULK),
 224         TC_PRIO_INTERACTIVE,
 225         ECN_OR_COST(INTERACTIVE),
 226         TC_PRIO_INTERACTIVE,
 227         ECN_OR_COST(INTERACTIVE),
 228         TC_PRIO_INTERACTIVE_BULK,
 229         ECN_OR_COST(INTERACTIVE_BULK),
 230         TC_PRIO_INTERACTIVE_BULK,
 231         ECN_OR_COST(INTERACTIVE_BULK)
 232 };
 233 EXPORT_SYMBOL(ip_tos2prio);
 234
 235 /*
 236  * Route cache.
 237  */
 238
 239 /* The locking scheme is rather straight forward:
 240  *
 241  * 1) Read-Copy Update protects the buckets of the central route hash.
 242  * 2) Only writers remove entries, and they hold the lock
 243  *    as they look at rtable reference counts.
 244  * 3) Only readers acquire references to rtable entries,
 245  *    they do so with atomic increments and with the
 246  *    lock held.
 247  */
 248
 249 struct rt_hash_bucket {
 250         struct rtable __rcu     *chain;
 251 };
 252
 253 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 254         defined(CONFIG_PROVE_LOCKING)
 255 /*
 256  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 257  * The size of this table is a power of two and depends on the number of CPUS.
 258  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 259  */
 260 #ifdef CONFIG_LOCKDEP
 261 # define RT_HASH_LOCK_SZ        256
 262 #else
 263 # if NR_CPUS >= 32
 264 #  define RT_HASH_LOCK_SZ       4096
 265 # elif NR_CPUS >= 16
 266 #  define RT_HASH_LOCK_SZ       2048
 267 # elif NR_CPUS >= 8
 268 #  define RT_HASH_LOCK_SZ       1024
 269 # elif NR_CPUS >= 4
 270 #  define RT_HASH_LOCK_SZ       512
 271 # else
 272 #  define RT_HASH_LOCK_SZ       256
 273 # endif
 274 #endif
 275
 276 static spinlock_t       *rt_hash_locks;
 277 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 278
 279 static __init void rt_hash_lock_init(void)
 280 {
 281         int i;
 282
 283         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 284                         GFP_KERNEL);
 285         if (!rt_hash_locks)
 286                 panic("IP: failed to allocate rt_hash_locks\n");
 287
 288         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 289                 spin_lock_init(&rt_hash_locks[i]);
 290 }
 291 #else
 292 # define rt_hash_lock_addr(slot) NULL
 293
 294 static inline void rt_hash_lock_init(void)
 295 {
 296 }
 297 #endif
 298
 299 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
 300 static unsigned int             rt_hash_mask __read_mostly;
 301 static unsigned int             rt_hash_log  __read_mostly;
 302
 303 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 304 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 305
 306 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 307                                    int genid)
 308 {
 309         return jhash_3words((__force u32)daddr, (__force u32)saddr,
 310                             idx, genid)
 311                 & rt_hash_mask;
 312 }
 313
 314 static inline int rt_genid(struct net *net)
 315 {
 316         return atomic_read(&net->ipv4.rt_genid);
 317 }
 318
 319 #ifdef CONFIG_PROC_FS
 320 struct rt_cache_iter_state {
 321         struct seq_net_private p;
 322         int bucket;
 323         int genid;
 324 };
 325
 326 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 327 {
 328         struct rt_cache_iter_state *st = seq->private;
 329         struct rtable *r = NULL;
 330
 331         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 332                 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
 333                         continue;
 334                 rcu_read_lock_bh();
 335                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 336                 while (r) {
 337                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
 338                             r->rt_genid == st->genid)
 339                                 return r;
 340                         r = rcu_dereference_bh(r->dst.rt_next);
 341                 }
 342                 rcu_read_unlock_bh();
 343         }
 344         return r;
 345 }
 346
 347 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 348                                           struct rtable *r)
 349 {
 350         struct rt_cache_iter_state *st = seq->private;
 351
 352         r = rcu_dereference_bh(r->dst.rt_next);
 353         while (!r) {
 354                 rcu_read_unlock_bh();
 355                 do {
 356                         if (--st->bucket < 0)
 357                                 return NULL;
 358                 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
 359                 rcu_read_lock_bh();
 360                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 361         }
 362         return r;
 363 }
 364
 365 static struct rtable *rt_cache_get_next(struct seq_file *seq,
 366                                         struct rtable *r)
 367 {
 368         struct rt_cache_iter_state *st = seq->private;
 369         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 370                 if (dev_net(r->dst.dev) != seq_file_net(seq))
 371                         continue;
 372                 if (r->rt_genid == st->genid)
 373                         break;
 374         }
 375         return r;
 376 }
 377
 378 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 379 {
 380         struct rtable *r = rt_cache_get_first(seq);
 381
 382         if (r)
 383                 while (pos && (r = rt_cache_get_next(seq, r)))
 384                         --pos;
 385         return pos ? NULL : r;
 386 }
 387
 388 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 389 {
 390         struct rt_cache_iter_state *st = seq->private;
 391         if (*pos)
 392                 return rt_cache_get_idx(seq, *pos - 1);
 393         st->genid = rt_genid(seq_file_net(seq));
 394         return SEQ_START_TOKEN;
 395 }
 396
 397 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 398 {
 399         struct rtable *r;
 400
 401         if (v == SEQ_START_TOKEN)
 402                 r = rt_cache_get_first(seq);
 403         else
 404                 r = rt_cache_get_next(seq, v);
 405         ++*pos;
 406         return r;
 407 }
 408
 409 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 410 {
 411         if (v && v != SEQ_START_TOKEN)
 412                 rcu_read_unlock_bh();
 413 }
 414
 415 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 416 {
 417         if (v == SEQ_START_TOKEN)
 418                 seq_printf(seq, "%-127s\n",
 419                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 420                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 421                            "HHUptod\tSpecDst");
 422         else {
 423                 struct rtable *r = v;
 424                 struct neighbour *n;
 425                 int len, HHUptod;
 426
 427                 rcu_read_lock();
 428                 n = dst_get_neighbour_noref(&r->dst);
 429                 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
 430                 rcu_read_unlock();
 431
 432                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
 433                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 434                         r->dst.dev ? r->dst.dev->name : "*",
 435                         (__force u32)r->rt_dst,
 436                         (__force u32)r->rt_gateway,
 437                         r->rt_flags, atomic_read(&r->dst.__refcnt),
 438                         r->dst.__use, 0, (__force u32)r->rt_src,
 439                         dst_metric_advmss(&r->dst) + 40,
 440                         dst_metric(&r->dst, RTAX_WINDOW),
 441                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
 442                               dst_metric(&r->dst, RTAX_RTTVAR)),
 443                         r->rt_key_tos,
 444                         -1,
 445                         HHUptod,
 446                         r->rt_spec_dst, &len);
 447
 448                 seq_printf(seq, "%*s\n", 127 - len, "");
 449         }
 450         return 0;
 451 }
 452
 453 static const struct seq_operations rt_cache_seq_ops = {
 454         .start  = rt_cache_seq_start,
 455         .next   = rt_cache_seq_next,
 456         .stop   = rt_cache_seq_stop,
 457         .show   = rt_cache_seq_show,
 458 };
 459
 460 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 461 {
 462         return seq_open_net(inode, file, &rt_cache_seq_ops,
 463                         sizeof(struct rt_cache_iter_state));
 464 }
 465
 466 static const struct file_operations rt_cache_seq_fops = {
 467         .owner   = THIS_MODULE,
 468         .open    = rt_cache_seq_open,
 469         .read    = seq_read,
 470         .llseek  = seq_lseek,
 471         .release = seq_release_net,
 472 };
 473
 474
 475 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 476 {
 477         int cpu;
 478
 479         if (*pos == 0)
 480                 return SEQ_START_TOKEN;
 481
 482         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 483                 if (!cpu_possible(cpu))
 484                         continue;
 485                 *pos = cpu+1;
 486                 return &per_cpu(rt_cache_stat, cpu);
 487         }
 488         return NULL;
 489 }
 490
 491 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 492 {
 493         int cpu;
 494
 495         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 496                 if (!cpu_possible(cpu))
 497                         continue;
 498                 *pos = cpu+1;
 499                 return &per_cpu(rt_cache_stat, cpu);
 500         }
 501         return NULL;
 502
 503 }
 504
 505 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 506 {
 507
 508 }
 509
 510 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 511 {
 512         struct rt_cache_stat *st = v;
 513
 514         if (v == SEQ_START_TOKEN) {
 515                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 516                 return 0;
 517         }
 518
 519         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 520                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 521                    dst_entries_get_slow(&ipv4_dst_ops),
 522                    st->in_hit,
 523                    st->in_slow_tot,
 524                    st->in_slow_mc,
 525                    st->in_no_route,
 526                    st->in_brd,
 527                    st->in_martian_dst,
 528                    st->in_martian_src,
 529
 530                    st->out_hit,
 531                    st->out_slow_tot,
 532                    st->out_slow_mc,
 533
 534                    st->gc_total,
 535                    st->gc_ignored,
 536                    st->gc_goal_miss,
 537                    st->gc_dst_overflow,
 538                    st->in_hlist_search,
 539                    st->out_hlist_search
 540                 );
 541         return 0;
 542 }
 543
 544 static const struct seq_operations rt_cpu_seq_ops = {
 545         .start  = rt_cpu_seq_start,
 546         .next   = rt_cpu_seq_next,
 547         .stop   = rt_cpu_seq_stop,
 548         .show   = rt_cpu_seq_show,
 549 };
 550
 551
 552 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 553 {
 554         return seq_open(file, &rt_cpu_seq_ops);
 555 }
 556
 557 static const struct file_operations rt_cpu_seq_fops = {
 558         .owner   = THIS_MODULE,
 559         .open    = rt_cpu_seq_open,
 560         .read    = seq_read,
 561         .llseek  = seq_lseek,
 562         .release = seq_release,
 563 };
 564
 565 #ifdef CONFIG_IP_ROUTE_CLASSID
 566 static int rt_acct_proc_show(struct seq_file *m, void *v)
 567 {
 568         struct ip_rt_acct *dst, *src;
 569         unsigned int i, j;
 570
 571         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 572         if (!dst)
 573                 return -ENOMEM;
 574
 575         for_each_possible_cpu(i) {
 576                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 577                 for (j = 0; j < 256; j++) {
 578                         dst[j].o_bytes   += src[j].o_bytes;
 579                         dst[j].o_packets += src[j].o_packets;
 580                         dst[j].i_bytes   += src[j].i_bytes;
 581                         dst[j].i_packets += src[j].i_packets;
 582                 }
 583         }
 584
 585         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 586         kfree(dst);
 587         return 0;
 588 }
 589
 590 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 591 {
 592         return single_open(file, rt_acct_proc_show, NULL);
 593 }
 594
 595 static const struct file_operations rt_acct_proc_fops = {
 596         .owner          = THIS_MODULE,
 597         .open           = rt_acct_proc_open,
 598         .read           = seq_read,
 599         .llseek         = seq_lseek,
 600         .release        = single_release,
 601 };
 602 #endif
 603
 604 static int __net_init ip_rt_do_proc_init(struct net *net)
 605 {
 606         struct proc_dir_entry *pde;
 607
 608         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 609                         &rt_cache_seq_fops);
 610         if (!pde)
 611                 goto err1;
 612
 613         pde = proc_create("rt_cache", S_IRUGO,
 614                           net->proc_net_stat, &rt_cpu_seq_fops);
 615         if (!pde)
 616                 goto err2;
 617
 618 #ifdef CONFIG_IP_ROUTE_CLASSID
 619         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 620         if (!pde)
 621                 goto err3;
 622 #endif
 623         return 0;
 624
 625 #ifdef CONFIG_IP_ROUTE_CLASSID
 626 err3:
 627         remove_proc_entry("rt_cache", net->proc_net_stat);
 628 #endif
 629 err2:
 630         remove_proc_entry("rt_cache", net->proc_net);
 631 err1:
 632         return -ENOMEM;
 633 }
 634
 635 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 636 {
 637         remove_proc_entry("rt_cache", net->proc_net_stat);
 638         remove_proc_entry("rt_cache", net->proc_net);
 639 #ifdef CONFIG_IP_ROUTE_CLASSID
 640         remove_proc_entry("rt_acct", net->proc_net);
 641 #endif
 642 }
 643
 644 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 645         .init = ip_rt_do_proc_init,
 646         .exit = ip_rt_do_proc_exit,
 647 };
 648
 649 static int __init ip_rt_proc_init(void)
 650 {
 651         return register_pernet_subsys(&ip_rt_proc_ops);
 652 }
 653
 654 #else
 655 static inline int ip_rt_proc_init(void)
 656 {
 657         return 0;
 658 }
 659 #endif /* CONFIG_PROC_FS */
 660
 661 static inline void rt_free(struct rtable *rt)
 662 {
 663         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 664 }
 665
 666 static inline void rt_drop(struct rtable *rt)
 667 {
 668         ip_rt_put(rt);
 669         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 670 }
 671
 672 static inline int rt_fast_clean(struct rtable *rth)
 673 {
 674         /* Kill broadcast/multicast entries very aggresively, if they
 675            collide in hash table with more useful entries */
 676         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 677                 rt_is_input_route(rth) && rth->dst.rt_next;
 678 }
 679
 680 static inline int rt_valuable(struct rtable *rth)
 681 {
 682         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 683                 (rth->peer && rth->peer->pmtu_expires);
 684 }
 685
 686 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 687 {
 688         unsigned long age;
 689         int ret = 0;
 690
 691         if (atomic_read(&rth->dst.__refcnt))
 692                 goto out;
 693
 694         age = jiffies - rth->dst.lastuse;
 695         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 696             (age <= tmo2 && rt_valuable(rth)))
 697                 goto out;
 698         ret = 1;
 699 out:    return ret;
 700 }
 701
 702 /* Bits of score are:
 703  * 31: very valuable
 704  * 30: not quite useless
 705  * 29..0: usage counter
 706  */
 707 static inline u32 rt_score(struct rtable *rt)
 708 {
 709         u32 score = jiffies - rt->dst.lastuse;
 710
 711         score = ~score & ~(3<<30);
 712
 713         if (rt_valuable(rt))
 714                 score |= (1<<31);
 715
 716         if (rt_is_output_route(rt) ||
 717             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 718                 score |= (1<<30);
 719
 720         return score;
 721 }
 722
 723 static inline bool rt_caching(const struct net *net)
 724 {
 725         return net->ipv4.current_rt_cache_rebuild_count <=
 726                 net->ipv4.sysctl_rt_cache_rebuild_count;
 727 }
 728
 729 static inline bool compare_hash_inputs(const struct rtable *rt1,
 730                                        const struct rtable *rt2)
 731 {
 732         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 733                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 734                 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
 735 }
 736
 737 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
 738 {
 739         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 740                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 741                 (rt1->rt_mark ^ rt2->rt_mark) |
 742                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
 743                 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
 744                 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
 745 }
 746
 747 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 748 {
 749         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
 750 }
 751
 752 static inline int rt_is_expired(struct rtable *rth)
 753 {
 754         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 755 }
 756
 757 /*
 758  * Perform a full scan of hash table and free all entries.
 759  * Can be called by a softirq or a process.
 760  * In the later case, we want to be reschedule if necessary
 761  */
 762 static void rt_do_flush(struct net *net, int process_context)
 763 {
 764         unsigned int i;
 765         struct rtable *rth, *next;
 766
 767         for (i = 0; i <= rt_hash_mask; i++) {
 768                 struct rtable __rcu **pprev;
 769                 struct rtable *list;
 770
 771                 if (process_context && need_resched())
 772                         cond_resched();
 773                 rth = rcu_access_pointer(rt_hash_table[i].chain);
 774                 if (!rth)
 775                         continue;
 776
 777                 spin_lock_bh(rt_hash_lock_addr(i));
 778
 779                 list = NULL;
 780                 pprev = &rt_hash_table[i].chain;
 781                 rth = rcu_dereference_protected(*pprev,
 782                         lockdep_is_held(rt_hash_lock_addr(i)));
 783
 784                 while (rth) {
 785                         next = rcu_dereference_protected(rth->dst.rt_next,
 786                                 lockdep_is_held(rt_hash_lock_addr(i)));
 787
 788                         if (!net ||
 789                             net_eq(dev_net(rth->dst.dev), net)) {
 790                                 rcu_assign_pointer(*pprev, next);
 791                                 rcu_assign_pointer(rth->dst.rt_next, list);
 792                                 list = rth;
 793                         } else {
 794                                 pprev = &rth->dst.rt_next;
 795                         }
 796                         rth = next;
 797                 }
 798
 799                 spin_unlock_bh(rt_hash_lock_addr(i));
 800
 801                 for (; list; list = next) {
 802                         next = rcu_dereference_protected(list->dst.rt_next, 1);
 803                         rt_free(list);
 804                 }
 805         }
 806 }
 807
 808 /*
 809  * While freeing expired entries, we compute average chain length
 810  * and standard deviation, using fixed-point arithmetic.
 811  * This to have an estimation of rt_chain_length_max
 812  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 813  * We use 3 bits for frational part, and 29 (or 61) for magnitude.
 814  */
 815
 816 #define FRACT_BITS 3
 817 #define ONE (1UL << FRACT_BITS)
 818
 819 /*
 820  * Given a hash chain and an item in this hash chain,
 821  * find if a previous entry has the same hash_inputs
 822  * (but differs on tos, mark or oif)
 823  * Returns 0 if an alias is found.
 824  * Returns ONE if rth has no alias before itself.
 825  */
 826 static int has_noalias(const struct rtable *head, const struct rtable *rth)
 827 {
 828         const struct rtable *aux = head;
 829
 830         while (aux != rth) {
 831                 if (compare_hash_inputs(aux, rth))
 832                         return 0;
 833                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
 834         }
 835         return ONE;
 836 }
 837
 838 static void rt_check_expire(void)
 839 {
 840         static unsigned int rover;
 841         unsigned int i = rover, goal;
 842         struct rtable *rth;
 843         struct rtable __rcu **rthp;
 844         unsigned long samples = 0;
 845         unsigned long sum = 0, sum2 = 0;
 846         unsigned long delta;
 847         u64 mult;
 848
 849         delta = jiffies - expires_ljiffies;
 850         expires_ljiffies = jiffies;
 851         mult = ((u64)delta) << rt_hash_log;
 852         if (ip_rt_gc_timeout > 1)
 853                 do_div(mult, ip_rt_gc_timeout);
 854         goal = (unsigned int)mult;
 855         if (goal > rt_hash_mask)
 856                 goal = rt_hash_mask + 1;
 857         for (; goal > 0; goal--) {
 858                 unsigned long tmo = ip_rt_gc_timeout;
 859                 unsigned long length;
 860
 861                 i = (i + 1) & rt_hash_mask;
 862                 rthp = &rt_hash_table[i].chain;
 863
 864                 if (need_resched())
 865                         cond_resched();
 866
 867                 samples++;
 868
 869                 if (rcu_dereference_raw(*rthp) == NULL)
 870                         continue;
 871                 length = 0;
 872                 spin_lock_bh(rt_hash_lock_addr(i));
 873                 while ((rth = rcu_dereference_protected(*rthp,
 874                                         lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
 875                         prefetch(rth->dst.rt_next);
 876                         if (rt_is_expired(rth)) {
 877                                 *rthp = rth->dst.rt_next;
 878                                 rt_free(rth);
 879                                 continue;
 880                         }
 881                         if (rth->dst.expires) {
 882                                 /* Entry is expired even if it is in use */
 883                                 if (time_before_eq(jiffies, rth->dst.expires)) {
 884 nofree:
 885                                         tmo >>= 1;
 886                                         rthp = &rth->dst.rt_next;
 887                                         /*
 888                                          * We only count entries on
 889                                          * a chain with equal hash inputs once
 890                                          * so that entries for different QOS
 891                                          * levels, and other non-hash input
 892                                          * attributes don't unfairly skew
 893                                          * the length computation
 894                                          */
 895                                         length += has_noalias(rt_hash_table[i].chain, rth);
 896                                         continue;
 897                                 }
 898                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
 899                                 goto nofree;
 900
 901                         /* Cleanup aged off entries. */
 902                         *rthp = rth->dst.rt_next;
 903                         rt_free(rth);
 904                 }
 905                 spin_unlock_bh(rt_hash_lock_addr(i));
 906                 sum += length;
 907                 sum2 += length*length;
 908         }
 909         if (samples) {
 910                 unsigned long avg = sum / samples;
 911                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
 912                 rt_chain_length_max = max_t(unsigned long,
 913                                         ip_rt_gc_elasticity,
 914                                         (avg + 4*sd) >> FRACT_BITS);
 915         }
 916         rover = i;
 917 }
 918
 919 /*
 920  * rt_worker_func() is run in process context.
 921  * we call rt_check_expire() to scan part of the hash table
 922  */
 923 static void rt_worker_func(struct work_struct *work)
 924 {
 925         rt_check_expire();
 926         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 927 }
 928
 929 /*
 930  * Perturbation of rt_genid by a small quantity [1..256]
 931  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
 932  * many times (2^24) without giving recent rt_genid.
 933  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
 934  */
 935 static void rt_cache_invalidate(struct net *net)
 936 {
 937         unsigned char shuffle;
 938
 939         get_random_bytes(&shuffle, sizeof(shuffle));
 940         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 941         inetpeer_invalidate_tree(net, AF_INET);
 942 }
 943
 944 /*
 945  * delay < 0  : invalidate cache (fast : entries will be deleted later)
 946  * delay >= 0 : invalidate & flush cache (can be long)
 947  */
 948 void rt_cache_flush(struct net *net, int delay)
 949 {
 950         rt_cache_invalidate(net);
 951         if (delay >= 0)
 952                 rt_do_flush(net, !in_softirq());
 953 }
 954
 955 /* Flush previous cache invalidated entries from the cache */
 956 void rt_cache_flush_batch(struct net *net)
 957 {
 958         rt_do_flush(net, !in_softirq());
 959 }
 960
 961 static void rt_emergency_hash_rebuild(struct net *net)
 962 {
 963         net_warn_ratelimited("Route hash chain too long!\n");
 964         rt_cache_invalidate(net);
 965 }
 966
 967 /*
 968    Short description of GC goals.
 969
 970    We want to build algorithm, which will keep routing cache
 971    at some equilibrium point, when number of aged off entries
 972    is kept approximately equal to newly generated ones.
 973
 974    Current expiration strength is variable "expire".
 975    We try to adjust it dynamically, so that if networking
 976    is idle expires is large enough to keep enough of warm entries,
 977    and when load increases it reduces to limit cache size.
 978  */
 979
 980 static int rt_garbage_collect(struct dst_ops *ops)
 981 {
 982         static unsigned long expire = RT_GC_TIMEOUT;
 983         static unsigned long last_gc;
 984         static int rover;
 985         static int equilibrium;
 986         struct rtable *rth;
 987         struct rtable __rcu **rthp;
 988         unsigned long now = jiffies;
 989         int goal;
 990         int entries = dst_entries_get_fast(&ipv4_dst_ops);
 991
 992         /*
 993          * Garbage collection is pretty expensive,
 994          * do not make it too frequently.
 995          */
 996
 997         RT_CACHE_STAT_INC(gc_total);
 998
 999         if (now - last_gc < ip_rt_gc_min_interval &&
1000             entries < ip_rt_max_size) {
1001                 RT_CACHE_STAT_INC(gc_ignored);
1002                 goto out;
1003         }
1004
1005         entries = dst_entries_get_slow(&ipv4_dst_ops);
1006         /* Calculate number of entries, which we want to expire now. */
1007         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1008         if (goal <= 0) {
1009                 if (equilibrium < ipv4_dst_ops.gc_thresh)
1010                         equilibrium = ipv4_dst_ops.gc_thresh;
1011                 goal = entries - equilibrium;
1012                 if (goal > 0) {
1013                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1014                         goal = entries - equilibrium;
1015                 }
1016         } else {
1017                 /* We are in dangerous area. Try to reduce cache really
1018                  * aggressively.
1019                  */
1020                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1021                 equilibrium = entries - goal;
1022         }
1023
1024         if (now - last_gc >= ip_rt_gc_min_interval)
1025                 last_gc = now;
1026
1027         if (goal <= 0) {
1028                 equilibrium += goal;
1029                 goto work_done;
1030         }
1031
1032         do {
1033                 int i, k;
1034
1035                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1036                         unsigned long tmo = expire;
1037
1038                         k = (k + 1) & rt_hash_mask;
1039                         rthp = &rt_hash_table[k].chain;
1040                         spin_lock_bh(rt_hash_lock_addr(k));
1041                         while ((rth = rcu_dereference_protected(*rthp,
1042                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1043                                 if (!rt_is_expired(rth) &&
1044                                         !rt_may_expire(rth, tmo, expire)) {
1045                                         tmo >>= 1;
1046                                         rthp = &rth->dst.rt_next;
1047                                         continue;
1048                                 }
1049                                 *rthp = rth->dst.rt_next;
1050                                 rt_free(rth);
1051                                 goal--;
1052                         }
1053                         spin_unlock_bh(rt_hash_lock_addr(k));
1054                         if (goal <= 0)
1055                                 break;
1056                 }
1057                 rover = k;
1058
1059                 if (goal <= 0)
1060                         goto work_done;
1061
1062                 /* Goal is not achieved. We stop process if:
1063
1064                    - if expire reduced to zero. Otherwise, expire is halfed.
1065                    - if table is not full.
1066                    - if we are called from interrupt.
1067                    - jiffies check is just fallback/debug loop breaker.
1068                      We will not spin here for long time in any case.
1069                  */
1070
1071                 RT_CACHE_STAT_INC(gc_goal_miss);
1072
1073                 if (expire == 0)
1074                         break;
1075
1076                 expire >>= 1;
1077
1078                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1079                         goto out;
1080         } while (!in_softirq() && time_before_eq(jiffies, now));
1081
1082         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1083                 goto out;
1084         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1085                 goto out;
1086         net_warn_ratelimited("dst cache overflow\n");
1087         RT_CACHE_STAT_INC(gc_dst_overflow);
1088         return 1;
1089
1090 work_done:
1091         expire += ip_rt_gc_min_interval;
1092         if (expire > ip_rt_gc_timeout ||
1093             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1094             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1095                 expire = ip_rt_gc_timeout;
1096 out:    return 0;
1097 }
1098
1099 /*
1100  * Returns number of entries in a hash chain that have different hash_inputs
1101  */
1102 static int slow_chain_length(const struct rtable *head)
1103 {
1104         int length = 0;
1105         const struct rtable *rth = head;
1106
1107         while (rth) {
1108                 length += has_noalias(head, rth);
1109                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1110         }
1111         return length >> FRACT_BITS;
1112 }
1113
1114 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1115 {
1116         static const __be32 inaddr_any = 0;
1117         struct net_device *dev = dst->dev;
1118         const __be32 *pkey = daddr;
1119         const struct rtable *rt;
1120         struct neighbour *n;
1121
1122         rt = (const struct rtable *) dst;
1123
1124         if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1125                 pkey = &inaddr_any;
1126         else if (rt->rt_gateway)
1127                 pkey = (const __be32 *) &rt->rt_gateway;
1128
1129         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
1130         if (n)
1131                 return n;
1132         return neigh_create(&arp_tbl, pkey, dev);
1133 }
1134
1135 static int rt_bind_neighbour(struct rtable *rt)
1136 {
1137         struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1138         if (IS_ERR(n))
1139                 return PTR_ERR(n);
1140         dst_set_neighbour(&rt->dst, n);
1141
1142         return 0;
1143 }
1144
1145 static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
1146                                      struct sk_buff *skb, int ifindex)
1147 {
1148         struct rtable   *rth, *cand;
1149         struct rtable __rcu **rthp, **candp;
1150         unsigned long   now;
1151         u32             min_score;
1152         int             chain_length;
1153         int attempts = !in_softirq();
1154
1155 restart:
1156         chain_length = 0;
1157         min_score = ~(u32)0;
1158         cand = NULL;
1159         candp = NULL;
1160         now = jiffies;
1161
1162         if (!rt_caching(dev_net(rt->dst.dev))) {
1163                 /*
1164                  * If we're not caching, just tell the caller we
1165                  * were successful and don't touch the route.  The
1166                  * caller hold the sole reference to the cache entry, and
1167                  * it will be released when the caller is done with it.
1168                  * If we drop it here, the callers have no way to resolve routes
1169                  * when we're not caching.  Instead, just point *rp at rt, so
1170                  * the caller gets a single use out of the route
1171                  * Note that we do rt_free on this new route entry, so that
1172                  * once its refcount hits zero, we are still able to reap it
1173                  * (Thanks Alexey)
1174                  * Note: To avoid expensive rcu stuff for this uncached dst,
1175                  * we set DST_NOCACHE so that dst_release() can free dst without
1176                  * waiting a grace period.
1177                  */
1178
1179                 rt->dst.flags |= DST_NOCACHE;
1180                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1181                         int err = rt_bind_neighbour(rt);
1182                         if (err) {
1183                                 net_warn_ratelimited("Neighbour table failure & not caching routes\n");
1184                                 ip_rt_put(rt);
1185                                 return ERR_PTR(err);
1186                         }
1187                 }
1188
1189                 goto skip_hashing;
1190         }
1191
1192         rthp = &rt_hash_table[hash].chain;
1193
1194         spin_lock_bh(rt_hash_lock_addr(hash));
1195         while ((rth = rcu_dereference_protected(*rthp,
1196                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1197                 if (rt_is_expired(rth)) {
1198                         *rthp = rth->dst.rt_next;
1199                         rt_free(rth);
1200                         continue;
1201                 }
1202                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1203                         /* Put it first */
1204                         *rthp = rth->dst.rt_next;
1205                         /*
1206                          * Since lookup is lockfree, the deletion
1207                          * must be visible to another weakly ordered CPU before
1208                          * the insertion at the start of the hash chain.
1209                          */
1210                         rcu_assign_pointer(rth->dst.rt_next,
1211                                            rt_hash_table[hash].chain);
1212                         /*
1213                          * Since lookup is lockfree, the update writes
1214                          * must be ordered for consistency on SMP.
1215                          */
1216                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1217
1218                         dst_use(&rth->dst, now);
1219                         spin_unlock_bh(rt_hash_lock_addr(hash));
1220
1221                         rt_drop(rt);
1222                         if (skb)
1223                                 skb_dst_set(skb, &rth->dst);
1224                         return rth;
1225                 }
1226
1227                 if (!atomic_read(&rth->dst.__refcnt)) {
1228                         u32 score = rt_score(rth);
1229
1230                         if (score <= min_score) {
1231                                 cand = rth;
1232                                 candp = rthp;
1233                                 min_score = score;
1234                         }
1235                 }
1236
1237                 chain_length++;
1238
1239                 rthp = &rth->dst.rt_next;
1240         }
1241
1242         if (cand) {
1243                 /* ip_rt_gc_elasticity used to be average length of chain
1244                  * length, when exceeded gc becomes really aggressive.
1245                  *
1246                  * The second limit is less certain. At the moment it allows
1247                  * only 2 entries per bucket. We will see.
1248                  */
1249                 if (chain_length > ip_rt_gc_elasticity) {
1250                         *candp = cand->dst.rt_next;
1251                         rt_free(cand);
1252                 }
1253         } else {
1254                 if (chain_length > rt_chain_length_max &&
1255                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1256                         struct net *net = dev_net(rt->dst.dev);
1257                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1258                         if (!rt_caching(net)) {
1259                                 pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
1260                                         rt->dst.dev->name, num);
1261                         }
1262                         rt_emergency_hash_rebuild(net);
1263                         spin_unlock_bh(rt_hash_lock_addr(hash));
1264
1265                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1266                                         ifindex, rt_genid(net));
1267                         goto restart;
1268                 }
1269         }
1270
1271         /* Try to bind route to arp only if it is output
1272            route or unicast forwarding path.
1273          */
1274         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1275                 int err = rt_bind_neighbour(rt);
1276                 if (err) {
1277                         spin_unlock_bh(rt_hash_lock_addr(hash));
1278
1279                         if (err != -ENOBUFS) {
1280                                 rt_drop(rt);
1281                                 return ERR_PTR(err);
1282                         }
1283
1284                         /* Neighbour tables are full and nothing
1285                            can be released. Try to shrink route cache,
1286                            it is most likely it holds some neighbour records.
1287                          */
1288                         if (attempts-- > 0) {
1289                                 int saved_elasticity = ip_rt_gc_elasticity;
1290                                 int saved_int = ip_rt_gc_min_interval;
1291                                 ip_rt_gc_elasticity     = 1;
1292                                 ip_rt_gc_min_interval   = 0;
1293                                 rt_garbage_collect(&ipv4_dst_ops);
1294                                 ip_rt_gc_min_interval   = saved_int;
1295                                 ip_rt_gc_elasticity     = saved_elasticity;
1296                                 goto restart;
1297                         }
1298
1299                         net_warn_ratelimited("Neighbour table overflow\n");
1300                         rt_drop(rt);
1301                         return ERR_PTR(-ENOBUFS);
1302                 }
1303         }
1304
1305         rt->dst.rt_next = rt_hash_table[hash].chain;
1306
1307         /*
1308          * Since lookup is lockfree, we must make sure
1309          * previous writes to rt are committed to memory
1310          * before making rt visible to other CPUS.
1311          */
1312         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1313
1314         spin_unlock_bh(rt_hash_lock_addr(hash));
1315
1316 skip_hashing:
1317         if (skb)
1318                 skb_dst_set(skb, &rt->dst);
1319         return rt;
1320 }
1321
1322 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1323
1324 static u32 rt_peer_genid(void)
1325 {
1326         return atomic_read(&__rt_peer_genid);
1327 }
1328
1329 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1330 {
1331         struct net *net = dev_net(rt->dst.dev);
1332         struct inet_peer *peer;
1333
1334         peer = inet_getpeer_v4(net, daddr, create);
1335
1336         if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1337                 inet_putpeer(peer);
1338         else
1339                 rt->rt_peer_genid = rt_peer_genid();
1340 }
1341
1342 /*
1343  * Peer allocation may fail only in serious out-of-memory conditions.  However
1344  * we still can generate some output.
1345  * Random ID selection looks a bit dangerous because we have no chances to
1346  * select ID being unique in a reasonable period of time.
1347  * But broken packet identifier may be better than no packet at all.
1348  */
1349 static void ip_select_fb_ident(struct iphdr *iph)
1350 {
1351         static DEFINE_SPINLOCK(ip_fb_id_lock);
1352         static u32 ip_fallback_id;
1353         u32 salt;
1354
1355         spin_lock_bh(&ip_fb_id_lock);
1356         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1357         iph->id = htons(salt & 0xFFFF);
1358         ip_fallback_id = salt;
1359         spin_unlock_bh(&ip_fb_id_lock);
1360 }
1361
1362 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1363 {
1364         struct rtable *rt = (struct rtable *) dst;
1365
1366         if (rt && !(rt->dst.flags & DST_NOPEER)) {
1367                 if (rt->peer == NULL)
1368                         rt_bind_peer(rt, rt->rt_dst, 1);
1369
1370                 /* If peer is attached to destination, it is never detached,
1371                    so that we need not to grab a lock to dereference it.
1372                  */
1373                 if (rt->peer) {
1374                         iph->id = htons(inet_getid(rt->peer, more));
1375                         return;
1376                 }
1377         } else if (!rt)
1378                 pr_debug("rt_bind_peer(0) @%p\n", __builtin_return_address(0));
1379
1380         ip_select_fb_ident(iph);
1381 }
1382 EXPORT_SYMBOL(__ip_select_ident);
1383
1384 static void rt_del(unsigned int hash, struct rtable *rt)
1385 {
1386         struct rtable __rcu **rthp;
1387         struct rtable *aux;
1388
1389         rthp = &rt_hash_table[hash].chain;
1390         spin_lock_bh(rt_hash_lock_addr(hash));
1391         ip_rt_put(rt);
1392         while ((aux = rcu_dereference_protected(*rthp,
1393                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1394                 if (aux == rt || rt_is_expired(aux)) {
1395                         *rthp = aux->dst.rt_next;
1396                         rt_free(aux);
1397                         continue;
1398                 }
1399                 rthp = &aux->dst.rt_next;
1400         }
1401         spin_unlock_bh(rt_hash_lock_addr(hash));
1402 }
1403
1404 static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1405 {
1406         struct rtable *rt = (struct rtable *) dst;
1407         __be32 orig_gw = rt->rt_gateway;
1408         struct neighbour *n, *old_n;
1409
1410         dst_confirm(&rt->dst);
1411
1412         rt->rt_gateway = peer->redirect_learned.a4;
1413
1414         n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1415         if (IS_ERR(n)) {
1416                 rt->rt_gateway = orig_gw;
1417                 return;
1418         }
1419         old_n = xchg(&rt->dst._neighbour, n);
1420         if (old_n)
1421                 neigh_release(old_n);
1422         if (!(n->nud_state & NUD_VALID)) {
1423                 neigh_event_send(n, NULL);
1424         } else {
1425                 rt->rt_flags |= RTCF_REDIRECTED;
1426                 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1427         }
1428 }
1429
1430 /* called in rcu_read_lock() section */
1431 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1432                     __be32 saddr, struct net_device *dev)
1433 {
1434         int s, i;
1435         struct in_device *in_dev = __in_dev_get_rcu(dev);
1436         __be32 skeys[2] = { saddr, 0 };
1437         int    ikeys[2] = { dev->ifindex, 0 };
1438         struct inet_peer *peer;
1439         struct net *net;
1440
1441         if (!in_dev)
1442                 return;
1443
1444         net = dev_net(dev);
1445         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1446             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1447             ipv4_is_zeronet(new_gw))
1448                 goto reject_redirect;
1449
1450         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1451                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1452                         goto reject_redirect;
1453                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1454                         goto reject_redirect;
1455         } else {
1456                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1457                         goto reject_redirect;
1458         }
1459
1460         for (s = 0; s < 2; s++) {
1461                 for (i = 0; i < 2; i++) {
1462                         unsigned int hash;
1463                         struct rtable __rcu **rthp;
1464                         struct rtable *rt;
1465
1466                         hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1467
1468                         rthp = &rt_hash_table[hash].chain;
1469
1470                         while ((rt = rcu_dereference(*rthp)) != NULL) {
1471                                 rthp = &rt->dst.rt_next;
1472
1473                                 if (rt->rt_key_dst != daddr ||
1474                                     rt->rt_key_src != skeys[s] ||
1475                                     rt->rt_oif != ikeys[i] ||
1476                                     rt_is_input_route(rt) ||
1477                                     rt_is_expired(rt) ||
1478                                     !net_eq(dev_net(rt->dst.dev), net) ||
1479                                     rt->dst.error ||
1480                                     rt->dst.dev != dev ||
1481                                     rt->rt_gateway != old_gw)
1482                                         continue;
1483
1484                                 if (!rt->peer)
1485                                         rt_bind_peer(rt, rt->rt_dst, 1);
1486
1487                                 peer = rt->peer;
1488                                 if (peer) {
1489                                         if (peer->redirect_learned.a4 != new_gw) {
1490                                                 peer->redirect_learned.a4 = new_gw;
1491                                                 atomic_inc(&__rt_peer_genid);
1492                                         }
1493                                         check_peer_redir(&rt->dst, peer);
1494                                 }
1495                         }
1496                 }
1497         }
1498         return;
1499
1500 reject_redirect:
1501 #ifdef CONFIG_IP_ROUTE_VERBOSE
1502         if (IN_DEV_LOG_MARTIANS(in_dev))
1503                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
1504                                      "  Advised path = %pI4 -> %pI4\n",
1505                                      &old_gw, dev->name, &new_gw,
1506                                      &saddr, &daddr);
1507 #endif
1508         ;
1509 }
1510
1511 static bool peer_pmtu_expired(struct inet_peer *peer)
1512 {
1513         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1514
1515         return orig &&
1516                time_after_eq(jiffies, orig) &&
1517                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1518 }
1519
1520 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1521 {
1522         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1523
1524         return orig &&
1525                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1526 }
1527
1528 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1529 {
1530         struct rtable *rt = (struct rtable *)dst;
1531         struct dst_entry *ret = dst;
1532
1533         if (rt) {
1534                 if (dst->obsolete > 0) {
1535                         ip_rt_put(rt);
1536                         ret = NULL;
1537                 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1538                         unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1539                                                 rt->rt_oif,
1540                                                 rt_genid(dev_net(dst->dev)));
1541                         rt_del(hash, rt);
1542                         ret = NULL;
1543                 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1544                         dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1545                 }
1546         }
1547         return ret;
1548 }
1549
1550 /*
1551  * Algorithm:
1552  *      1. The first ip_rt_redirect_number redirects are sent
1553  *         with exponential backoff, then we stop sending them at all,
1554  *         assuming that the host ignores our redirects.
1555  *      2. If we did not see packets requiring redirects
1556  *         during ip_rt_redirect_silence, we assume that the host
1557  *         forgot redirected route and start to send redirects again.
1558  *
1559  * This algorithm is much cheaper and more intelligent than dumb load limiting
1560  * in icmp.c.
1561  *
1562  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1563  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1564  */
1565
1566 void ip_rt_send_redirect(struct sk_buff *skb)
1567 {
1568         struct rtable *rt = skb_rtable(skb);
1569         struct in_device *in_dev;
1570         struct inet_peer *peer;
1571         int log_martians;
1572
1573         rcu_read_lock();
1574         in_dev = __in_dev_get_rcu(rt->dst.dev);
1575         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1576                 rcu_read_unlock();
1577                 return;
1578         }
1579         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1580         rcu_read_unlock();
1581
1582         if (!rt->peer)
1583                 rt_bind_peer(rt, rt->rt_dst, 1);
1584         peer = rt->peer;
1585         if (!peer) {
1586                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1587                 return;
1588         }
1589
1590         /* No redirected packets during ip_rt_redirect_silence;
1591          * reset the algorithm.
1592          */
1593         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1594                 peer->rate_tokens = 0;
1595
1596         /* Too many ignored redirects; do not send anything
1597          * set dst.rate_last to the last seen redirected packet.
1598          */
1599         if (peer->rate_tokens >= ip_rt_redirect_number) {
1600                 peer->rate_last = jiffies;
1601                 return;
1602         }
1603
1604         /* Check for load limit; set rate_last to the latest sent
1605          * redirect.
1606          */
1607         if (peer->rate_tokens == 0 ||
1608             time_after(jiffies,
1609                        (peer->rate_last +
1610                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1611                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1612                 peer->rate_last = jiffies;
1613                 ++peer->rate_tokens;
1614 #ifdef CONFIG_IP_ROUTE_VERBOSE
1615                 if (log_martians &&
1616                     peer->rate_tokens == ip_rt_redirect_number)
1617                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
1618                                              &ip_hdr(skb)->saddr, rt->rt_iif,
1619                                              &rt->rt_dst, &rt->rt_gateway);
1620 #endif
1621         }
1622 }
1623
1624 static int ip_error(struct sk_buff *skb)
1625 {
1626         struct rtable *rt = skb_rtable(skb);
1627         struct inet_peer *peer;
1628         unsigned long now;
1629         bool send;
1630         int code;
1631
1632         switch (rt->dst.error) {
1633         case EINVAL:
1634         default:
1635                 goto out;
1636         case EHOSTUNREACH:
1637                 code = ICMP_HOST_UNREACH;
1638                 break;
1639         case ENETUNREACH:
1640                 code = ICMP_NET_UNREACH;
1641                 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1642                                 IPSTATS_MIB_INNOROUTES);
1643                 break;
1644         case EACCES:
1645                 code = ICMP_PKT_FILTERED;
1646                 break;
1647         }
1648
1649         if (!rt->peer)
1650                 rt_bind_peer(rt, rt->rt_dst, 1);
1651         peer = rt->peer;
1652
1653         send = true;
1654         if (peer) {
1655                 now = jiffies;
1656                 peer->rate_tokens += now - peer->rate_last;
1657                 if (peer->rate_tokens > ip_rt_error_burst)
1658                         peer->rate_tokens = ip_rt_error_burst;
1659                 peer->rate_last = now;
1660                 if (peer->rate_tokens >= ip_rt_error_cost)
1661                         peer->rate_tokens -= ip_rt_error_cost;
1662                 else
1663                         send = false;
1664         }
1665         if (send)
1666                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1667
1668 out:    kfree_skb(skb);
1669         return 0;
1670 }
1671
1672 /*
1673  *      The last two values are not from the RFC but
1674  *      are needed for AMPRnet AX.25 paths.
1675  */
1676
1677 static const unsigned short mtu_plateau[] =
1678 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1679
1680 static inline unsigned short guess_mtu(unsigned short old_mtu)
1681 {
1682         int i;
1683
1684         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1685                 if (old_mtu > mtu_plateau[i])
1686                         return mtu_plateau[i];
1687         return 68;
1688 }
1689
1690 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1691                                  unsigned short new_mtu,
1692                                  struct net_device *dev)
1693 {
1694         unsigned short old_mtu = ntohs(iph->tot_len);
1695         unsigned short est_mtu = 0;
1696         struct inet_peer *peer;
1697
1698         peer = inet_getpeer_v4(net, iph->daddr, 1);
1699         if (peer) {
1700                 unsigned short mtu = new_mtu;
1701
1702                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1703                         /* BSD 4.2 derived systems incorrectly adjust
1704                          * tot_len by the IP header length, and report
1705                          * a zero MTU in the ICMP message.
1706                          */
1707                         if (mtu == 0 &&
1708                             old_mtu >= 68 + (iph->ihl << 2))
1709                                 old_mtu -= iph->ihl << 2;
1710                         mtu = guess_mtu(old_mtu);
1711                 }
1712
1713                 if (mtu < ip_rt_min_pmtu)
1714                         mtu = ip_rt_min_pmtu;
1715                 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1716                         unsigned long pmtu_expires;
1717
1718                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1719                         if (!pmtu_expires)
1720                                 pmtu_expires = 1UL;
1721
1722                         est_mtu = mtu;
1723                         peer->pmtu_learned = mtu;
1724                         peer->pmtu_expires = pmtu_expires;
1725                         atomic_inc(&__rt_peer_genid);
1726                 }
1727
1728                 inet_putpeer(peer);
1729         }
1730         return est_mtu ? : new_mtu;
1731 }
1732
1733 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1734 {
1735         unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1736
1737         if (!expires)
1738                 return;
1739         if (time_before(jiffies, expires)) {
1740                 u32 orig_dst_mtu = dst_mtu(dst);
1741                 if (peer->pmtu_learned < orig_dst_mtu) {
1742                         if (!peer->pmtu_orig)
1743                                 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1744                         dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1745                 }
1746         } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1747                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1748 }
1749
1750 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1751 {
1752         struct rtable *rt = (struct rtable *) dst;
1753         struct inet_peer *peer;
1754
1755         dst_confirm(dst);
1756
1757         if (!rt->peer)
1758                 rt_bind_peer(rt, rt->rt_dst, 1);
1759         peer = rt->peer;
1760         if (peer) {
1761                 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1762
1763                 if (mtu < ip_rt_min_pmtu)
1764                         mtu = ip_rt_min_pmtu;
1765                 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1766
1767                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1768                         if (!pmtu_expires)
1769                                 pmtu_expires = 1UL;
1770
1771                         peer->pmtu_learned = mtu;
1772                         peer->pmtu_expires = pmtu_expires;
1773
1774                         atomic_inc(&__rt_peer_genid);
1775                         rt->rt_peer_genid = rt_peer_genid();
1776                 }
1777                 check_peer_pmtu(dst, peer);
1778         }
1779 }
1780
1781
1782 static void ipv4_validate_peer(struct rtable *rt)
1783 {
1784         if (rt->rt_peer_genid != rt_peer_genid()) {
1785                 struct inet_peer *peer;
1786
1787                 if (!rt->peer)
1788                         rt_bind_peer(rt, rt->rt_dst, 0);
1789
1790                 peer = rt->peer;
1791                 if (peer) {
1792                         check_peer_pmtu(&rt->dst, peer);
1793
1794                         if (peer->redirect_learned.a4 &&
1795                             peer->redirect_learned.a4 != rt->rt_gateway)
1796                                 check_peer_redir(&rt->dst, peer);
1797                 }
1798
1799                 rt->rt_peer_genid = rt_peer_genid();
1800         }
1801 }
1802
1803 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1804 {
1805         struct rtable *rt = (struct rtable *) dst;
1806
1807         if (rt_is_expired(rt))
1808                 return NULL;
1809         ipv4_validate_peer(rt);
1810         return dst;
1811 }
1812
1813 static void ipv4_dst_destroy(struct dst_entry *dst)
1814 {
1815         struct rtable *rt = (struct rtable *) dst;
1816         struct inet_peer *peer = rt->peer;
1817
1818         if (rt->fi) {
1819                 fib_info_put(rt->fi);
1820                 rt->fi = NULL;
1821         }
1822         if (peer) {
1823                 rt->peer = NULL;
1824                 inet_putpeer(peer);
1825         }
1826 }
1827
1828
1829 static void ipv4_link_failure(struct sk_buff *skb)
1830 {
1831         struct rtable *rt;
1832
1833         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1834
1835         rt = skb_rtable(skb);
1836         if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1837                 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1838 }
1839
1840 static int ip_rt_bug(struct sk_buff *skb)
1841 {
1842         pr_debug("%s: %pI4 -> %pI4, %s\n",
1843                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1844                  skb->dev ? skb->dev->name : "?");
1845         kfree_skb(skb);
1846         WARN_ON(1);
1847         return 0;
1848 }
1849
1850 /*
1851    We do not cache source address of outgoing interface,
1852    because it is used only by IP RR, TS and SRR options,
1853    so that it out of fast path.
1854
1855    BTW remember: "addr" is allowed to be not aligned
1856    in IP options!
1857  */
1858
1859 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1860 {
1861         __be32 src;
1862
1863         if (rt_is_output_route(rt))
1864                 src = ip_hdr(skb)->saddr;
1865         else {
1866                 struct fib_result res;
1867                 struct flowi4 fl4;
1868                 struct iphdr *iph;
1869
1870                 iph = ip_hdr(skb);
1871
1872                 memset(&fl4, 0, sizeof(fl4));
1873                 fl4.daddr = iph->daddr;
1874                 fl4.saddr = iph->saddr;
1875                 fl4.flowi4_tos = RT_TOS(iph->tos);
1876                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1877                 fl4.flowi4_iif = skb->dev->ifindex;
1878                 fl4.flowi4_mark = skb->mark;
1879
1880                 rcu_read_lock();
1881                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1882                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1883                 else
1884                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1885                                         RT_SCOPE_UNIVERSE);
1886                 rcu_read_unlock();
1887         }
1888         memcpy(addr, &src, 4);
1889 }
1890
1891 #ifdef CONFIG_IP_ROUTE_CLASSID
1892 static void set_class_tag(struct rtable *rt, u32 tag)
1893 {
1894         if (!(rt->dst.tclassid & 0xFFFF))
1895                 rt->dst.tclassid |= tag & 0xFFFF;
1896         if (!(rt->dst.tclassid & 0xFFFF0000))
1897                 rt->dst.tclassid |= tag & 0xFFFF0000;
1898 }
1899 #endif
1900
1901 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1902 {
1903         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1904
1905         if (advmss == 0) {
1906                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1907                                ip_rt_min_advmss);
1908                 if (advmss > 65535 - 40)
1909                         advmss = 65535 - 40;
1910         }
1911         return advmss;
1912 }
1913
1914 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1915 {
1916         const struct rtable *rt = (const struct rtable *) dst;
1917         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1918
1919         if (mtu && rt_is_output_route(rt))
1920                 return mtu;
1921
1922         mtu = dst->dev->mtu;
1923
1924         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1925
1926                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1927                         mtu = 576;
1928         }
1929
1930         if (mtu > IP_MAX_MTU)
1931                 mtu = IP_MAX_MTU;
1932
1933         return mtu;
1934 }
1935
1936 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1937                             struct fib_info *fi)
1938 {
1939         struct net *net = dev_net(rt->dst.dev);
1940         struct inet_peer *peer;
1941         int create = 0;
1942
1943         /* If a peer entry exists for this destination, we must hook
1944          * it up in order to get at cached metrics.
1945          */
1946         if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1947                 create = 1;
1948
1949         rt->peer = peer = inet_getpeer_v4(net, rt->rt_dst, create);
1950         if (peer) {
1951                 rt->rt_peer_genid = rt_peer_genid();
1952                 if (inet_metrics_new(peer))
1953                         memcpy(peer->metrics, fi->fib_metrics,
1954                                sizeof(u32) * RTAX_MAX);
1955                 dst_init_metrics(&rt->dst, peer->metrics, false);
1956
1957                 check_peer_pmtu(&rt->dst, peer);
1958
1959                 if (peer->redirect_learned.a4 &&
1960                     peer->redirect_learned.a4 != rt->rt_gateway) {
1961                         rt->rt_gateway = peer->redirect_learned.a4;
1962                         rt->rt_flags |= RTCF_REDIRECTED;
1963                 }
1964         } else {
1965                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1966                         rt->fi = fi;
1967                         atomic_inc(&fi->fib_clntref);
1968                 }
1969                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1970         }
1971 }
1972
1973 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1974                            const struct fib_result *res,
1975                            struct fib_info *fi, u16 type, u32 itag)
1976 {
1977         struct dst_entry *dst = &rt->dst;
1978
1979         if (fi) {
1980                 if (FIB_RES_GW(*res) &&
1981                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1982                         rt->rt_gateway = FIB_RES_GW(*res);
1983                 rt_init_metrics(rt, fl4, fi);
1984 #ifdef CONFIG_IP_ROUTE_CLASSID
1985                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1986 #endif
1987         }
1988
1989         if (dst_mtu(dst) > IP_MAX_MTU)
1990                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1991         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1992                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1993
1994 #ifdef CONFIG_IP_ROUTE_CLASSID
1995 #ifdef CONFIG_IP_MULTIPLE_TABLES
1996         set_class_tag(rt, fib_rules_tclass(res));
1997 #endif
1998         set_class_tag(rt, itag);
1999 #endif
2000 }
2001
2002 static struct rtable *rt_dst_alloc(struct net_device *dev,
2003                                    bool nopolicy, bool noxfrm)
2004 {
2005         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
2006                          DST_HOST |
2007                          (nopolicy ? DST_NOPOLICY : 0) |
2008                          (noxfrm ? DST_NOXFRM : 0));
2009 }
2010
2011 /* called in rcu_read_lock() section */
2012 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2013                                 u8 tos, struct net_device *dev, int our)
2014 {
2015         unsigned int hash;
2016         struct rtable *rth;
2017         __be32 spec_dst;
2018         struct in_device *in_dev = __in_dev_get_rcu(dev);
2019         u32 itag = 0;
2020         int err;
2021
2022         /* Primary sanity checks. */
2023
2024         if (in_dev == NULL)
2025                 return -EINVAL;
2026
2027         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2028             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
2029                 goto e_inval;
2030
2031         if (ipv4_is_zeronet(saddr)) {
2032                 if (!ipv4_is_local_multicast(daddr))
2033                         goto e_inval;
2034                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2035         } else {
2036                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2037                                           &itag);
2038                 if (err < 0)
2039                         goto e_err;
2040         }
2041         rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
2042                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2043         if (!rth)
2044                 goto e_nobufs;
2045
2046 #ifdef CONFIG_IP_ROUTE_CLASSID
2047         rth->dst.tclassid = itag;
2048 #endif
2049         rth->dst.output = ip_rt_bug;
2050
2051         rth->rt_key_dst = daddr;
2052         rth->rt_key_src = saddr;
2053         rth->rt_genid   = rt_genid(dev_net(dev));
2054         rth->rt_flags   = RTCF_MULTICAST;
2055         rth->rt_type    = RTN_MULTICAST;
2056         rth->rt_key_tos = tos;
2057         rth->rt_dst     = daddr;
2058         rth->rt_src     = saddr;
2059         rth->rt_route_iif = dev->ifindex;
2060         rth->rt_iif     = dev->ifindex;
2061         rth->rt_oif     = 0;
2062         rth->rt_mark    = skb->mark;
2063         rth->rt_gateway = daddr;
2064         rth->rt_spec_dst= spec_dst;
2065         rth->rt_peer_genid = 0;
2066         rth->peer = NULL;
2067         rth->fi = NULL;
2068         if (our) {
2069                 rth->dst.input= ip_local_deliver;
2070                 rth->rt_flags |= RTCF_LOCAL;
2071         }
2072
2073 #ifdef CONFIG_IP_MROUTE
2074         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2075                 rth->dst.input = ip_mr_input;
2076 #endif
2077         RT_CACHE_STAT_INC(in_slow_mc);
2078
2079         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2080         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2081         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2082
2083 e_nobufs:
2084         return -ENOBUFS;
2085 e_inval:
2086         return -EINVAL;
2087 e_err:
2088         return err;
2089 }
2090
2091
2092 static void ip_handle_martian_source(struct net_device *dev,
2093                                      struct in_device *in_dev,
2094                                      struct sk_buff *skb,
2095                                      __be32 daddr,
2096                                      __be32 saddr)
2097 {
2098         RT_CACHE_STAT_INC(in_martian_src);
2099 #ifdef CONFIG_IP_ROUTE_VERBOSE
2100         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2101                 /*
2102                  *      RFC1812 recommendation, if source is martian,
2103                  *      the only hint is MAC header.
2104                  */
2105                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
2106                         &daddr, &saddr, dev->name);
2107                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2108                         print_hex_dump(KERN_WARNING, "ll header: ",
2109                                        DUMP_PREFIX_OFFSET, 16, 1,
2110                                        skb_mac_header(skb),
2111                                        dev->hard_header_len, true);
2112                 }
2113         }
2114 #endif
2115 }
2116
2117 /* called in rcu_read_lock() section */
2118 static int __mkroute_input(struct sk_buff *skb,
2119                            const struct fib_result *res,
2120                            struct in_device *in_dev,
2121                            __be32 daddr, __be32 saddr, u32 tos,
2122                            struct rtable **result)
2123 {
2124         struct rtable *rth;
2125         int err;
2126         struct in_device *out_dev;
2127         unsigned int flags = 0;
2128         __be32 spec_dst;
2129         u32 itag;
2130
2131         /* get a working reference to the output device */
2132         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2133         if (out_dev == NULL) {
2134                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
2135                 return -EINVAL;
2136         }
2137
2138
2139         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2140                                   in_dev->dev, &spec_dst, &itag);
2141         if (err < 0) {
2142                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2143                                          saddr);
2144
2145                 goto cleanup;
2146         }
2147
2148         if (err)
2149                 flags |= RTCF_DIRECTSRC;
2150
2151         if (out_dev == in_dev && err &&
2152             (IN_DEV_SHARED_MEDIA(out_dev) ||
2153              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2154                 flags |= RTCF_DOREDIRECT;
2155
2156         if (skb->protocol != htons(ETH_P_IP)) {
2157                 /* Not IP (i.e. ARP). Do not create route, if it is
2158                  * invalid for proxy arp. DNAT routes are always valid.
2159                  *
2160                  * Proxy arp feature have been extended to allow, ARP
2161                  * replies back to the same interface, to support
2162                  * Private VLAN switch technologies. See arp.c.
2163                  */
2164                 if (out_dev == in_dev &&
2165                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2166                         err = -EINVAL;
2167                         goto cleanup;
2168                 }
2169         }
2170
2171         rth = rt_dst_alloc(out_dev->dev,
2172                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2173                            IN_DEV_CONF_GET(out_dev, NOXFRM));
2174         if (!rth) {
2175                 err = -ENOBUFS;
2176                 goto cleanup;
2177         }
2178
2179         rth->rt_key_dst = daddr;
2180         rth->rt_key_src = saddr;
2181         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2182         rth->rt_flags = flags;
2183         rth->rt_type = res->type;
2184         rth->rt_key_tos = tos;
2185         rth->rt_dst     = daddr;
2186         rth->rt_src     = saddr;
2187         rth->rt_route_iif = in_dev->dev->ifindex;
2188         rth->rt_iif     = in_dev->dev->ifindex;
2189         rth->rt_oif     = 0;
2190         rth->rt_mark    = skb->mark;
2191         rth->rt_gateway = daddr;
2192         rth->rt_spec_dst= spec_dst;
2193         rth->rt_peer_genid = 0;
2194         rth->peer = NULL;
2195         rth->fi = NULL;
2196
2197         rth->dst.input = ip_forward;
2198         rth->dst.output = ip_output;
2199
2200         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2201
2202         *result = rth;
2203         err = 0;
2204  cleanup:
2205         return err;
2206 }
2207
2208 static int ip_mkroute_input(struct sk_buff *skb,
2209                             struct fib_result *res,
2210                             const struct flowi4 *fl4,
2211                             struct in_device *in_dev,
2212                             __be32 daddr, __be32 saddr, u32 tos)
2213 {
2214         struct rtable *rth = NULL;
2215         int err;
2216         unsigned int hash;
2217
2218 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2219         if (res->fi && res->fi->fib_nhs > 1)
2220                 fib_select_multipath(res);
2221 #endif
2222
2223         /* create a routing cache entry */
2224         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2225         if (err)
2226                 return err;
2227
2228         /* put it into the cache */
2229         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2230                        rt_genid(dev_net(rth->dst.dev)));
2231         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2232         if (IS_ERR(rth))
2233                 return PTR_ERR(rth);
2234         return 0;
2235 }
2236
2237 /*
2238  *      NOTE. We drop all the packets that has local source
2239  *      addresses, because every properly looped back packet
2240  *      must have correct destination already attached by output routine.
2241  *
2242  *      Such approach solves two big problems:
2243  *      1. Not simplex devices are handled properly.
2244  *      2. IP spoofing attempts are filtered with 100% of guarantee.
2245  *      called with rcu_read_lock()
2246  */
2247
2248 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2249                                u8 tos, struct net_device *dev)
2250 {
2251         struct fib_result res;
2252         struct in_device *in_dev = __in_dev_get_rcu(dev);
2253         struct flowi4   fl4;
2254         unsigned int    flags = 0;
2255         u32             itag = 0;
2256         struct rtable   *rth;
2257         unsigned int    hash;
2258         __be32          spec_dst;
2259         int             err = -EINVAL;
2260         struct net    *net = dev_net(dev);
2261
2262         /* IP on this device is disabled. */
2263
2264         if (!in_dev)
2265                 goto out;
2266
2267         /* Check for the most weird martians, which can be not detected
2268            by fib_lookup.
2269          */
2270
2271         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2272             ipv4_is_loopback(saddr))
2273                 goto martian_source;
2274
2275         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2276                 goto brd_input;
2277
2278         /* Accept zero addresses only to limited broadcast;
2279          * I even do not know to fix it or not. Waiting for complains :-)
2280          */
2281         if (ipv4_is_zeronet(saddr))
2282                 goto martian_source;
2283
2284         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2285                 goto martian_destination;
2286
2287         /*
2288          *      Now we are ready to route packet.
2289          */
2290         fl4.flowi4_oif = 0;
2291         fl4.flowi4_iif = dev->ifindex;
2292         fl4.flowi4_mark = skb->mark;
2293         fl4.flowi4_tos = tos;
2294         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2295         fl4.daddr = daddr;
2296         fl4.saddr = saddr;
2297         err = fib_lookup(net, &fl4, &res);
2298         if (err != 0) {
2299                 if (!IN_DEV_FORWARD(in_dev))
2300                         goto e_hostunreach;
2301                 goto no_route;
2302         }
2303
2304         RT_CACHE_STAT_INC(in_slow_tot);
2305
2306         if (res.type == RTN_BROADCAST)
2307                 goto brd_input;
2308
2309         if (res.type == RTN_LOCAL) {
2310                 err = fib_validate_source(skb, saddr, daddr, tos,
2311                                           net->loopback_dev->ifindex,
2312                                           dev, &spec_dst, &itag);
2313                 if (err < 0)
2314                         goto martian_source_keep_err;
2315                 if (err)
2316                         flags |= RTCF_DIRECTSRC;
2317                 spec_dst = daddr;
2318                 goto local_input;
2319         }
2320
2321         if (!IN_DEV_FORWARD(in_dev))
2322                 goto e_hostunreach;
2323         if (res.type != RTN_UNICAST)
2324                 goto martian_destination;
2325
2326         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2327 out:    return err;
2328
2329 brd_input:
2330         if (skb->protocol != htons(ETH_P_IP))
2331                 goto e_inval;
2332
2333         if (ipv4_is_zeronet(saddr))
2334                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2335         else {
2336                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2337                                           &itag);
2338                 if (err < 0)
2339                         goto martian_source_keep_err;
2340                 if (err)
2341                         flags |= RTCF_DIRECTSRC;
2342         }
2343         flags |= RTCF_BROADCAST;
2344         res.type = RTN_BROADCAST;
2345         RT_CACHE_STAT_INC(in_brd);
2346
2347 local_input:
2348         rth = rt_dst_alloc(net->loopback_dev,
2349                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2350         if (!rth)
2351                 goto e_nobufs;
2352
2353         rth->dst.input= ip_local_deliver;
2354         rth->dst.output= ip_rt_bug;
2355 #ifdef CONFIG_IP_ROUTE_CLASSID
2356         rth->dst.tclassid = itag;
2357 #endif
2358
2359         rth->rt_key_dst = daddr;
2360         rth->rt_key_src = saddr;
2361         rth->rt_genid = rt_genid(net);
2362         rth->rt_flags   = flags|RTCF_LOCAL;
2363         rth->rt_type    = res.type;
2364         rth->rt_key_tos = tos;
2365         rth->rt_dst     = daddr;
2366         rth->rt_src     = saddr;
2367 #ifdef CONFIG_IP_ROUTE_CLASSID
2368         rth->dst.tclassid = itag;
2369 #endif
2370         rth->rt_route_iif = dev->ifindex;
2371         rth->rt_iif     = dev->ifindex;
2372         rth->rt_oif     = 0;
2373         rth->rt_mark    = skb->mark;
2374         rth->rt_gateway = daddr;
2375         rth->rt_spec_dst= spec_dst;
2376         rth->rt_peer_genid = 0;
2377         rth->peer = NULL;
2378         rth->fi = NULL;
2379         if (res.type == RTN_UNREACHABLE) {
2380                 rth->dst.input= ip_error;
2381                 rth->dst.error= -err;
2382                 rth->rt_flags   &= ~RTCF_LOCAL;
2383         }
2384         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2385         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2386         err = 0;
2387         if (IS_ERR(rth))
2388                 err = PTR_ERR(rth);
2389         goto out;
2390
2391 no_route:
2392         RT_CACHE_STAT_INC(in_no_route);
2393         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2394         res.type = RTN_UNREACHABLE;
2395         if (err == -ESRCH)
2396                 err = -ENETUNREACH;
2397         goto local_input;
2398
2399         /*
2400          *      Do not cache martian addresses: they should be logged (RFC1812)
2401          */
2402 martian_destination:
2403         RT_CACHE_STAT_INC(in_martian_dst);
2404 #ifdef CONFIG_IP_ROUTE_VERBOSE
2405         if (IN_DEV_LOG_MARTIANS(in_dev))
2406                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2407                                      &daddr, &saddr, dev->name);
2408 #endif
2409
2410 e_hostunreach:
2411         err = -EHOSTUNREACH;
2412         goto out;
2413
2414 e_inval:
2415         err = -EINVAL;
2416         goto out;
2417
2418 e_nobufs:
2419         err = -ENOBUFS;
2420         goto out;
2421
2422 martian_source:
2423         err = -EINVAL;
2424 martian_source_keep_err:
2425         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2426         goto out;
2427 }
2428
2429 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2430                            u8 tos, struct net_device *dev, bool noref)
2431 {
2432         struct rtable   *rth;
2433         unsigned int    hash;
2434         int iif = dev->ifindex;
2435         struct net *net;
2436         int res;
2437
2438         net = dev_net(dev);
2439
2440         rcu_read_lock();
2441
2442         if (!rt_caching(net))
2443                 goto skip_cache;
2444
2445         tos &= IPTOS_RT_MASK;
2446         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2447
2448         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2449              rth = rcu_dereference(rth->dst.rt_next)) {
2450                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2451                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2452                      (rth->rt_route_iif ^ iif) |
2453                      (rth->rt_key_tos ^ tos)) == 0 &&
2454                     rth->rt_mark == skb->mark &&
2455                     net_eq(dev_net(rth->dst.dev), net) &&
2456                     !rt_is_expired(rth)) {
2457                         ipv4_validate_peer(rth);
2458                         if (noref) {
2459                                 dst_use_noref(&rth->dst, jiffies);
2460                                 skb_dst_set_noref(skb, &rth->dst);
2461                         } else {
2462                                 dst_use(&rth->dst, jiffies);
2463                                 skb_dst_set(skb, &rth->dst);
2464                         }
2465                         RT_CACHE_STAT_INC(in_hit);
2466                         rcu_read_unlock();
2467                         return 0;
2468                 }
2469                 RT_CACHE_STAT_INC(in_hlist_search);
2470         }
2471
2472 skip_cache:
2473         /* Multicast recognition logic is moved from route cache to here.
2474            The problem was that too many Ethernet cards have broken/missing
2475            hardware multicast filters :-( As result the host on multicasting
2476            network acquires a lot of useless route cache entries, sort of
2477            SDR messages from all the world. Now we try to get rid of them.
2478            Really, provided software IP multicast filter is organized
2479            reasonably (at least, hashed), it does not result in a slowdown
2480            comparing with route cache reject entries.
2481            Note, that multicast routers are not affected, because
2482            route cache entry is created eventually.
2483          */
2484         if (ipv4_is_multicast(daddr)) {
2485                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2486
2487                 if (in_dev) {
2488                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2489                                                   ip_hdr(skb)->protocol);
2490                         if (our
2491 #ifdef CONFIG_IP_MROUTE
2492                                 ||
2493                             (!ipv4_is_local_multicast(daddr) &&
2494                              IN_DEV_MFORWARD(in_dev))
2495 #endif
2496                            ) {
2497                                 int res = ip_route_input_mc(skb, daddr, saddr,
2498                                                             tos, dev, our);
2499                                 rcu_read_unlock();
2500                                 return res;
2501                         }
2502                 }
2503                 rcu_read_unlock();
2504                 return -EINVAL;
2505         }
2506         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2507         rcu_read_unlock();
2508         return res;
2509 }
2510 EXPORT_SYMBOL(ip_route_input_common);
2511
2512 /* called with rcu_read_lock() */
2513 static struct rtable *__mkroute_output(const struct fib_result *res,
2514                                        const struct flowi4 *fl4,
2515                                        __be32 orig_daddr, __be32 orig_saddr,
2516                                        int orig_oif, __u8 orig_rtos,
2517                                        struct net_device *dev_out,
2518                                        unsigned int flags)
2519 {
2520         struct fib_info *fi = res->fi;
2521         struct in_device *in_dev;
2522         u16 type = res->type;
2523         struct rtable *rth;
2524
2525         if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2526                 return ERR_PTR(-EINVAL);
2527
2528         if (ipv4_is_lbcast(fl4->daddr))
2529                 type = RTN_BROADCAST;
2530         else if (ipv4_is_multicast(fl4->daddr))
2531                 type = RTN_MULTICAST;
2532         else if (ipv4_is_zeronet(fl4->daddr))
2533                 return ERR_PTR(-EINVAL);
2534
2535         if (dev_out->flags & IFF_LOOPBACK)
2536                 flags |= RTCF_LOCAL;
2537
2538         in_dev = __in_dev_get_rcu(dev_out);
2539         if (!in_dev)
2540                 return ERR_PTR(-EINVAL);
2541
2542         if (type == RTN_BROADCAST) {
2543                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2544                 fi = NULL;
2545         } else if (type == RTN_MULTICAST) {
2546                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2547                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2548                                      fl4->flowi4_proto))
2549                         flags &= ~RTCF_LOCAL;
2550                 /* If multicast route do not exist use
2551                  * default one, but do not gateway in this case.
2552                  * Yes, it is hack.
2553                  */
2554                 if (fi && res->prefixlen < 4)
2555                         fi = NULL;
2556         }
2557
2558         rth = rt_dst_alloc(dev_out,
2559                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2560                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2561         if (!rth)
2562                 return ERR_PTR(-ENOBUFS);
2563
2564         rth->dst.output = ip_output;
2565
2566         rth->rt_key_dst = orig_daddr;
2567         rth->rt_key_src = orig_saddr;
2568         rth->rt_genid = rt_genid(dev_net(dev_out));
2569         rth->rt_flags   = flags;
2570         rth->rt_type    = type;
2571         rth->rt_key_tos = orig_rtos;
2572         rth->rt_dst     = fl4->daddr;
2573         rth->rt_src     = fl4->saddr;
2574         rth->rt_route_iif = 0;
2575         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2576         rth->rt_oif     = orig_oif;
2577         rth->rt_mark    = fl4->flowi4_mark;
2578         rth->rt_gateway = fl4->daddr;
2579         rth->rt_spec_dst= fl4->saddr;
2580         rth->rt_peer_genid = 0;
2581         rth->peer = NULL;
2582         rth->fi = NULL;
2583
2584         RT_CACHE_STAT_INC(out_slow_tot);
2585
2586         if (flags & RTCF_LOCAL) {
2587                 rth->dst.input = ip_local_deliver;
2588                 rth->rt_spec_dst = fl4->daddr;
2589         }
2590         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2591                 rth->rt_spec_dst = fl4->saddr;
2592                 if (flags & RTCF_LOCAL &&
2593                     !(dev_out->flags & IFF_LOOPBACK)) {
2594                         rth->dst.output = ip_mc_output;
2595                         RT_CACHE_STAT_INC(out_slow_mc);
2596                 }
2597 #ifdef CONFIG_IP_MROUTE
2598                 if (type == RTN_MULTICAST) {
2599                         if (IN_DEV_MFORWARD(in_dev) &&
2600                             !ipv4_is_local_multicast(fl4->daddr)) {
2601                                 rth->dst.input = ip_mr_input;
2602                                 rth->dst.output = ip_mc_output;
2603                         }
2604                 }
2605 #endif
2606         }
2607
2608         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2609
2610         return rth;
2611 }
2612
2613 /*
2614  * Major route resolver routine.
2615  * called with rcu_read_lock();
2616  */
2617
2618 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2619 {
2620         struct net_device *dev_out = NULL;
2621         __u8 tos = RT_FL_TOS(fl4);
2622         unsigned int flags = 0;
2623         struct fib_result res;
2624         struct rtable *rth;
2625         __be32 orig_daddr;
2626         __be32 orig_saddr;
2627         int orig_oif;
2628
2629         res.fi          = NULL;
2630 #ifdef CONFIG_IP_MULTIPLE_TABLES
2631         res.r           = NULL;
2632 #endif
2633
2634         orig_daddr = fl4->daddr;
2635         orig_saddr = fl4->saddr;
2636         orig_oif = fl4->flowi4_oif;
2637
2638         fl4->flowi4_iif = net->loopback_dev->ifindex;
2639         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2640         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2641                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2642
2643         rcu_read_lock();
2644         if (fl4->saddr) {
2645                 rth = ERR_PTR(-EINVAL);
2646                 if (ipv4_is_multicast(fl4->saddr) ||
2647                     ipv4_is_lbcast(fl4->saddr) ||
2648                     ipv4_is_zeronet(fl4->saddr))
2649                         goto out;
2650
2651                 /* I removed check for oif == dev_out->oif here.
2652                    It was wrong for two reasons:
2653                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2654                       is assigned to multiple interfaces.
2655                    2. Moreover, we are allowed to send packets with saddr
2656                       of another iface. --ANK
2657                  */
2658
2659                 if (fl4->flowi4_oif == 0 &&
2660                     (ipv4_is_multicast(fl4->daddr) ||
2661                      ipv4_is_lbcast(fl4->daddr))) {
2662                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2663                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2664                         if (dev_out == NULL)
2665                                 goto out;
2666
2667                         /* Special hack: user can direct multicasts
2668                            and limited broadcast via necessary interface
2669                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2670                            This hack is not just for fun, it allows
2671                            vic,vat and friends to work.
2672                            They bind socket to loopback, set ttl to zero
2673                            and expect that it will work.
2674                            From the viewpoint of routing cache they are broken,
2675                            because we are not allowed to build multicast path
2676                            with loopback source addr (look, routing cache
2677                            cannot know, that ttl is zero, so that packet
2678                            will not leave this host and route is valid).
2679                            Luckily, this hack is good workaround.
2680                          */
2681
2682                         fl4->flowi4_oif = dev_out->ifindex;
2683                         goto make_route;
2684                 }
2685
2686                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2687                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2688                         if (!__ip_dev_find(net, fl4->saddr, false))
2689                                 goto out;
2690                 }
2691         }
2692
2693
2694         if (fl4->flowi4_oif) {
2695                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2696                 rth = ERR_PTR(-ENODEV);
2697                 if (dev_out == NULL)
2698                         goto out;
2699
2700                 /* RACE: Check return value of inet_select_addr instead. */
2701                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2702                         rth = ERR_PTR(-ENETUNREACH);
2703                         goto out;
2704                 }
2705                 if (ipv4_is_local_multicast(fl4->daddr) ||
2706                     ipv4_is_lbcast(fl4->daddr)) {
2707                         if (!fl4->saddr)
2708                                 fl4->saddr = inet_select_addr(dev_out, 0,
2709                                                               RT_SCOPE_LINK);
2710                         goto make_route;
2711                 }
2712                 if (fl4->saddr) {
2713                         if (ipv4_is_multicast(fl4->daddr))
2714                                 fl4->saddr = inet_select_addr(dev_out, 0,
2715                                                               fl4->flowi4_scope);
2716                         else if (!fl4->daddr)
2717                                 fl4->saddr = inet_select_addr(dev_out, 0,
2718                                                               RT_SCOPE_HOST);
2719                 }
2720         }
2721
2722         if (!fl4->daddr) {
2723                 fl4->daddr = fl4->saddr;
2724                 if (!fl4->daddr)
2725                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2726                 dev_out = net->loopback_dev;
2727                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2728                 res.type = RTN_LOCAL;
2729                 flags |= RTCF_LOCAL;
2730                 goto make_route;
2731         }
2732
2733         if (fib_lookup(net, fl4, &res)) {
2734                 res.fi = NULL;
2735                 if (fl4->flowi4_oif) {
2736                         /* Apparently, routing tables are wrong. Assume,
2737                            that the destination is on link.
2738
2739                            WHY? DW.
2740                            Because we are allowed to send to iface
2741                            even if it has NO routes and NO assigned
2742                            addresses. When oif is specified, routing
2743                            tables are looked up with only one purpose:
2744                            to catch if destination is gatewayed, rather than
2745                            direct. Moreover, if MSG_DONTROUTE is set,
2746                            we send packet, ignoring both routing tables
2747                            and ifaddr state. --ANK
2748
2749
2750                            We could make it even if oif is unknown,
2751                            likely IPv6, but we do not.
2752                          */
2753
2754                         if (fl4->saddr == 0)
2755                                 fl4->saddr = inet_select_addr(dev_out, 0,
2756                                                               RT_SCOPE_LINK);
2757                         res.type = RTN_UNICAST;
2758                         goto make_route;
2759                 }
2760                 rth = ERR_PTR(-ENETUNREACH);
2761                 goto out;
2762         }
2763
2764         if (res.type == RTN_LOCAL) {
2765                 if (!fl4->saddr) {
2766                         if (res.fi->fib_prefsrc)
2767                                 fl4->saddr = res.fi->fib_prefsrc;
2768                         else
2769                                 fl4->saddr = fl4->daddr;
2770                 }
2771                 dev_out = net->loopback_dev;
2772                 fl4->flowi4_oif = dev_out->ifindex;
2773                 res.fi = NULL;
2774                 flags |= RTCF_LOCAL;
2775                 goto make_route;
2776         }
2777
2778 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2779         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2780                 fib_select_multipath(&res);
2781         else
2782 #endif
2783         if (!res.prefixlen &&
2784             res.table->tb_num_default > 1 &&
2785             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2786                 fib_select_default(&res);
2787
2788         if (!fl4->saddr)
2789                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2790
2791         dev_out = FIB_RES_DEV(res);
2792         fl4->flowi4_oif = dev_out->ifindex;
2793
2794
2795 make_route:
2796         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2797                                tos, dev_out, flags);
2798         if (!IS_ERR(rth)) {
2799                 unsigned int hash;
2800
2801                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2802                                rt_genid(dev_net(dev_out)));
2803                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2804         }
2805
2806 out:
2807         rcu_read_unlock();
2808         return rth;
2809 }
2810
2811 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2812 {
2813         struct rtable *rth;
2814         unsigned int hash;
2815
2816         if (!rt_caching(net))
2817                 goto slow_output;
2818
2819         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2820
2821         rcu_read_lock_bh();
2822         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2823                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2824                 if (rth->rt_key_dst == flp4->daddr &&
2825                     rth->rt_key_src == flp4->saddr &&
2826                     rt_is_output_route(rth) &&
2827                     rth->rt_oif == flp4->flowi4_oif &&
2828                     rth->rt_mark == flp4->flowi4_mark &&
2829                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2830                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2831                     net_eq(dev_net(rth->dst.dev), net) &&
2832                     !rt_is_expired(rth)) {
2833                         ipv4_validate_peer(rth);
2834                         dst_use(&rth->dst, jiffies);
2835                         RT_CACHE_STAT_INC(out_hit);
2836                         rcu_read_unlock_bh();
2837                         if (!flp4->saddr)
2838                                 flp4->saddr = rth->rt_src;
2839                         if (!flp4->daddr)
2840                                 flp4->daddr = rth->rt_dst;
2841                         return rth;
2842                 }
2843                 RT_CACHE_STAT_INC(out_hlist_search);
2844         }
2845         rcu_read_unlock_bh();
2846
2847 slow_output:
2848         return ip_route_output_slow(net, flp4);
2849 }
2850 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2851
2852 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2853 {
2854         return NULL;
2855 }
2856
2857 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2858 {
2859         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2860
2861         return mtu ? : dst->dev->mtu;
2862 }
2863
2864 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2865 {
2866 }
2867
2868 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2869                                           unsigned long old)
2870 {
2871         return NULL;
2872 }
2873
2874 static struct dst_ops ipv4_dst_blackhole_ops = {
2875         .family                 =       AF_INET,
2876         .protocol               =       cpu_to_be16(ETH_P_IP),
2877         .destroy                =       ipv4_dst_destroy,
2878         .check                  =       ipv4_blackhole_dst_check,
2879         .mtu                    =       ipv4_blackhole_mtu,
2880         .default_advmss         =       ipv4_default_advmss,
2881         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2882         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2883         .neigh_lookup           =       ipv4_neigh_lookup,
2884 };
2885
2886 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2887 {
2888         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2889         struct rtable *ort = (struct rtable *) dst_orig;
2890
2891         if (rt) {
2892                 struct dst_entry *new = &rt->dst;
2893
2894                 new->__use = 1;
2895                 new->input = dst_discard;
2896                 new->output = dst_discard;
2897                 dst_copy_metrics(new, &ort->dst);
2898
2899                 new->dev = ort->dst.dev;
2900                 if (new->dev)
2901                         dev_hold(new->dev);
2902
2903                 rt->rt_key_dst = ort->rt_key_dst;
2904                 rt->rt_key_src = ort->rt_key_src;
2905                 rt->rt_key_tos = ort->rt_key_tos;
2906                 rt->rt_route_iif = ort->rt_route_iif;
2907                 rt->rt_iif = ort->rt_iif;
2908                 rt->rt_oif = ort->rt_oif;
2909                 rt->rt_mark = ort->rt_mark;
2910
2911                 rt->rt_genid = rt_genid(net);
2912                 rt->rt_flags = ort->rt_flags;
2913                 rt->rt_type = ort->rt_type;
2914                 rt->rt_dst = ort->rt_dst;
2915                 rt->rt_src = ort->rt_src;
2916                 rt->rt_gateway = ort->rt_gateway;
2917                 rt->rt_spec_dst = ort->rt_spec_dst;
2918                 rt->peer = ort->peer;
2919                 if (rt->peer)
2920                         atomic_inc(&rt->peer->refcnt);
2921                 rt->fi = ort->fi;
2922                 if (rt->fi)
2923                         atomic_inc(&rt->fi->fib_clntref);
2924
2925                 dst_free(new);
2926         }
2927
2928         dst_release(dst_orig);
2929
2930         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2931 }
2932
2933 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2934                                     struct sock *sk)
2935 {
2936         struct rtable *rt = __ip_route_output_key(net, flp4);
2937
2938         if (IS_ERR(rt))
2939                 return rt;
2940
2941         if (flp4->flowi4_proto)
2942                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2943                                                    flowi4_to_flowi(flp4),
2944                                                    sk, 0);
2945
2946         return rt;
2947 }
2948 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2949
2950 static int rt_fill_info(struct net *net,
2951                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2952                         int nowait, unsigned int flags)
2953 {
2954         struct rtable *rt = skb_rtable(skb);
2955         struct rtmsg *r;
2956         struct nlmsghdr *nlh;
2957         unsigned long expires = 0;
2958         const struct inet_peer *peer = rt->peer;
2959         u32 id = 0, ts = 0, tsage = 0, error;
2960
2961         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2962         if (nlh == NULL)
2963                 return -EMSGSIZE;
2964
2965         r = nlmsg_data(nlh);
2966         r->rtm_family    = AF_INET;
2967         r->rtm_dst_len  = 32;
2968         r->rtm_src_len  = 0;
2969         r->rtm_tos      = rt->rt_key_tos;
2970         r->rtm_table    = RT_TABLE_MAIN;
2971         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2972                 goto nla_put_failure;
2973         r->rtm_type     = rt->rt_type;
2974         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2975         r->rtm_protocol = RTPROT_UNSPEC;
2976         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2977         if (rt->rt_flags & RTCF_NOTIFY)
2978                 r->rtm_flags |= RTM_F_NOTIFY;
2979
2980         if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
2981                 goto nla_put_failure;
2982         if (rt->rt_key_src) {
2983                 r->rtm_src_len = 32;
2984                 if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
2985                         goto nla_put_failure;
2986         }
2987         if (rt->dst.dev &&
2988             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2989                 goto nla_put_failure;
2990 #ifdef CONFIG_IP_ROUTE_CLASSID
2991         if (rt->dst.tclassid &&
2992             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2993                 goto nla_put_failure;
2994 #endif
2995         if (rt_is_input_route(rt)) {
2996                 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_spec_dst))
2997                         goto nla_put_failure;
2998         } else if (rt->rt_src != rt->rt_key_src) {
2999                 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
3000                         goto nla_put_failure;
3001         }
3002         if (rt->rt_dst != rt->rt_gateway &&
3003             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
3004                 goto nla_put_failure;
3005
3006         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
3007                 goto nla_put_failure;
3008
3009         if (rt->rt_mark &&
3010             nla_put_be32(skb, RTA_MARK, rt->rt_mark))
3011                 goto nla_put_failure;
3012
3013         error = rt->dst.error;
3014         if (peer) {
3015                 inet_peer_refcheck(rt->peer);
3016                 id = atomic_read(&peer->ip_id_count) & 0xffff;
3017                 if (peer->tcp_ts_stamp) {
3018                         ts = peer->tcp_ts;
3019                         tsage = get_seconds() - peer->tcp_ts_stamp;
3020                 }
3021                 expires = ACCESS_ONCE(peer->pmtu_expires);
3022                 if (expires) {
3023                         if (time_before(jiffies, expires))
3024                                 expires -= jiffies;
3025                         else
3026                                 expires = 0;
3027                 }
3028         }
3029
3030         if (rt_is_input_route(rt)) {
3031 #ifdef CONFIG_IP_MROUTE
3032                 __be32 dst = rt->rt_dst;
3033
3034                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
3035                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
3036                         int err = ipmr_get_route(net, skb,
3037                                                  rt->rt_src, rt->rt_dst,
3038                                                  r, nowait);
3039                         if (err <= 0) {
3040                                 if (!nowait) {
3041                                         if (err == 0)
3042                                                 return 0;
3043                                         goto nla_put_failure;
3044                                 } else {
3045                                         if (err == -EMSGSIZE)
3046                                                 goto nla_put_failure;
3047                                         error = err;
3048                                 }
3049                         }
3050                 } else
3051 #endif
3052                         if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
3053                                 goto nla_put_failure;
3054         }
3055
3056         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
3057                                expires, error) < 0)
3058                 goto nla_put_failure;
3059
3060         return nlmsg_end(skb, nlh);
3061
3062 nla_put_failure:
3063         nlmsg_cancel(skb, nlh);
3064         return -EMSGSIZE;
3065 }
3066
3067 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
3068 {
3069         struct net *net = sock_net(in_skb->sk);
3070         struct rtmsg *rtm;
3071         struct nlattr *tb[RTA_MAX+1];
3072         struct rtable *rt = NULL;
3073         __be32 dst = 0;
3074         __be32 src = 0;
3075         u32 iif;
3076         int err;
3077         int mark;
3078         struct sk_buff *skb;
3079
3080         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3081         if (err < 0)
3082                 goto errout;
3083
3084         rtm = nlmsg_data(nlh);
3085
3086         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3087         if (skb == NULL) {
3088                 err = -ENOBUFS;
3089                 goto errout;
3090         }
3091
3092         /* Reserve room for dummy headers, this skb can pass
3093            through good chunk of routing engine.
3094          */
3095         skb_reset_mac_header(skb);
3096         skb_reset_network_header(skb);
3097
3098         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3099         ip_hdr(skb)->protocol = IPPROTO_ICMP;
3100         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3101
3102         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3103         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3104         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3105         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3106
3107         if (iif) {
3108                 struct net_device *dev;
3109
3110                 dev = __dev_get_by_index(net, iif);
3111                 if (dev == NULL) {
3112                         err = -ENODEV;
3113                         goto errout_free;
3114                 }
3115
3116                 skb->protocol   = htons(ETH_P_IP);
3117                 skb->dev        = dev;
3118                 skb->mark       = mark;
3119                 local_bh_disable();
3120                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3121                 local_bh_enable();
3122
3123                 rt = skb_rtable(skb);
3124                 if (err == 0 && rt->dst.error)
3125                         err = -rt->dst.error;
3126         } else {
3127                 struct flowi4 fl4 = {
3128                         .daddr = dst,
3129                         .saddr = src,
3130                         .flowi4_tos = rtm->rtm_tos,
3131                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3132                         .flowi4_mark = mark,
3133                 };
3134                 rt = ip_route_output_key(net, &fl4);
3135
3136                 err = 0;
3137                 if (IS_ERR(rt))
3138                         err = PTR_ERR(rt);
3139         }
3140
3141         if (err)
3142                 goto errout_free;
3143
3144         skb_dst_set(skb, &rt->dst);
3145         if (rtm->rtm_flags & RTM_F_NOTIFY)
3146                 rt->rt_flags |= RTCF_NOTIFY;
3147
3148         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3149                            RTM_NEWROUTE, 0, 0);
3150         if (err <= 0)
3151                 goto errout_free;
3152
3153         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3154 errout:
3155         return err;
3156
3157 errout_free:
3158         kfree_skb(skb);
3159         goto errout;
3160 }
3161
3162 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3163 {
3164         struct rtable *rt;
3165         int h, s_h;
3166         int idx, s_idx;
3167         struct net *net;
3168
3169         net = sock_net(skb->sk);
3170
3171         s_h = cb->args[0];
3172         if (s_h < 0)
3173                 s_h = 0;
3174         s_idx = idx = cb->args[1];
3175         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3176                 if (!rt_hash_table[h].chain)
3177                         continue;
3178                 rcu_read_lock_bh();
3179                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3180                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3181                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3182                                 continue;
3183                         if (rt_is_expired(rt))
3184                                 continue;
3185                         skb_dst_set_noref(skb, &rt->dst);
3186                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3187                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3188                                          1, NLM_F_MULTI) <= 0) {
3189                                 skb_dst_drop(skb);
3190                                 rcu_read_unlock_bh();
3191                                 goto done;
3192                         }
3193                         skb_dst_drop(skb);
3194                 }
3195                 rcu_read_unlock_bh();
3196         }
3197
3198 done:
3199         cb->args[0] = h;
3200         cb->args[1] = idx;
3201         return skb->len;
3202 }
3203
3204 void ip_rt_multicast_event(struct in_device *in_dev)
3205 {
3206         rt_cache_flush(dev_net(in_dev->dev), 0);
3207 }
3208
3209 #ifdef CONFIG_SYSCTL
3210 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3211                                         void __user *buffer,
3212                                         size_t *lenp, loff_t *ppos)
3213 {
3214         if (write) {
3215                 int flush_delay;
3216                 ctl_table ctl;
3217                 struct net *net;
3218
3219                 memcpy(&ctl, __ctl, sizeof(ctl));
3220                 ctl.data = &flush_delay;
3221                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3222
3223                 net = (struct net *)__ctl->extra1;
3224                 rt_cache_flush(net, flush_delay);
3225                 return 0;
3226         }
3227
3228         return -EINVAL;
3229 }
3230
3231 static ctl_table ipv4_route_table[] = {
3232         {
3233                 .procname       = "gc_thresh",
3234                 .data           = &ipv4_dst_ops.gc_thresh,
3235                 .maxlen         = sizeof(int),
3236                 .mode           = 0644,
3237                 .proc_handler   = proc_dointvec,
3238         },
3239         {
3240                 .procname       = "max_size",
3241                 .data           = &ip_rt_max_size,
3242                 .maxlen         = sizeof(int),
3243                 .mode           = 0644,
3244                 .proc_handler   = proc_dointvec,
3245         },
3246         {
3247                 /*  Deprecated. Use gc_min_interval_ms */
3248
3249                 .procname       = "gc_min_interval",
3250                 .data           = &ip_rt_gc_min_interval,
3251                 .maxlen         = sizeof(int),
3252                 .mode           = 0644,
3253                 .proc_handler   = proc_dointvec_jiffies,
3254         },
3255         {
3256                 .procname       = "gc_min_interval_ms",
3257                 .data           = &ip_rt_gc_min_interval,
3258                 .maxlen         = sizeof(int),
3259                 .mode           = 0644,
3260                 .proc_handler   = proc_dointvec_ms_jiffies,
3261         },
3262         {
3263                 .procname       = "gc_timeout",
3264                 .data           = &ip_rt_gc_timeout,
3265                 .maxlen         = sizeof(int),
3266                 .mode           = 0644,
3267                 .proc_handler   = proc_dointvec_jiffies,
3268         },
3269         {
3270                 .procname       = "gc_interval",
3271                 .data           = &ip_rt_gc_interval,
3272                 .maxlen         = sizeof(int),
3273                 .mode           = 0644,
3274                 .proc_handler   = proc_dointvec_jiffies,
3275         },
3276         {
3277                 .procname       = "redirect_load",
3278                 .data           = &ip_rt_redirect_load,
3279                 .maxlen         = sizeof(int),
3280                 .mode           = 0644,
3281                 .proc_handler   = proc_dointvec,
3282         },
3283         {
3284                 .procname       = "redirect_number",
3285                 .data           = &ip_rt_redirect_number,
3286                 .maxlen         = sizeof(int),
3287                 .mode           = 0644,
3288                 .proc_handler   = proc_dointvec,
3289         },
3290         {
3291                 .procname       = "redirect_silence",
3292                 .data           = &ip_rt_redirect_silence,
3293                 .maxlen         = sizeof(int),
3294                 .mode           = 0644,
3295                 .proc_handler   = proc_dointvec,
3296         },
3297         {
3298                 .procname       = "error_cost",
3299                 .data           = &ip_rt_error_cost,
3300                 .maxlen         = sizeof(int),
3301                 .mode           = 0644,
3302                 .proc_handler   = proc_dointvec,
3303         },
3304         {
3305                 .procname       = "error_burst",
3306                 .data           = &ip_rt_error_burst,
3307                 .maxlen         = sizeof(int),
3308                 .mode           = 0644,
3309                 .proc_handler   = proc_dointvec,
3310         },
3311         {
3312                 .procname       = "gc_elasticity",
3313                 .data           = &ip_rt_gc_elasticity,
3314                 .maxlen         = sizeof(int),
3315                 .mode           = 0644,
3316                 .proc_handler   = proc_dointvec,
3317         },
3318         {
3319                 .procname       = "mtu_expires",
3320                 .data           = &ip_rt_mtu_expires,
3321                 .maxlen         = sizeof(int),
3322                 .mode           = 0644,
3323                 .proc_handler   = proc_dointvec_jiffies,
3324         },
3325         {
3326                 .procname       = "min_pmtu",
3327                 .data           = &ip_rt_min_pmtu,
3328                 .maxlen         = sizeof(int),
3329                 .mode           = 0644,
3330                 .proc_handler   = proc_dointvec,
3331         },
3332         {
3333                 .procname       = "min_adv_mss",
3334                 .data           = &ip_rt_min_advmss,
3335                 .maxlen         = sizeof(int),
3336                 .mode           = 0644,
3337                 .proc_handler   = proc_dointvec,
3338         },
3339         { }
3340 };
3341
3342 static struct ctl_table ipv4_route_flush_table[] = {
3343         {
3344                 .procname       = "flush",
3345                 .maxlen         = sizeof(int),
3346                 .mode           = 0200,
3347                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3348         },
3349         { },
3350 };
3351
3352 static __net_init int sysctl_route_net_init(struct net *net)
3353 {
3354         struct ctl_table *tbl;
3355
3356         tbl = ipv4_route_flush_table;
3357         if (!net_eq(net, &init_net)) {
3358                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3359                 if (tbl == NULL)
3360                         goto err_dup;
3361         }
3362         tbl[0].extra1 = net;
3363
3364         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3365         if (net->ipv4.route_hdr == NULL)
3366                 goto err_reg;
3367         return 0;
3368
3369 err_reg:
3370         if (tbl != ipv4_route_flush_table)
3371                 kfree(tbl);
3372 err_dup:
3373         return -ENOMEM;
3374 }
3375
3376 static __net_exit void sysctl_route_net_exit(struct net *net)
3377 {
3378         struct ctl_table *tbl;
3379
3380         tbl = net->ipv4.route_hdr->ctl_table_arg;
3381         unregister_net_sysctl_table(net->ipv4.route_hdr);
3382         BUG_ON(tbl == ipv4_route_flush_table);
3383         kfree(tbl);
3384 }
3385
3386 static __net_initdata struct pernet_operations sysctl_route_ops = {
3387         .init = sysctl_route_net_init,
3388         .exit = sysctl_route_net_exit,
3389 };
3390 #endif
3391
3392 static __net_init int rt_genid_init(struct net *net)
3393 {
3394         get_random_bytes(&net->ipv4.rt_genid,
3395                          sizeof(net->ipv4.rt_genid));
3396         get_random_bytes(&net->ipv4.dev_addr_genid,
3397                          sizeof(net->ipv4.dev_addr_genid));
3398         return 0;
3399 }
3400
3401 static __net_initdata struct pernet_operations rt_genid_ops = {
3402         .init = rt_genid_init,
3403 };
3404
3405
3406 #ifdef CONFIG_IP_ROUTE_CLASSID
3407 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3408 #endif /* CONFIG_IP_ROUTE_CLASSID */
3409
3410 static __initdata unsigned long rhash_entries;
3411 static int __init set_rhash_entries(char *str)
3412 {
3413         ssize_t ret;
3414
3415         if (!str)
3416                 return 0;
3417
3418         ret = kstrtoul(str, 0, &rhash_entries);
3419         if (ret)
3420                 return 0;
3421
3422         return 1;
3423 }
3424 __setup("rhash_entries=", set_rhash_entries);
3425
3426 int __init ip_rt_init(void)
3427 {
3428         int rc = 0;
3429
3430 #ifdef CONFIG_IP_ROUTE_CLASSID
3431         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3432         if (!ip_rt_acct)
3433                 panic("IP: failed to allocate ip_rt_acct\n");
3434 #endif
3435
3436         ipv4_dst_ops.kmem_cachep =
3437                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3438                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3439
3440         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3441
3442         if (dst_entries_init(&ipv4_dst_ops) < 0)
3443                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3444
3445         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3446                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3447
3448         rt_hash_table = (struct rt_hash_bucket *)
3449                 alloc_large_system_hash("IP route cache",
3450                                         sizeof(struct rt_hash_bucket),
3451                                         rhash_entries,
3452                                         (totalram_pages >= 128 * 1024) ?
3453                                         15 : 17,
3454                                         0,
3455                                         &rt_hash_log,
3456                                         &rt_hash_mask,
3457                                         0,
3458                                         rhash_entries ? 0 : 512 * 1024);
3459         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3460         rt_hash_lock_init();
3461
3462         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3463         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3464
3465         devinet_init();
3466         ip_fib_init();
3467
3468         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3469         expires_ljiffies = jiffies;
3470         schedule_delayed_work(&expires_work,
3471                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3472
3473         if (ip_rt_proc_init())
3474                 pr_err("Unable to create route proc files\n");
3475 #ifdef CONFIG_XFRM
3476         xfrm_init();
3477         xfrm4_init(ip_rt_max_size);
3478 #endif
3479         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3480
3481 #ifdef CONFIG_SYSCTL
3482         register_pernet_subsys(&sysctl_route_ops);
3483 #endif
3484         register_pernet_subsys(&rt_genid_ops);
3485         return rc;
3486 }
3487
3488 #ifdef CONFIG_SYSCTL
3489 /*
3490  * We really need to sanitize the damn ipv4 init order, then all
3491  * this nonsense will go away.
3492  */
3493 void __init ip_static_sysctl_init(void)
3494 {
3495         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3496 }
3497 #endif