/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamping.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/prefetch.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>
#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT	(300*HZ)
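/* RT_FL_TOS() keeps the legacy RTO_ONLINK bit alongside the routable TOS
 * bits; ip_route_output_slow() later tests RTO_ONLINK to pick RT_SCOPE_LINK
 * over RT_SCOPE_UNIVERSE.
 */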
static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int rt_chain_length_max __read_mostly	= 20;

static struct delayed_work expires_work;
static unsigned long expires_ljiffies;
/*
 *	Interface to generic destination cache.
 */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

	peer = rt_get_peer_create(rt, rt->rt_dst);
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else {
			if (rt->fi) {
				fib_info_put(rt->fi);
				rt->fi = NULL;
			}
		}
	}
	return p;
}
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr);
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};
#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
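/* The table is indexed by the four TOS bits shifted right once, i.e.
 * ip_tos2prio[IPTOS_TOS(tos) >> 1] (cf. rt_tos2priority() in
 * include/net/route.h), so the even slots are the plain priorities and the
 * odd slots the ECN_OR_COST() variants. For example, IPTOS_LOWDELAY (0x10)
 * indexes slot 8, TC_PRIO_INTERACTIVE.
 */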
/* The locking scheme is rather straight forward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    RCU lock.
 */
struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
 * The size of this table is a power of two and depends on the number of CPUS.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
				GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
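/* Lock striping: many buckets share one lock. With RT_HASH_LOCK_SZ == 256,
 * rt_hash_lock_addr(5) and rt_hash_lock_addr(261) both resolve to
 * &rt_hash_locks[5], since 261 & 255 == 5.
 */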
static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
static unsigned int		rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log  __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid)
		& rt_hash_mask;
}
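/* Note that the generation id is one of the hash inputs: bumping it in
 * rt_cache_invalidate() re-seeds the hash, so stale entries simply stop
 * being found by lookups and are reaped lazily via rt_is_expired().
 */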
static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}
#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}
static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}
static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}
379 struct rtable *r = rt_cache_get_first(seq);
382 while (pos && (r = rt_cache_get_next(seq, r)))
384 return pos ? NULL : r;
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}
static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		int len;

		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			   "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			   r->dst.dev ? r->dst.dev->name : "*",
			   (__force u32)r->rt_dst,
			   (__force u32)r->rt_gateway,
			   r->rt_flags, atomic_read(&r->dst.__refcnt),
			   r->dst.__use, 0, (__force u32)r->rt_src,
			   dst_metric_advmss(&r->dst) + 40,
			   dst_metric(&r->dst, RTAX_WINDOW),
			   (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
				 dst_metric(&r->dst, RTAX_RTTVAR)),
			   r->rt_key_tos,
			   -1, 0, 0, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}
static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};
static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}
static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}
static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}
static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};
static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}
static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= rt_acct_proc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
#endif
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}
static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}
static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};
static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */
static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}
static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}
static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		(rt_has_peer(rth) && rt_peer_ptr(rth)->pmtu_expires);
}
static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->dst.__refcnt))
		goto out;

	age = jiffies - rth->dst.lastuse;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}
/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (rt_is_output_route(rt) ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}
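/* rt_intern_hash() remembers the unreferenced entry with the *lowest* score
 * on a chain as its eviction candidate, so bits 31 and 30 shield valuable
 * and output/unicast routes, while older (larger age) entries score lower
 * and are evicted first.
 */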
static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}
static inline bool compare_hash_inputs(const struct rtable *rt1,
				       const struct rtable *rt2)
{
	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
}
static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
{
	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_mark ^ rt2->rt_mark) |
		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
}
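/* Both comparison helpers use the branch-free XOR/OR idiom: each field pair
 * XORs to zero only on a match, and the ORed differences are tested once,
 * avoiding per-field conditional jumps on the lookup fast path.
 */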
static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}
static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}
/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to reschedule if necessary.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		if (process_context && need_resched())
			cond_resched();
		rth = rcu_access_pointer(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}
/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This is to get an estimate of rt_chain_length_max:
 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for magnitude.
 */
#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)
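/* Worked example: with FRACT_BITS == 3, ONE == 8. has_noalias() returns
 * 0 or ONE, so per-bucket lengths accumulate scaled by 8; avg and sd stay
 * in that scale and (avg + 4*sd) >> FRACT_BITS converts back to a whole
 * number of entries.
 */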
/*
 * Given a hash chain and an item in this hash chain,
 * find whether a previous entry has the same hash_inputs
 * (but differs on tos, mark or oif).
 * Returns 0 if an alias is found.
 * Returns ONE if rth has no alias before itself.
 */
static int has_noalias(const struct rtable *head, const struct rtable *rth)
{
	const struct rtable *aux = head;

	while (aux != rth) {
		if (compare_hash_inputs(aux, rth))
			return 0;
		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
	}
	return ONE;
}
static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;
	unsigned long delta;
	u64 mult;

	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		samples++;

		if (rcu_dereference_raw(*rthp) == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
			prefetch(rth->dst.rt_next);
			if (rt_is_expired(rth) ||
			    rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				continue;
			}

			/* We only count entries on a chain with equal
			 * hash inputs once so that entries for
			 * different QOS levels, and other non-hash
			 * input attributes don't unfairly skew
			 * the length computation
			 */
			tmo >>= 1;
			rthp = &rth->dst.rt_next;
			length += has_noalias(rt_hash_table[i].chain, rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					    ip_rt_gc_elasticity,
					    (avg + 4*sd) >> FRACT_BITS);
	}
	rover = i;
}
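/* Each run scans goal = (delta << rt_hash_log) / ip_rt_gc_timeout buckets.
 * With the worker rescheduled every ip_rt_gc_interval (60*HZ) and the
 * default timeout of 300*HZ, that is about one fifth of the table per run,
 * i.e. a full sweep roughly every ip_rt_gc_timeout.
 */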
/*
 * rt_worker_func() is run in process context.
 * We call rt_check_expire() to scan part of the hash table.
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}
/*
 * Perturbation of rt_genid by a small quantity [1..256].
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without giving a recent rt_genid.
 * Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
	inetpeer_invalidate_family(AF_INET);
}
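/* Invalidation is O(1): only the generation counter changes. Stale entries
 * remain in the table but fail rt_is_expired() and are freed later by
 * rt_do_flush() or the garbage collector.
 */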
/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}
/* Flush previously invalidated entries from the cache */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}
static void rt_emergency_hash_rebuild(struct net *net)
{
	net_warn_ratelimited("Route hash chain too long!\n");
	rt_cache_invalidate(net);
}
/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when the network
   is idle expire is large enough to keep enough warm entries,
   and when load increases it shrinks to limit the cache size.
 */
static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in a dangerous area. Try to reduce the cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal was not achieved. We stop the process if:

		   - expire is reduced to zero; otherwise expire is halved.
		   - the table is not full.
		   - we are called from interrupt.
		   - the jiffies check is just a fallback/debug loop breaker.
		     We will not spin here for a long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	net_warn_ratelimited("dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
out:	return 0;
}
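/* "expire" is the adaptive knob: it is halved on every pass that misses its
 * goal inside one invocation, and on the work_done path it is relaxed by
 * ip_rt_gc_min_interval, snapping back to ip_rt_gc_timeout once the cache
 * is comfortably below gc_thresh.
 */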
/*
 * Returns the number of entries in a hash chain that have different hash_inputs
 */
static int slow_chain_length(const struct rtable *head)
{
	int length = 0;
	const struct rtable *rth = head;

	while (rth) {
		length += has_noalias(head, rth);
		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
	}
	return length >> FRACT_BITS;
}
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr)
{
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;
	struct neighbour *n;

	rt = (const struct rtable *) dst;
	if (rt->rt_gateway)
		pkey = (const __be32 *) &rt->rt_gateway;
	else if (skb)
		pkey = &ip_hdr(skb)->daddr;

	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(&arp_tbl, pkey, dev);
}
static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
				     struct sk_buff *skb, int ifindex)
{
	struct rtable	*rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long	now;
	u32		min_score;
	int		chain_length;

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev)) || (rt->dst.flags & DST_NOCACHE)) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route.  The
		 * caller holds the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching.  Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route.
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it
		 * (Thanks Alexey)
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting for a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (rt_is_expired(rth)) {
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			if (skb)
				skb_dst_set(skb, &rth->dst);
			return rth;
		}

		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be the average chain length;
		 * when it is exceeded, gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (skb)
		skb_dst_set(skb, &rt->dst);
	return rt;
}
static atomic_t __rt_peer_genid = ATOMIC_INIT(0);

static u32 rt_peer_genid(void)
{
	return atomic_read(&__rt_peer_genid);
}

void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
{
	struct inet_peer_base *base;
	struct inet_peer *peer;

	base = inetpeer_base_ptr(rt->_peer);
	if (!base)
		return;

	peer = inet_getpeer_v4(base, daddr, create);
	if (peer) {
		if (!rt_set_peer(rt, peer))
			inet_putpeer(peer);
		else
			rt->rt_peer_genid = rt_peer_genid();
	}
}
/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that is unique over a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt && !(rt->dst.flags & DST_NOPEER)) {
		struct inet_peer *peer = rt_get_peer_create(rt, rt->rt_dst);

		/* If peer is attached to destination, it is never detached,
		   so we need not grab a lock to dereference it.
		 */
		if (peer) {
			iph->id = htons(inet_getid(peer, more));
			return;
		}
	} else if (!rt)
		pr_debug("rt_bind_peer(0) @%p\n", __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);
static void rt_del(unsigned int hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}
static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
{
	struct rtable *rt = (struct rtable *) dst;
	__be32 orig_gw = rt->rt_gateway;
	struct neighbour *n;

	dst_confirm(&rt->dst);

	rt->rt_gateway = peer->redirect_learned.a4;

	n = ipv4_neigh_lookup(&rt->dst, NULL, &rt->rt_gateway);
	if (!n) {
		rt->rt_gateway = orig_gw;
		return;
	}
	if (!(n->nud_state & NUD_VALID)) {
		neigh_event_send(n, NULL);
	} else {
		rt->rt_flags |= RTCF_REDIRECTED;
		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
	}
	neigh_release(n);
}
/* called in rcu_read_lock() section */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int s, i;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	__be32 skeys[2] = { saddr, 0 };
	int    ikeys[2] = { dev->ifindex, 0 };
	struct inet_peer *peer;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (s = 0; s < 2; s++) {
		for (i = 0; i < 2; i++) {
			unsigned int hash;
			struct rtable __rcu **rthp;
			struct rtable *rt;

			hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));

			rthp = &rt_hash_table[hash].chain;

			while ((rt = rcu_dereference(*rthp)) != NULL) {
				rthp = &rt->dst.rt_next;

				if (rt->rt_key_dst != daddr ||
				    rt->rt_key_src != skeys[s] ||
				    rt->rt_oif != ikeys[i] ||
				    rt_is_input_route(rt) ||
				    rt_is_expired(rt) ||
				    !net_eq(dev_net(rt->dst.dev), net) ||
				    rt->dst.error ||
				    rt->dst.dev != dev ||
				    rt->rt_gateway != old_gw)
					continue;

				peer = rt_get_peer_create(rt, rt->rt_dst);
				if (peer) {
					if (peer->redirect_learned.a4 != new_gw) {
						peer->redirect_learned.a4 = new_gw;
						atomic_inc(&__rt_peer_genid);
					}
					check_peer_redir(&rt->dst, peer);
				}
			}
		}
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     "  Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
#endif
	;
}
static bool peer_pmtu_expired(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       time_after_eq(jiffies, orig) &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}

static bool peer_pmtu_cleaned(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if (rt->rt_flags & RTCF_REDIRECTED) {
			unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
						rt->rt_oif,
						rt_genid(dev_net(dst->dev)));
			rt_del(hash, rt);
			ret = NULL;
		} else if (rt_has_peer(rt)) {
			struct inet_peer *peer = rt_peer_ptr(rt);
			if (peer_pmtu_expired(peer))
				dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
		}
	}
	return ret;
}
/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	peer = rt_get_peer_create(rt, rt->rt_dst);
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything,
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, rt->rt_iif,
					     &rt->rt_dst, &rt->rt_gateway);
#endif
	}
}
static int ip_error(struct sk_buff *skb)
{
	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	struct net *net;
	bool send;
	int code;

	net = dev_net(rt->dst.dev);
	if (!IN_DEV_FORWARD(in_dev)) {
		switch (rt->dst.error) {
		case EHOSTUNREACH:
			IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
			break;

		case ENETUNREACH:
			IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
			break;
		}
		goto out;
	}

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	peer = rt_get_peer_create(rt, rt->rt_dst);

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}
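/* The peer rate limit is a token bucket: tokens accrue one per jiffy since
 * rate_last, capped at ip_rt_error_burst (5*HZ), and each ICMP sent costs
 * ip_rt_error_cost (HZ) - roughly one error per second per peer, with
 * bursts of up to five.
 */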
static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
{
	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);

	if (!expires)
		return;
	if (time_before(jiffies, expires)) {
		u32 orig_dst_mtu = dst_mtu(dst);
		if (peer->pmtu_learned < orig_dst_mtu) {
			if (!peer->pmtu_orig)
				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
		}
	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
}
static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;

	dst_confirm(dst);

	peer = rt_get_peer_create(rt, rt->rt_dst);
	if (peer) {
		unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!pmtu_expires || mtu < peer->pmtu_learned) {

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;

			atomic_inc(&__rt_peer_genid);
			rt->rt_peer_genid = rt_peer_genid();
		}
		check_peer_pmtu(dst, peer);
	}
}
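/* Learned PMTU values are clamped below at ip_rt_min_pmtu (512 + 20 + 20 =
 * 552 bytes) and time out after ip_rt_mtu_expires (10 minutes); once
 * expired, check_peer_pmtu()/peer_pmtu_expired() restore pmtu_orig.
 */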
void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
		      int oif, u32 mark, u8 protocol, int flow_flags)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
			   protocol, flow_flags | FLOWI_FLAG_PRECOW_METRICS,
			   iph->daddr, iph->saddr, 0, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		ip_rt_update_pmtu(&rt->dst, mtu);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct inet_sock *inet = inet_sk(sk);

	return ipv4_update_pmtu(skb, sock_net(sk), mtu,
				sk->sk_bound_dev_if, sk->sk_mark,
				inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
				inet_sk_flowi_flags(sk));
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
static void ipv4_validate_peer(struct rtable *rt)
{
	if (rt->rt_peer_genid != rt_peer_genid()) {
		struct inet_peer *peer = rt_get_peer(rt, rt->rt_dst);

		if (peer) {
			check_peer_pmtu(&rt->dst, peer);

			if (peer->redirect_learned.a4 &&
			    peer->redirect_learned.a4 != rt->rt_gateway)
				check_peer_redir(&rt->dst, peer);
		}

		rt->rt_peer_genid = rt_peer_genid();
	}
}
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt_is_expired(rt))
		return NULL;
	ipv4_validate_peer(rt);
	return dst;
}
static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt->fi) {
		fib_info_put(rt->fi);
		rt->fi = NULL;
	}
	if (rt_has_peer(rt)) {
		struct inet_peer *peer = rt_peer_ptr(rt);
		inet_putpeer(peer);
	}
}
static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt && rt_has_peer(rt)) {
		struct inet_peer *peer = rt_peer_ptr(rt);
		if (peer_pmtu_cleaned(peer))
			dst_metric_set(&rt->dst, RTAX_MTU, peer->pmtu_orig);
	}
}
static int ip_rt_bug(struct sk_buff *skb)
{
	pr_debug("%s: %pI4 -> %pI4, %s\n",
		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		 skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so that it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}
#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif
static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

	if (advmss == 0) {
		advmss = max_t(unsigned int, dst->dev->mtu - 40,
			       ip_rt_min_advmss);
		if (advmss > 65535 - 40)
			advmss = 65535 - 40;
	}
	return advmss;
}
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu && rt_is_output_route(rt))
		return mtu;

	mtu = dst->dev->mtu;

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {

		if (rt->rt_gateway != rt->rt_dst && mtu > 576)
			mtu = 576;
	}

	if (mtu > IP_MAX_MTU)
		mtu = IP_MAX_MTU;

	return mtu;
}
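/* The 576-byte clamp applies when the MTU metric is locked and the route
 * goes through a gateway (rt_gateway != rt_dst): 576 is the minimum
 * datagram size every host must be prepared to accept (RFC 791), a safe
 * fallback when the real path MTU cannot be probed.
 */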
static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
			    struct fib_info *fi)
{
	struct inet_peer_base *base;
	struct inet_peer *peer;
	int create = 0;

	/* If a peer entry exists for this destination, we must hook
	 * it up in order to get at cached metrics.
	 */
	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
		create = 1;

	base = inetpeer_base_ptr(rt->_peer);
	BUG_ON(!base);

	peer = inet_getpeer_v4(base, rt->rt_dst, create);
	if (peer) {
		__rt_set_peer(rt, peer);
		rt->rt_peer_genid = rt_peer_genid();
		if (inet_metrics_new(peer))
			memcpy(peer->metrics, fi->fib_metrics,
			       sizeof(u32) * RTAX_MAX);
		dst_init_metrics(&rt->dst, peer->metrics, false);

		check_peer_pmtu(&rt->dst, peer);

		if (peer->redirect_learned.a4 &&
		    peer->redirect_learned.a4 != rt->rt_gateway) {
			rt->rt_gateway = peer->redirect_learned.a4;
			rt->rt_flags |= RTCF_REDIRECTED;
		}
	} else {
		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
			rt->fi = fi;
			atomic_inc(&fi->fib_clntref);
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
	}
}
static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
			   const struct fib_result *res,
			   struct fib_info *fi, u16 type, u32 itag)
{
	struct dst_entry *dst = &rt->dst;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		rt_init_metrics(rt, fl4, fi);
#ifdef CONFIG_IP_ROUTE_CLASSID
		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	}

	if (dst_mtu(dst) > IP_MAX_MTU)
		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
}
static struct rtable *rt_dst_alloc(struct net_device *dev,
				   bool nopolicy, bool noxfrm)
{
	return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
			 DST_HOST |
			 (nopolicy ? DST_NOPOLICY : 0) |
			 (noxfrm ? DST_NOXFRM : 0));
}
/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	unsigned int hash;
	struct rtable *rth;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(saddr))
			goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto e_err;
	}
	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid	= rt_genid(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_peer_genid = 0;
	rt_init_peer(rth, dev_net(dev)->ipv4.peers);
	rth->fi = NULL;
	if (our) {
		rth->dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
	return IS_ERR(rth) ? PTR_ERR(rth) : 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation: if the source is martian,
		 *	the only hint is the MAC header.
		 */
		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			print_hex_dump(KERN_WARNING, "ll header: ",
				       DUMP_PREFIX_OFFSET, 16, 1,
				       skb_mac_header(skb),
				       dev->hard_header_len, true);
		}
	}
#endif
}
/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM));
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
	rth->rt_flags = flags;
	rth->rt_type = res->type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = in_dev->dev->ifindex;
	rth->rt_iif 	= in_dev->dev->ifindex;
	rth->rt_oif 	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_peer_genid = 0;
	rt_init_peer(rth, &res->table->tb_peers);
	rth->fi = NULL;

	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);

	*result = rth;
	err = 0;
 cleanup:
	return err;
}
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi4 *fl4,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable *rth = NULL;
	int err;
	unsigned int hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1)
		fib_select_multipath(res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
		       rt_genid(dev_net(rth->dst.dev)));
	rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
	if (IS_ERR(rth))
		return PTR_ERR(rth);
	return 0;
}
/*
 *	NOTE. We drop all the packets that have local source
 *	addresses, because every properly looped back packet
 *	must have correct destination already attached by output routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 *	called with rcu_read_lock()
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi4	fl4;
	unsigned int	flags = 0;
	u32		itag = 0;
	struct rtable	*rth;
	unsigned int	hash;
	int		err = -EINVAL;
	struct net    *net = dev_net(dev);

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
		goto martian_source;

	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I do not know whether to fix it or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr))
		goto martian_destination;

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
		if (ipv4_is_loopback(daddr))
			goto martian_destination;

		if (ipv4_is_loopback(saddr))
			goto martian_source;
	}

	/*
	 *	Now we are ready to route the packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res);
	if (err != 0)
		goto no_route;

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  net->loopback_dev->ifindex,
					  dev, in_dev, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto no_route;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (!ipv4_is_zeronet(saddr)) {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	rth = rt_dst_alloc(net->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

	rth->dst.input= ip_local_deliver;
	rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid = rt_genid(net);
	rth->rt_flags 	= flags|RTCF_LOCAL;
	rth->rt_type	= res.type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_peer_genid = 0;
	rt_init_peer(rth, net->ipv4.peers);
	rth->fi = NULL;
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input= ip_error;
		rth->dst.error= -err;
		rth->rt_flags 	&= ~RTCF_LOCAL;
	}
	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
	err = 0;
	if (IS_ERR(rth))
		err = PTR_ERR(rth);
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
				     &daddr, &saddr, dev->name);
#endif

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev, bool noref)
{
	struct rtable	*rth;
	unsigned int	hash;
	int iif = dev->ifindex;
	struct net *net;
	int res;

	net = dev_net(dev);

	rcu_read_lock();

	if (!rt_caching(net))
		goto skip_cache;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif, rt_genid(net));

	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->dst.rt_next)) {
		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
		     (rth->rt_route_iif ^ iif) |
		     (rth->rt_key_tos ^ tos)) == 0 &&
		    rth->rt_mark == skb->mark &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			ipv4_validate_peer(rth);
			if (noref) {
				dst_use_noref(&rth->dst, jiffies);
				skb_dst_set_noref(skb, &rth->dst);
			} else {
				dst_use(&rth->dst, jiffies);
				skb_dst_set(skb, &rth->dst);
			}
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}

skip_cache:
	/* Multicast recognition logic is moved from the route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As a result, a host on a multicasting
	   network acquires a lot of useless route cache entries, sort of
	   SDR messages from all over the world. Now we try to get rid of them.
	   Really, provided the software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   compared with route cache reject entries.
	   Note that multicast routers are not affected, because a
	   route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
						  ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return res;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_common);
/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4,
				       __be32 orig_daddr, __be32 orig_saddr,
				       int orig_oif, __u8 orig_rtos,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
			return ERR_PTR(-EINVAL);

	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		/* If the multicast route does not exist, use the
		 * default one, but do not gateway in this case.
		 * Yes, it is a hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	}

	rth = rt_dst_alloc(dev_out,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM));
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->dst.output = ip_output;

	rth->rt_key_dst	= orig_daddr;
	rth->rt_key_src	= orig_saddr;
	rth->rt_genid = rt_genid(dev_net(dev_out));
	rth->rt_flags	= flags;
	rth->rt_type	= type;
	rth->rt_key_tos	= orig_rtos;
	rth->rt_dst	= fl4->daddr;
	rth->rt_src	= fl4->saddr;
	rth->rt_route_iif = 0;
	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
	rth->rt_oif	= orig_oif;
	rth->rt_mark    = fl4->flowi4_mark;
	rth->rt_gateway = fl4->daddr;
	rth->rt_peer_genid = 0;
	rt_init_peer(rth, (res->table ?
			   &res->table->tb_peers :
			   dev_net(dev_out)->ipv4.peers));
	rth->fi = NULL;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL)
		rth->dst.input = ip_local_deliver;
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4, res, fi, type, 0);

	if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE)
		rth->dst.flags |= DST_NOCACHE;

	return rth;
}
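/* Editorial example, not part of the original file: for a limited broadcast
 * (255.255.255.255) routed out a non-loopback device, the logic above
 * yields type = RTN_BROADCAST and flags = RTCF_BROADCAST | RTCF_LOCAL, so
 * dst.input becomes ip_local_deliver (for the locally delivered copy) and
 * dst.output becomes ip_mc_output.
 */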
/*
 * Major route resolver routine.
 * called with rcu_read_lock();
 */

static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
{
	struct net_device *dev_out = NULL;
	__u8 tos = RT_FL_TOS(fl4);
	unsigned int flags = 0;
	struct fib_result res;
	struct rtable *rth;
	__be32 orig_daddr;
	__be32 orig_saddr;
	int orig_oif;

	res.fi		= NULL;
	res.table	= NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r		= NULL;
#endif

	orig_daddr = fl4->daddr;
	orig_saddr = fl4->saddr;
	orig_oif = fl4->flowi4_oif;

	fl4->flowi4_iif = net->loopback_dev->ifindex;
	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);

	rcu_read_lock();
	if (fl4->saddr) {
		rth = ERR_PTR(-EINVAL);
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr))
			goto out;

		/* I removed the check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return the wrong iface, if
		      saddr is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with the saddr
		      of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (dev_out == NULL)
				goto out;

			/* Special hack: user can direct multicasts
			   and limited broadcast via the necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic, vat and friends to work.
			   They bind a socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of the routing cache they are
			   broken, because we are not allowed to build a
			   multicast path with a loopback source addr (look,
			   the routing cache cannot know that ttl is zero, so
			   that the packet will not leave this host and the
			   route is valid).
			   Luckily, this hack is a good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}


	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr)) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl4->daddr) {
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(net, fl4, &res)) {
		res.fi = NULL;
		res.table = NULL;
		if (fl4->flowi4_oif) {
			/* Apparently, routing tables are wrong. Assume
			   that the destination is on link.

			   WHY? DW.
			   Because we are allowed to send to an iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch if the destination is gatewayed, rather
			   than direct. Moreover, if MSG_DONTROUTE is set,
			   we send the packet, ignoring both routing tables
			   and ifaddr state. --ANK


			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(-ENETUNREACH);
		goto out;
	}

	if (res.type == RTN_LOCAL) {
		if (!fl4->saddr) {
			if (res.fi->fib_prefsrc)
				fl4->saddr = res.fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = dev_out->ifindex;
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
		fib_select_multipath(&res);
	else
#endif
	if (!res.prefixlen &&
	    res.table->tb_num_default > 1 &&
	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
		fib_select_default(&res);

	if (!fl4->saddr)
		fl4->saddr = FIB_RES_PREFSRC(net, res);

	dev_out = FIB_RES_DEV(res);
	fl4->flowi4_oif = dev_out->ifindex;


make_route:
	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
			       tos, dev_out, flags);
	if (!IS_ERR(rth)) {
		unsigned int hash;

		hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
			       rt_genid(dev_net(dev_out)));
		rth = rt_intern_hash(hash, rth, NULL, orig_oif);
	}

out:
	rcu_read_unlock();
	return rth;
}
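/* Editorial note, not part of the original file: when fl4->saddr is empty
 * the resolver above picks one in the first step that applies — via
 * inet_select_addr() for multicast/broadcast bound to an explicit oif,
 * INADDR_LOOPBACK when the destination is empty too, fib_prefsrc (or the
 * destination itself) for RTN_LOCAL answers, and FIB_RES_PREFSRC() once a
 * unicast FIB result has been selected.
 */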
struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
{
	struct rtable *rth;
	unsigned int hash;

	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));

	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
		rth = rcu_dereference_bh(rth->dst.rt_next)) {
		if (rth->rt_key_dst == flp4->daddr &&
		    rth->rt_key_src == flp4->saddr &&
		    rt_is_output_route(rth) &&
		    rth->rt_oif == flp4->flowi4_oif &&
		    rth->rt_mark == flp4->flowi4_mark &&
		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			ipv4_validate_peer(rth);
			dst_use(&rth->dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			if (!flp4->saddr)
				flp4->saddr = rth->rt_src;
			if (!flp4->daddr)
				flp4->daddr = rth->rt_dst;
			return rth;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	return ip_route_output_slow(net, flp4);
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
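/* Editorial note, not part of the original file: a minimal caller sketch.
 * The flowi4 values below are examples only and error handling is elided.
 *
 *	struct flowi4 fl4 = {
 *		.daddr      = htonl(INADDR_LOOPBACK),	// example destination
 *		.flowi4_tos = RT_TOS(0),
 *		.flowi4_oif = 0,			// let the resolver choose
 *	};
 *	struct rtable *rt = __ip_route_output_key(&init_net, &fl4);
 *
 *	if (!IS_ERR(rt)) {
 *		// on a cache hit fl4.saddr/fl4.daddr are backfilled from the
 *		// cached entry; rt holds a reference, release it when done
 *		ip_rt_put(rt);
 *	}
 */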
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}

static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			=	AF_INET,
	.protocol		=	cpu_to_be16(ETH_P_IP),
	.destroy		=	ipv4_dst_destroy,
	.check			=	ipv4_blackhole_dst_check,
	.mtu			=	ipv4_blackhole_mtu,
	.default_advmss		=	ipv4_default_advmss,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		=	ipv4_neigh_lookup,
};

struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
	struct rtable *ort = (struct rtable *) dst_orig;

	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard;
		dst_copy_metrics(new, &ort->dst);

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->rt_key_dst = ort->rt_key_dst;
		rt->rt_key_src = ort->rt_key_src;
		rt->rt_key_tos = ort->rt_key_tos;
		rt->rt_route_iif = ort->rt_route_iif;
		rt->rt_iif = ort->rt_iif;
		rt->rt_oif = ort->rt_oif;
		rt->rt_mark = ort->rt_mark;

		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_gateway = ort->rt_gateway;
		rt_transfer_peer(rt, ort);
		rt->fi = ort->fi;
		if (rt->fi)
			atomic_inc(&rt->fi->fib_clntref);

		dst_free(new);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
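/* Editorial note, not part of the original file: a blackhole copy keeps
 * refcounting intact for callers that must hold a route whose real state
 * has become unusable (e.g. while xfrm resolution is pending); everything
 * sent through it hits dst_discard. Hypothetical caller sketch:
 *
 *	struct dst_entry *bh = ipv4_blackhole_route(net, dst); // consumes dst
 *	if (!IS_ERR(bh))
 *		sk_setup_caps(sk, bh);	// the socket now silently drops tx
 */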
struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
				    struct sock *sk)
{
	struct rtable *rt = __ip_route_output_key(net, flp4);

	if (IS_ERR(rt))
		return rt;

	if (flp4->flowi4_proto)
		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
						   flowi4_to_flowi(flp4),
						   sk, 0);

	return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);

static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	= AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->rt_key_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
		goto nla_put_failure;
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
		goto nla_put_failure;
	if (rt->rt_key_src) {
		r->rtm_src_len = 32;
		if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
			goto nla_put_failure;
	}
	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid &&
	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
		goto nla_put_failure;
#endif
	if (!rt_is_input_route(rt) &&
	    rt->rt_src != rt->rt_key_src) {
		if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
			goto nla_put_failure;
	}
	if (rt->rt_dst != rt->rt_gateway &&
	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
		goto nla_put_failure;

	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
		goto nla_put_failure;

	if (rt->rt_mark &&
	    nla_put_be32(skb, RTA_MARK, rt->rt_mark))
		goto nla_put_failure;

	error = rt->dst.error;
	if (rt_has_peer(rt)) {
		const struct inet_peer *peer = rt_peer_ptr(rt);
		inet_peer_refcheck(peer);
		id = atomic_read(&peer->ip_id_count) & 0xffff;
		if (peer->tcp_ts_stamp) {
			ts = peer->tcp_ts;
			tsage = get_seconds() - peer->tcp_ts_stamp;
		}
		expires = ACCESS_ONCE(peer->pmtu_expires);
		if (expires) {
			if (time_before(jiffies, expires))
				expires -= jiffies;
			else
				expires = 0;
		}
	}

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 rt->rt_src, rt->rt_dst,
						 r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
				goto nla_put_failure;
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		struct flowi4 fl4 = {
			.daddr = dst,
			.saddr = src,
			.flowi4_tos = rtm->rtm_tos,
			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
			.flowi4_mark = mark,
		};
		rt = ip_route_output_key(net, &fl4);

		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
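/* Editorial note, not part of the original file: this handler backs the
 * RTM_GETROUTE request issued by e.g. "ip route get 192.0.2.1". The route
 * is resolved the same way a real packet would be (input path when RTA_IIF
 * is present, output path otherwise), and a single RTM_NEWROUTE message
 * built by rt_fill_info() is unicast back to the requesting socket.
 */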
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			skb_dst_set_noref(skb, &rt->dst);
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}
#ifdef CONFIG_SYSCTL
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
					void __user *buffer,
					size_t *lenp, loff_t *ppos)
{
	if (write) {
		int flush_delay;
		ctl_table ctl;
		struct net *net;

		memcpy(&ctl, __ctl, sizeof(ctl));
		ctl.data = &flush_delay;
		proc_dointvec(&ctl, write, buffer, lenp, ppos);

		net = (struct net *)__ctl->extra1;
		rt_cache_flush(net, flush_delay);
		return 0;
	}

	return -EINVAL;
}
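/* Editorial note, not part of the original file: any integer written to
 * /proc/sys/net/ipv4/route/flush is parsed by the handler above and passed
 * to rt_cache_flush() as the flush delay, e.g.:
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush	# flush the cache immediately
 */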
static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif
static __net_init int rt_genid_init(struct net *net)
{
	get_random_bytes(&net->ipv4.rt_genid,
			 sizeof(net->ipv4.rt_genid));
	get_random_bytes(&net->ipv4.dev_addr_genid,
			 sizeof(net->ipv4.dev_addr_genid));
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};

static int __net_init ipv4_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv4.peers = bp;
	return 0;
}

static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init = ipv4_inetpeer_init,
	.exit = ipv4_inetpeer_exit,
};
#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	ssize_t ret;

	if (!str)
		return 0;
	ret = kstrtoul(str, 0, &rhash_entries);
	if (ret == 0)
		return 0;
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);
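/* Editorial note, not part of the original file: the route cache hash size
 * can be pinned on the kernel command line instead of being derived from
 * memory size, e.g.:
 *
 *	rhash_entries=262144
 */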
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					0,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;
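	/* Editorial note, not part of the original file: gc_thresh is set to
	 * the number of hash buckets and ip_rt_max_size to 16 entries per
	 * bucket; a 65536-bucket table, for example, allows up to 1048576
	 * cached routes before the garbage collector refuses new entries.
	 */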
	devinet_init();
	ip_fib_init();

	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
	expires_ljiffies = jiffies;
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return rc;
}
#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif