net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #define pr_fmt(fmt) "IPv4: " fmt
  66
  67 #include <linux/module.h>
  68 #include <asm/uaccess.h>
  69 #include <linux/bitops.h>
  70 #include <linux/types.h>
  71 #include <linux/kernel.h>
  72 #include <linux/mm.h>
  73 #include <linux/bootmem.h>
  74 #include <linux/string.h>
  75 #include <linux/socket.h>
  76 #include <linux/sockios.h>
  77 #include <linux/errno.h>
  78 #include <linux/in.h>
  79 #include <linux/inet.h>
  80 #include <linux/netdevice.h>
  81 #include <linux/proc_fs.h>
  82 #include <linux/init.h>
  83 #include <linux/workqueue.h>
  84 #include <linux/skbuff.h>
  85 #include <linux/inetdevice.h>
  86 #include <linux/igmp.h>
  87 #include <linux/pkt_sched.h>
  88 #include <linux/mroute.h>
  89 #include <linux/netfilter_ipv4.h>
  90 #include <linux/random.h>
  91 #include <linux/jhash.h>
  92 #include <linux/rcupdate.h>
  93 #include <linux/times.h>
  94 #include <linux/slab.h>
  95 #include <linux/prefetch.h>
  96 #include <net/dst.h>
  97 #include <net/net_namespace.h>
  98 #include <net/protocol.h>
  99 #include <net/ip.h>
 100 #include <net/route.h>
 101 #include <net/inetpeer.h>
 102 #include <net/sock.h>
 103 #include <net/ip_fib.h>
 104 #include <net/arp.h>
 105 #include <net/tcp.h>
 106 #include <net/icmp.h>
 107 #include <net/xfrm.h>
 108 #include <net/netevent.h>
 109 #include <net/rtnetlink.h>
 110 #ifdef CONFIG_SYSCTL
 111 #include <linux/sysctl.h>
 112 #include <linux/kmemleak.h>
 113 #endif
 114 #include <net/secure_seq.h>
 115
 116 #define RT_FL_TOS(oldflp4) \
 117         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 118
 119 #define IP_MAX_MTU      0xFFF0
 120
 121 #define RT_GC_TIMEOUT (300*HZ)
 122
 123 static int ip_rt_max_size;
 124 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 125 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
 126 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
 127 static int ip_rt_redirect_number __read_mostly  = 9;
 128 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 129 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 130 static int ip_rt_error_cost __read_mostly       = HZ;
 131 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 132 static int ip_rt_gc_elasticity __read_mostly    = 8;
 133 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 134 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 135 static int ip_rt_min_advmss __read_mostly       = 256;
 136 static int rt_chain_length_max __read_mostly    = 20;
 137
 138 static struct delayed_work expires_work;
 139 static unsigned long expires_ljiffies;
 140
 141 /*
 142  *      Interface to generic destination cache.
 143  */
 144
 145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 146 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 147 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 148 static void              ipv4_dst_destroy(struct dst_entry *dst);
 149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 150 static void              ipv4_link_failure(struct sk_buff *skb);
 151 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 152 static int rt_garbage_collect(struct dst_ops *ops);
 153
 154 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 155                             int how)
 156 {
 157 }
 158
 159 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 160 {
 161         WARN_ON(1);
 162         return NULL;
 163 }
 164
 165 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 166                                            struct sk_buff *skb,
 167                                            const void *daddr);
 168
 169 static struct dst_ops ipv4_dst_ops = {
 170         .family =               AF_INET,
 171         .protocol =             cpu_to_be16(ETH_P_IP),
 172         .gc =                   rt_garbage_collect,
 173         .check =                ipv4_dst_check,
 174         .default_advmss =       ipv4_default_advmss,
 175         .mtu =                  ipv4_mtu,
 176         .cow_metrics =          ipv4_cow_metrics,
 177         .destroy =              ipv4_dst_destroy,
 178         .ifdown =               ipv4_dst_ifdown,
 179         .negative_advice =      ipv4_negative_advice,
 180         .link_failure =         ipv4_link_failure,
 181         .update_pmtu =          ip_rt_update_pmtu,
 182         .local_out =            __ip_local_out,
 183         .neigh_lookup =         ipv4_neigh_lookup,
 184 };
 185
 186 #define ECN_OR_COST(class)      TC_PRIO_##class
 187
 188 const __u8 ip_tos2prio[16] = {
 189         TC_PRIO_BESTEFFORT,
 190         ECN_OR_COST(BESTEFFORT),
 191         TC_PRIO_BESTEFFORT,
 192         ECN_OR_COST(BESTEFFORT),
 193         TC_PRIO_BULK,
 194         ECN_OR_COST(BULK),
 195         TC_PRIO_BULK,
 196         ECN_OR_COST(BULK),
 197         TC_PRIO_INTERACTIVE,
 198         ECN_OR_COST(INTERACTIVE),
 199         TC_PRIO_INTERACTIVE,
 200         ECN_OR_COST(INTERACTIVE),
 201         TC_PRIO_INTERACTIVE_BULK,
 202         ECN_OR_COST(INTERACTIVE_BULK),
 203         TC_PRIO_INTERACTIVE_BULK,
 204         ECN_OR_COST(INTERACTIVE_BULK)
 205 };
 206 EXPORT_SYMBOL(ip_tos2prio);
 207
 208 /*
 209  * Route cache.
 210  */
 211
 212 /* The locking scheme is rather straight forward:
 213  *
 214  * 1) Read-Copy Update protects the buckets of the central route hash.
 215  * 2) Only writers remove entries, and they hold the lock
 216  *    as they look at rtable reference counts.
 217  * 3) Only readers acquire references to rtable entries,
 218  *    they do so with atomic increments and with the
 219  *    lock held.
 220  */
 221
 222 struct rt_hash_bucket {
 223         struct rtable __rcu     *chain;
 224 };
 225
 226 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 227         defined(CONFIG_PROVE_LOCKING)
 228 /*
 229  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 230  * The size of this table is a power of two and depends on the number of CPUS.
 231  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 232  */
 233 #ifdef CONFIG_LOCKDEP
 234 # define RT_HASH_LOCK_SZ        256
 235 #else
 236 # if NR_CPUS >= 32
 237 #  define RT_HASH_LOCK_SZ       4096
 238 # elif NR_CPUS >= 16
 239 #  define RT_HASH_LOCK_SZ       2048
 240 # elif NR_CPUS >= 8
 241 #  define RT_HASH_LOCK_SZ       1024
 242 # elif NR_CPUS >= 4
 243 #  define RT_HASH_LOCK_SZ       512
 244 # else
 245 #  define RT_HASH_LOCK_SZ       256
 246 # endif
 247 #endif
 248
 249 static spinlock_t       *rt_hash_locks;
 250 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 251
 252 static __init void rt_hash_lock_init(void)
 253 {
 254         int i;
 255
 256         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 257                         GFP_KERNEL);
 258         if (!rt_hash_locks)
 259                 panic("IP: failed to allocate rt_hash_locks\n");
 260
 261         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 262                 spin_lock_init(&rt_hash_locks[i]);
 263 }
 264 #else
 265 # define rt_hash_lock_addr(slot) NULL
 266
 267 static inline void rt_hash_lock_init(void)
 268 {
 269 }
 270 #endif
 271
 272 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
 273 static unsigned int             rt_hash_mask __read_mostly;
 274 static unsigned int             rt_hash_log  __read_mostly;
 275
 276 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 277 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 278
 279 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 280                                    int genid)
 281 {
 282         return jhash_3words((__force u32)daddr, (__force u32)saddr,
 283                             idx, genid)
 284                 & rt_hash_mask;
 285 }
 286
 287 static inline int rt_genid(struct net *net)
 288 {
 289         return atomic_read(&net->ipv4.rt_genid);
 290 }
 291
 292 #ifdef CONFIG_PROC_FS
 293 struct rt_cache_iter_state {
 294         struct seq_net_private p;
 295         int bucket;
 296         int genid;
 297 };
 298
 299 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 300 {
 301         struct rt_cache_iter_state *st = seq->private;
 302         struct rtable *r = NULL;
 303
 304         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 305                 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
 306                         continue;
 307                 rcu_read_lock_bh();
 308                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 309                 while (r) {
 310                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
 311                             r->rt_genid == st->genid)
 312                                 return r;
 313                         r = rcu_dereference_bh(r->dst.rt_next);
 314                 }
 315                 rcu_read_unlock_bh();
 316         }
 317         return r;
 318 }
 319
 320 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 321                                           struct rtable *r)
 322 {
 323         struct rt_cache_iter_state *st = seq->private;
 324
 325         r = rcu_dereference_bh(r->dst.rt_next);
 326         while (!r) {
 327                 rcu_read_unlock_bh();
 328                 do {
 329                         if (--st->bucket < 0)
 330                                 return NULL;
 331                 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
 332                 rcu_read_lock_bh();
 333                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 334         }
 335         return r;
 336 }
 337
 338 static struct rtable *rt_cache_get_next(struct seq_file *seq,
 339                                         struct rtable *r)
 340 {
 341         struct rt_cache_iter_state *st = seq->private;
 342         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 343                 if (dev_net(r->dst.dev) != seq_file_net(seq))
 344                         continue;
 345                 if (r->rt_genid == st->genid)
 346                         break;
 347         }
 348         return r;
 349 }
 350
 351 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 352 {
 353         struct rtable *r = rt_cache_get_first(seq);
 354
 355         if (r)
 356                 while (pos && (r = rt_cache_get_next(seq, r)))
 357                         --pos;
 358         return pos ? NULL : r;
 359 }
 360
 361 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 362 {
 363         struct rt_cache_iter_state *st = seq->private;
 364         if (*pos)
 365                 return rt_cache_get_idx(seq, *pos - 1);
 366         st->genid = rt_genid(seq_file_net(seq));
 367         return SEQ_START_TOKEN;
 368 }
 369
 370 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 371 {
 372         struct rtable *r;
 373
 374         if (v == SEQ_START_TOKEN)
 375                 r = rt_cache_get_first(seq);
 376         else
 377                 r = rt_cache_get_next(seq, v);
 378         ++*pos;
 379         return r;
 380 }
 381
 382 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 383 {
 384         if (v && v != SEQ_START_TOKEN)
 385                 rcu_read_unlock_bh();
 386 }
 387
 388 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 389 {
 390         if (v == SEQ_START_TOKEN)
 391                 seq_printf(seq, "%-127s\n",
 392                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 393                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 394                            "HHUptod\tSpecDst");
 395         else {
 396                 struct rtable *r = v;
 397                 int len;
 398
 399                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
 400                            "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 401                            r->dst.dev ? r->dst.dev->name : "*",
 402                            (__force u32)r->rt_dst,
 403                            (__force u32)r->rt_gateway,
 404                            r->rt_flags, atomic_read(&r->dst.__refcnt),
 405                            r->dst.__use, 0, (__force u32)r->rt_src,
 406                            dst_metric_advmss(&r->dst) + 40,
 407                            dst_metric(&r->dst, RTAX_WINDOW), 0,
 408                            r->rt_key_tos,
 409                            -1, 0, 0, &len);
 410
 411                 seq_printf(seq, "%*s\n", 127 - len, "");
 412         }
 413         return 0;
 414 }
 415
 416 static const struct seq_operations rt_cache_seq_ops = {
 417         .start  = rt_cache_seq_start,
 418         .next   = rt_cache_seq_next,
 419         .stop   = rt_cache_seq_stop,
 420         .show   = rt_cache_seq_show,
 421 };
 422
 423 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 424 {
 425         return seq_open_net(inode, file, &rt_cache_seq_ops,
 426                         sizeof(struct rt_cache_iter_state));
 427 }
 428
 429 static const struct file_operations rt_cache_seq_fops = {
 430         .owner   = THIS_MODULE,
 431         .open    = rt_cache_seq_open,
 432         .read    = seq_read,
 433         .llseek  = seq_lseek,
 434         .release = seq_release_net,
 435 };
 436
 437
 438 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 439 {
 440         int cpu;
 441
 442         if (*pos == 0)
 443                 return SEQ_START_TOKEN;
 444
 445         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 446                 if (!cpu_possible(cpu))
 447                         continue;
 448                 *pos = cpu+1;
 449                 return &per_cpu(rt_cache_stat, cpu);
 450         }
 451         return NULL;
 452 }
 453
 454 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 455 {
 456         int cpu;
 457
 458         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 459                 if (!cpu_possible(cpu))
 460                         continue;
 461                 *pos = cpu+1;
 462                 return &per_cpu(rt_cache_stat, cpu);
 463         }
 464         return NULL;
 465
 466 }
 467
 468 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 469 {
 470
 471 }
 472
 473 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 474 {
 475         struct rt_cache_stat *st = v;
 476
 477         if (v == SEQ_START_TOKEN) {
 478                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 479                 return 0;
 480         }
 481
 482         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 483                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 484                    dst_entries_get_slow(&ipv4_dst_ops),
 485                    st->in_hit,
 486                    st->in_slow_tot,
 487                    st->in_slow_mc,
 488                    st->in_no_route,
 489                    st->in_brd,
 490                    st->in_martian_dst,
 491                    st->in_martian_src,
 492
 493                    st->out_hit,
 494                    st->out_slow_tot,
 495                    st->out_slow_mc,
 496
 497                    st->gc_total,
 498                    st->gc_ignored,
 499                    st->gc_goal_miss,
 500                    st->gc_dst_overflow,
 501                    st->in_hlist_search,
 502                    st->out_hlist_search
 503                 );
 504         return 0;
 505 }
 506
 507 static const struct seq_operations rt_cpu_seq_ops = {
 508         .start  = rt_cpu_seq_start,
 509         .next   = rt_cpu_seq_next,
 510         .stop   = rt_cpu_seq_stop,
 511         .show   = rt_cpu_seq_show,
 512 };
 513
 514
 515 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 516 {
 517         return seq_open(file, &rt_cpu_seq_ops);
 518 }
 519
 520 static const struct file_operations rt_cpu_seq_fops = {
 521         .owner   = THIS_MODULE,
 522         .open    = rt_cpu_seq_open,
 523         .read    = seq_read,
 524         .llseek  = seq_lseek,
 525         .release = seq_release,
 526 };
 527
 528 #ifdef CONFIG_IP_ROUTE_CLASSID
 529 static int rt_acct_proc_show(struct seq_file *m, void *v)
 530 {
 531         struct ip_rt_acct *dst, *src;
 532         unsigned int i, j;
 533
 534         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 535         if (!dst)
 536                 return -ENOMEM;
 537
 538         for_each_possible_cpu(i) {
 539                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 540                 for (j = 0; j < 256; j++) {
 541                         dst[j].o_bytes   += src[j].o_bytes;
 542                         dst[j].o_packets += src[j].o_packets;
 543                         dst[j].i_bytes   += src[j].i_bytes;
 544                         dst[j].i_packets += src[j].i_packets;
 545                 }
 546         }
 547
 548         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 549         kfree(dst);
 550         return 0;
 551 }
 552
 553 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 554 {
 555         return single_open(file, rt_acct_proc_show, NULL);
 556 }
 557
 558 static const struct file_operations rt_acct_proc_fops = {
 559         .owner          = THIS_MODULE,
 560         .open           = rt_acct_proc_open,
 561         .read           = seq_read,
 562         .llseek         = seq_lseek,
 563         .release        = single_release,
 564 };
 565 #endif
 566
 567 static int __net_init ip_rt_do_proc_init(struct net *net)
 568 {
 569         struct proc_dir_entry *pde;
 570
 571         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 572                         &rt_cache_seq_fops);
 573         if (!pde)
 574                 goto err1;
 575
 576         pde = proc_create("rt_cache", S_IRUGO,
 577                           net->proc_net_stat, &rt_cpu_seq_fops);
 578         if (!pde)
 579                 goto err2;
 580
 581 #ifdef CONFIG_IP_ROUTE_CLASSID
 582         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 583         if (!pde)
 584                 goto err3;
 585 #endif
 586         return 0;
 587
 588 #ifdef CONFIG_IP_ROUTE_CLASSID
 589 err3:
 590         remove_proc_entry("rt_cache", net->proc_net_stat);
 591 #endif
 592 err2:
 593         remove_proc_entry("rt_cache", net->proc_net);
 594 err1:
 595         return -ENOMEM;
 596 }
 597
 598 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 599 {
 600         remove_proc_entry("rt_cache", net->proc_net_stat);
 601         remove_proc_entry("rt_cache", net->proc_net);
 602 #ifdef CONFIG_IP_ROUTE_CLASSID
 603         remove_proc_entry("rt_acct", net->proc_net);
 604 #endif
 605 }
 606
 607 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 608         .init = ip_rt_do_proc_init,
 609         .exit = ip_rt_do_proc_exit,
 610 };
 611
 612 static int __init ip_rt_proc_init(void)
 613 {
 614         return register_pernet_subsys(&ip_rt_proc_ops);
 615 }
 616
 617 #else
 618 static inline int ip_rt_proc_init(void)
 619 {
 620         return 0;
 621 }
 622 #endif /* CONFIG_PROC_FS */
 623
 624 static inline void rt_free(struct rtable *rt)
 625 {
 626         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 627 }
 628
 629 static inline void rt_drop(struct rtable *rt)
 630 {
 631         ip_rt_put(rt);
 632         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 633 }
 634
 635 static inline int rt_fast_clean(struct rtable *rth)
 636 {
 637         /* Kill broadcast/multicast entries very aggresively, if they
 638            collide in hash table with more useful entries */
 639         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 640                 rt_is_input_route(rth) && rth->dst.rt_next;
 641 }
 642
 643 static inline int rt_valuable(struct rtable *rth)
 644 {
 645         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 646                 rth->dst.expires;
 647 }
 648
 649 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 650 {
 651         unsigned long age;
 652         int ret = 0;
 653
 654         if (atomic_read(&rth->dst.__refcnt))
 655                 goto out;
 656
 657         age = jiffies - rth->dst.lastuse;
 658         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 659             (age <= tmo2 && rt_valuable(rth)))
 660                 goto out;
 661         ret = 1;
 662 out:    return ret;
 663 }
 664
 665 /* Bits of score are:
 666  * 31: very valuable
 667  * 30: not quite useless
 668  * 29..0: usage counter
 669  */
 670 static inline u32 rt_score(struct rtable *rt)
 671 {
 672         u32 score = jiffies - rt->dst.lastuse;
 673
 674         score = ~score & ~(3<<30);
 675
 676         if (rt_valuable(rt))
 677                 score |= (1<<31);
 678
 679         if (rt_is_output_route(rt) ||
 680             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 681                 score |= (1<<30);
 682
 683         return score;
 684 }
 685
 686 static inline bool rt_caching(const struct net *net)
 687 {
 688         return net->ipv4.current_rt_cache_rebuild_count <=
 689                 net->ipv4.sysctl_rt_cache_rebuild_count;
 690 }
 691
 692 static inline bool compare_hash_inputs(const struct rtable *rt1,
 693                                        const struct rtable *rt2)
 694 {
 695         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 696                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 697                 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
 698 }
 699
 700 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
 701 {
 702         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 703                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 704                 (rt1->rt_mark ^ rt2->rt_mark) |
 705                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
 706                 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
 707                 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
 708 }
 709
 710 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 711 {
 712         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
 713 }
 714
 715 static inline int rt_is_expired(struct rtable *rth)
 716 {
 717         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 718 }
 719
 720 /*
 721  * Perform a full scan of hash table and free all entries.
 722  * Can be called by a softirq or a process.
 723  * In the later case, we want to be reschedule if necessary
 724  */
 725 static void rt_do_flush(struct net *net, int process_context)
 726 {
 727         unsigned int i;
 728         struct rtable *rth, *next;
 729
 730         for (i = 0; i <= rt_hash_mask; i++) {
 731                 struct rtable __rcu **pprev;
 732                 struct rtable *list;
 733
 734                 if (process_context && need_resched())
 735                         cond_resched();
 736                 rth = rcu_access_pointer(rt_hash_table[i].chain);
 737                 if (!rth)
 738                         continue;
 739
 740                 spin_lock_bh(rt_hash_lock_addr(i));
 741
 742                 list = NULL;
 743                 pprev = &rt_hash_table[i].chain;
 744                 rth = rcu_dereference_protected(*pprev,
 745                         lockdep_is_held(rt_hash_lock_addr(i)));
 746
 747                 while (rth) {
 748                         next = rcu_dereference_protected(rth->dst.rt_next,
 749                                 lockdep_is_held(rt_hash_lock_addr(i)));
 750
 751                         if (!net ||
 752                             net_eq(dev_net(rth->dst.dev), net)) {
 753                                 rcu_assign_pointer(*pprev, next);
 754                                 rcu_assign_pointer(rth->dst.rt_next, list);
 755                                 list = rth;
 756                         } else {
 757                                 pprev = &rth->dst.rt_next;
 758                         }
 759                         rth = next;
 760                 }
 761
 762                 spin_unlock_bh(rt_hash_lock_addr(i));
 763
 764                 for (; list; list = next) {
 765                         next = rcu_dereference_protected(list->dst.rt_next, 1);
 766                         rt_free(list);
 767                 }
 768         }
 769 }
 770
 771 /*
 772  * While freeing expired entries, we compute average chain length
 773  * and standard deviation, using fixed-point arithmetic.
 774  * This to have an estimation of rt_chain_length_max
 775  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 776  * We use 3 bits for frational part, and 29 (or 61) for magnitude.
 777  */
 778
 779 #define FRACT_BITS 3
 780 #define ONE (1UL << FRACT_BITS)
 781
 782 /*
 783  * Given a hash chain and an item in this hash chain,
 784  * find if a previous entry has the same hash_inputs
 785  * (but differs on tos, mark or oif)
 786  * Returns 0 if an alias is found.
 787  * Returns ONE if rth has no alias before itself.
 788  */
 789 static int has_noalias(const struct rtable *head, const struct rtable *rth)
 790 {
 791         const struct rtable *aux = head;
 792
 793         while (aux != rth) {
 794                 if (compare_hash_inputs(aux, rth))
 795                         return 0;
 796                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
 797         }
 798         return ONE;
 799 }
 800
 801 static void rt_check_expire(void)
 802 {
 803         static unsigned int rover;
 804         unsigned int i = rover, goal;
 805         struct rtable *rth;
 806         struct rtable __rcu **rthp;
 807         unsigned long samples = 0;
 808         unsigned long sum = 0, sum2 = 0;
 809         unsigned long delta;
 810         u64 mult;
 811
 812         delta = jiffies - expires_ljiffies;
 813         expires_ljiffies = jiffies;
 814         mult = ((u64)delta) << rt_hash_log;
 815         if (ip_rt_gc_timeout > 1)
 816                 do_div(mult, ip_rt_gc_timeout);
 817         goal = (unsigned int)mult;
 818         if (goal > rt_hash_mask)
 819                 goal = rt_hash_mask + 1;
 820         for (; goal > 0; goal--) {
 821                 unsigned long tmo = ip_rt_gc_timeout;
 822                 unsigned long length;
 823
 824                 i = (i + 1) & rt_hash_mask;
 825                 rthp = &rt_hash_table[i].chain;
 826
 827                 if (need_resched())
 828                         cond_resched();
 829
 830                 samples++;
 831
 832                 if (rcu_dereference_raw(*rthp) == NULL)
 833                         continue;
 834                 length = 0;
 835                 spin_lock_bh(rt_hash_lock_addr(i));
 836                 while ((rth = rcu_dereference_protected(*rthp,
 837                                         lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
 838                         prefetch(rth->dst.rt_next);
 839                         if (rt_is_expired(rth) ||
 840                             rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
 841                                 *rthp = rth->dst.rt_next;
 842                                 rt_free(rth);
 843                                 continue;
 844                         }
 845
 846                         /* We only count entries on a chain with equal
 847                          * hash inputs once so that entries for
 848                          * different QOS levels, and other non-hash
 849                          * input attributes don't unfairly skew the
 850                          * length computation
 851                          */
 852                         tmo >>= 1;
 853                         rthp = &rth->dst.rt_next;
 854                         length += has_noalias(rt_hash_table[i].chain, rth);
 855                 }
 856                 spin_unlock_bh(rt_hash_lock_addr(i));
 857                 sum += length;
 858                 sum2 += length*length;
 859         }
 860         if (samples) {
 861                 unsigned long avg = sum / samples;
 862                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
 863                 rt_chain_length_max = max_t(unsigned long,
 864                                         ip_rt_gc_elasticity,
 865                                         (avg + 4*sd) >> FRACT_BITS);
 866         }
 867         rover = i;
 868 }
 869
 870 /*
 871  * rt_worker_func() is run in process context.
 872  * we call rt_check_expire() to scan part of the hash table
 873  */
 874 static void rt_worker_func(struct work_struct *work)
 875 {
 876         rt_check_expire();
 877         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 878 }
 879
 880 /*
 881  * Perturbation of rt_genid by a small quantity [1..256]
 882  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
 883  * many times (2^24) without giving recent rt_genid.
 884  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
 885  */
 886 static void rt_cache_invalidate(struct net *net)
 887 {
 888         unsigned char shuffle;
 889
 890         get_random_bytes(&shuffle, sizeof(shuffle));
 891         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 892 }
 893
 894 /*
 895  * delay < 0  : invalidate cache (fast : entries will be deleted later)
 896  * delay >= 0 : invalidate & flush cache (can be long)
 897  */
 898 void rt_cache_flush(struct net *net, int delay)
 899 {
 900         rt_cache_invalidate(net);
 901         if (delay >= 0)
 902                 rt_do_flush(net, !in_softirq());
 903 }
 904
 905 /* Flush previous cache invalidated entries from the cache */
 906 void rt_cache_flush_batch(struct net *net)
 907 {
 908         rt_do_flush(net, !in_softirq());
 909 }
 910
 911 static void rt_emergency_hash_rebuild(struct net *net)
 912 {
 913         net_warn_ratelimited("Route hash chain too long!\n");
 914         rt_cache_invalidate(net);
 915 }
 916
 917 /*
 918    Short description of GC goals.
 919
 920    We want to build algorithm, which will keep routing cache
 921    at some equilibrium point, when number of aged off entries
 922    is kept approximately equal to newly generated ones.
 923
 924    Current expiration strength is variable "expire".
 925    We try to adjust it dynamically, so that if networking
 926    is idle expires is large enough to keep enough of warm entries,
 927    and when load increases it reduces to limit cache size.
 928  */
 929
 930 static int rt_garbage_collect(struct dst_ops *ops)
 931 {
 932         static unsigned long expire = RT_GC_TIMEOUT;
 933         static unsigned long last_gc;
 934         static int rover;
 935         static int equilibrium;
 936         struct rtable *rth;
 937         struct rtable __rcu **rthp;
 938         unsigned long now = jiffies;
 939         int goal;
 940         int entries = dst_entries_get_fast(&ipv4_dst_ops);
 941
 942         /*
 943          * Garbage collection is pretty expensive,
 944          * do not make it too frequently.
 945          */
 946
 947         RT_CACHE_STAT_INC(gc_total);
 948
 949         if (now - last_gc < ip_rt_gc_min_interval &&
 950             entries < ip_rt_max_size) {
 951                 RT_CACHE_STAT_INC(gc_ignored);
 952                 goto out;
 953         }
 954
 955         entries = dst_entries_get_slow(&ipv4_dst_ops);
 956         /* Calculate number of entries, which we want to expire now. */
 957         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
 958         if (goal <= 0) {
 959                 if (equilibrium < ipv4_dst_ops.gc_thresh)
 960                         equilibrium = ipv4_dst_ops.gc_thresh;
 961                 goal = entries - equilibrium;
 962                 if (goal > 0) {
 963                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 964                         goal = entries - equilibrium;
 965                 }
 966         } else {
 967                 /* We are in dangerous area. Try to reduce cache really
 968                  * aggressively.
 969                  */
 970                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 971                 equilibrium = entries - goal;
 972         }
 973
 974         if (now - last_gc >= ip_rt_gc_min_interval)
 975                 last_gc = now;
 976
 977         if (goal <= 0) {
 978                 equilibrium += goal;
 979                 goto work_done;
 980         }
 981
 982         do {
 983                 int i, k;
 984
 985                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
 986                         unsigned long tmo = expire;
 987
 988                         k = (k + 1) & rt_hash_mask;
 989                         rthp = &rt_hash_table[k].chain;
 990                         spin_lock_bh(rt_hash_lock_addr(k));
 991                         while ((rth = rcu_dereference_protected(*rthp,
 992                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
 993                                 if (!rt_is_expired(rth) &&
 994                                         !rt_may_expire(rth, tmo, expire)) {
 995                                         tmo >>= 1;
 996                                         rthp = &rth->dst.rt_next;
 997                                         continue;
 998                                 }
 999                                 *rthp = rth->dst.rt_next;
1000                                 rt_free(rth);
1001                                 goal--;
1002                         }
1003                         spin_unlock_bh(rt_hash_lock_addr(k));
1004                         if (goal <= 0)
1005                                 break;
1006                 }
1007                 rover = k;
1008
1009                 if (goal <= 0)
1010                         goto work_done;
1011
1012                 /* Goal is not achieved. We stop process if:
1013
1014                    - if expire reduced to zero. Otherwise, expire is halfed.
1015                    - if table is not full.
1016                    - if we are called from interrupt.
1017                    - jiffies check is just fallback/debug loop breaker.
1018                      We will not spin here for long time in any case.
1019                  */
1020
1021                 RT_CACHE_STAT_INC(gc_goal_miss);
1022
1023                 if (expire == 0)
1024                         break;
1025
1026                 expire >>= 1;
1027
1028                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1029                         goto out;
1030         } while (!in_softirq() && time_before_eq(jiffies, now));
1031
1032         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1033                 goto out;
1034         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1035                 goto out;
1036         net_warn_ratelimited("dst cache overflow\n");
1037         RT_CACHE_STAT_INC(gc_dst_overflow);
1038         return 1;
1039
1040 work_done:
1041         expire += ip_rt_gc_min_interval;
1042         if (expire > ip_rt_gc_timeout ||
1043             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1044             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1045                 expire = ip_rt_gc_timeout;
1046 out:    return 0;
1047 }
1048
1049 /*
1050  * Returns number of entries in a hash chain that have different hash_inputs
1051  */
1052 static int slow_chain_length(const struct rtable *head)
1053 {
1054         int length = 0;
1055         const struct rtable *rth = head;
1056
1057         while (rth) {
1058                 length += has_noalias(head, rth);
1059                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1060         }
1061         return length >> FRACT_BITS;
1062 }
1063
1064 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
1065                                            struct sk_buff *skb,
1066                                            const void *daddr)
1067 {
1068         struct net_device *dev = dst->dev;
1069         const __be32 *pkey = daddr;
1070         const struct rtable *rt;
1071         struct neighbour *n;
1072
1073         rt = (const struct rtable *) dst;
1074         if (rt->rt_gateway)
1075                 pkey = (const __be32 *) &rt->rt_gateway;
1076         else if (skb)
1077                 pkey = &ip_hdr(skb)->daddr;
1078
1079         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
1080         if (n)
1081                 return n;
1082         return neigh_create(&arp_tbl, pkey, dev);
1083 }
1084
1085 static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
1086                                      struct sk_buff *skb, int ifindex)
1087 {
1088         struct rtable   *rth, *cand;
1089         struct rtable __rcu **rthp, **candp;
1090         unsigned long   now;
1091         u32             min_score;
1092         int             chain_length;
1093
1094 restart:
1095         chain_length = 0;
1096         min_score = ~(u32)0;
1097         cand = NULL;
1098         candp = NULL;
1099         now = jiffies;
1100
1101         if (!rt_caching(dev_net(rt->dst.dev)) || (rt->dst.flags & DST_NOCACHE)) {
1102                 /*
1103                  * If we're not caching, just tell the caller we
1104                  * were successful and don't touch the route.  The
1105                  * caller hold the sole reference to the cache entry, and
1106                  * it will be released when the caller is done with it.
1107                  * If we drop it here, the callers have no way to resolve routes
1108                  * when we're not caching.  Instead, just point *rp at rt, so
1109                  * the caller gets a single use out of the route
1110                  * Note that we do rt_free on this new route entry, so that
1111                  * once its refcount hits zero, we are still able to reap it
1112                  * (Thanks Alexey)
1113                  * Note: To avoid expensive rcu stuff for this uncached dst,
1114                  * we set DST_NOCACHE so that dst_release() can free dst without
1115                  * waiting a grace period.
1116                  */
1117
1118                 rt->dst.flags |= DST_NOCACHE;
1119                 goto skip_hashing;
1120         }
1121
1122         rthp = &rt_hash_table[hash].chain;
1123
1124         spin_lock_bh(rt_hash_lock_addr(hash));
1125         while ((rth = rcu_dereference_protected(*rthp,
1126                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1127                 if (rt_is_expired(rth)) {
1128                         *rthp = rth->dst.rt_next;
1129                         rt_free(rth);
1130                         continue;
1131                 }
1132                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1133                         /* Put it first */
1134                         *rthp = rth->dst.rt_next;
1135                         /*
1136                          * Since lookup is lockfree, the deletion
1137                          * must be visible to another weakly ordered CPU before
1138                          * the insertion at the start of the hash chain.
1139                          */
1140                         rcu_assign_pointer(rth->dst.rt_next,
1141                                            rt_hash_table[hash].chain);
1142                         /*
1143                          * Since lookup is lockfree, the update writes
1144                          * must be ordered for consistency on SMP.
1145                          */
1146                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1147
1148                         dst_use(&rth->dst, now);
1149                         spin_unlock_bh(rt_hash_lock_addr(hash));
1150
1151                         rt_drop(rt);
1152                         if (skb)
1153                                 skb_dst_set(skb, &rth->dst);
1154                         return rth;
1155                 }
1156
1157                 if (!atomic_read(&rth->dst.__refcnt)) {
1158                         u32 score = rt_score(rth);
1159
1160                         if (score <= min_score) {
1161                                 cand = rth;
1162                                 candp = rthp;
1163                                 min_score = score;
1164                         }
1165                 }
1166
1167                 chain_length++;
1168
1169                 rthp = &rth->dst.rt_next;
1170         }
1171
1172         if (cand) {
1173                 /* ip_rt_gc_elasticity used to be average length of chain
1174                  * length, when exceeded gc becomes really aggressive.
1175                  *
1176                  * The second limit is less certain. At the moment it allows
1177                  * only 2 entries per bucket. We will see.
1178                  */
1179                 if (chain_length > ip_rt_gc_elasticity) {
1180                         *candp = cand->dst.rt_next;
1181                         rt_free(cand);
1182                 }
1183         } else {
1184                 if (chain_length > rt_chain_length_max &&
1185                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1186                         struct net *net = dev_net(rt->dst.dev);
1187                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1188                         if (!rt_caching(net)) {
1189                                 pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
1190                                         rt->dst.dev->name, num);
1191                         }
1192                         rt_emergency_hash_rebuild(net);
1193                         spin_unlock_bh(rt_hash_lock_addr(hash));
1194
1195                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1196                                         ifindex, rt_genid(net));
1197                         goto restart;
1198                 }
1199         }
1200
1201         rt->dst.rt_next = rt_hash_table[hash].chain;
1202
1203         /*
1204          * Since lookup is lockfree, we must make sure
1205          * previous writes to rt are committed to memory
1206          * before making rt visible to other CPUS.
1207          */
1208         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1209
1210         spin_unlock_bh(rt_hash_lock_addr(hash));
1211
1212 skip_hashing:
1213         if (skb)
1214                 skb_dst_set(skb, &rt->dst);
1215         return rt;
1216 }
1217
1218 /*
1219  * Peer allocation may fail only in serious out-of-memory conditions.  However
1220  * we still can generate some output.
1221  * Random ID selection looks a bit dangerous because we have no chances to
1222  * select ID being unique in a reasonable period of time.
1223  * But broken packet identifier may be better than no packet at all.
1224  */
1225 static void ip_select_fb_ident(struct iphdr *iph)
1226 {
1227         static DEFINE_SPINLOCK(ip_fb_id_lock);
1228         static u32 ip_fallback_id;
1229         u32 salt;
1230
1231         spin_lock_bh(&ip_fb_id_lock);
1232         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1233         iph->id = htons(salt & 0xFFFF);
1234         ip_fallback_id = salt;
1235         spin_unlock_bh(&ip_fb_id_lock);
1236 }
1237
1238 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1239 {
1240         struct net *net = dev_net(dst->dev);
1241         struct inet_peer *peer;
1242
1243         peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
1244         if (peer) {
1245                 iph->id = htons(inet_getid(peer, more));
1246                 inet_putpeer(peer);
1247                 return;
1248         }
1249
1250         ip_select_fb_ident(iph);
1251 }
1252 EXPORT_SYMBOL(__ip_select_ident);
1253
1254 static void rt_del(unsigned int hash, struct rtable *rt)
1255 {
1256         struct rtable __rcu **rthp;
1257         struct rtable *aux;
1258
1259         rthp = &rt_hash_table[hash].chain;
1260         spin_lock_bh(rt_hash_lock_addr(hash));
1261         ip_rt_put(rt);
1262         while ((aux = rcu_dereference_protected(*rthp,
1263                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1264                 if (aux == rt || rt_is_expired(aux)) {
1265                         *rthp = aux->dst.rt_next;
1266                         rt_free(aux);
1267                         continue;
1268                 }
1269                 rthp = &aux->dst.rt_next;
1270         }
1271         spin_unlock_bh(rt_hash_lock_addr(hash));
1272 }
1273
1274 static void ip_do_redirect(struct rtable *rt, __be32 old_gw, __be32 new_gw)
1275 {
1276         struct neighbour *n;
1277
1278         if (rt->rt_gateway != old_gw)
1279                 return;
1280
1281         n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
1282         if (n) {
1283                 if (!(n->nud_state & NUD_VALID)) {
1284                         neigh_event_send(n, NULL);
1285                 } else {
1286                         rt->rt_gateway = new_gw;
1287                         rt->rt_flags |= RTCF_REDIRECTED;
1288                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1289                 }
1290                 neigh_release(n);
1291         }
1292 }
1293
1294 /* called in rcu_read_lock() section */
1295 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1296                     __be32 saddr, struct net_device *dev)
1297 {
1298         int s, i;
1299         struct in_device *in_dev = __in_dev_get_rcu(dev);
1300         __be32 skeys[2] = { saddr, 0 };
1301         int    ikeys[2] = { dev->ifindex, 0 };
1302         struct net *net;
1303
1304         if (!in_dev)
1305                 return;
1306
1307         net = dev_net(dev);
1308         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1309             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1310             ipv4_is_zeronet(new_gw))
1311                 goto reject_redirect;
1312
1313         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1314                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1315                         goto reject_redirect;
1316                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1317                         goto reject_redirect;
1318         } else {
1319                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1320                         goto reject_redirect;
1321         }
1322
1323         for (s = 0; s < 2; s++) {
1324                 for (i = 0; i < 2; i++) {
1325                         unsigned int hash;
1326                         struct rtable __rcu **rthp;
1327                         struct rtable *rt;
1328
1329                         hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1330
1331                         rthp = &rt_hash_table[hash].chain;
1332
1333                         while ((rt = rcu_dereference(*rthp)) != NULL) {
1334                                 rthp = &rt->dst.rt_next;
1335
1336                                 if (rt->rt_key_dst != daddr ||
1337                                     rt->rt_key_src != skeys[s] ||
1338                                     rt->rt_oif != ikeys[i] ||
1339                                     rt_is_input_route(rt) ||
1340                                     rt_is_expired(rt) ||
1341                                     !net_eq(dev_net(rt->dst.dev), net) ||
1342                                     rt->dst.error ||
1343                                     rt->dst.dev != dev)
1344                                         continue;
1345
1346                                 ip_do_redirect(rt, old_gw, new_gw);
1347                         }
1348                 }
1349         }
1350         return;
1351
1352 reject_redirect:
1353 #ifdef CONFIG_IP_ROUTE_VERBOSE
1354         if (IN_DEV_LOG_MARTIANS(in_dev))
1355                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
1356                                      "  Advised path = %pI4 -> %pI4\n",
1357                                      &old_gw, dev->name, &new_gw,
1358                                      &saddr, &daddr);
1359 #endif
1360         ;
1361 }
1362
1363 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1364 {
1365         struct rtable *rt = (struct rtable *)dst;
1366         struct dst_entry *ret = dst;
1367
1368         if (rt) {
1369                 if (dst->obsolete > 0) {
1370                         ip_rt_put(rt);
1371                         ret = NULL;
1372                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1373                            rt->dst.expires) {
1374                         unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1375                                                 rt->rt_oif,
1376                                                 rt_genid(dev_net(dst->dev)));
1377                         rt_del(hash, rt);
1378                         ret = NULL;
1379                 }
1380         }
1381         return ret;
1382 }
1383
1384 /*
1385  * Algorithm:
1386  *      1. The first ip_rt_redirect_number redirects are sent
1387  *         with exponential backoff, then we stop sending them at all,
1388  *         assuming that the host ignores our redirects.
1389  *      2. If we did not see packets requiring redirects
1390  *         during ip_rt_redirect_silence, we assume that the host
1391  *         forgot redirected route and start to send redirects again.
1392  *
1393  * This algorithm is much cheaper and more intelligent than dumb load limiting
1394  * in icmp.c.
1395  *
1396  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1397  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1398  */
1399
1400 void ip_rt_send_redirect(struct sk_buff *skb)
1401 {
1402         struct rtable *rt = skb_rtable(skb);
1403         struct in_device *in_dev;
1404         struct inet_peer *peer;
1405         struct net *net;
1406         int log_martians;
1407
1408         rcu_read_lock();
1409         in_dev = __in_dev_get_rcu(rt->dst.dev);
1410         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1411                 rcu_read_unlock();
1412                 return;
1413         }
1414         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1415         rcu_read_unlock();
1416
1417         net = dev_net(rt->dst.dev);
1418         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
1419         if (!peer) {
1420                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1421                 return;
1422         }
1423
1424         /* No redirected packets during ip_rt_redirect_silence;
1425          * reset the algorithm.
1426          */
1427         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1428                 peer->rate_tokens = 0;
1429
1430         /* Too many ignored redirects; do not send anything
1431          * set dst.rate_last to the last seen redirected packet.
1432          */
1433         if (peer->rate_tokens >= ip_rt_redirect_number) {
1434                 peer->rate_last = jiffies;
1435                 goto out_put_peer;
1436         }
1437
1438         /* Check for load limit; set rate_last to the latest sent
1439          * redirect.
1440          */
1441         if (peer->rate_tokens == 0 ||
1442             time_after(jiffies,
1443                        (peer->rate_last +
1444                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1445                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1446                 peer->rate_last = jiffies;
1447                 ++peer->rate_tokens;
1448 #ifdef CONFIG_IP_ROUTE_VERBOSE
1449                 if (log_martians &&
1450                     peer->rate_tokens == ip_rt_redirect_number)
1451                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
1452                                              &ip_hdr(skb)->saddr, rt->rt_iif,
1453                                              &rt->rt_dst, &rt->rt_gateway);
1454 #endif
1455         }
1456 out_put_peer:
1457         inet_putpeer(peer);
1458 }
1459
1460 static int ip_error(struct sk_buff *skb)
1461 {
1462         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
1463         struct rtable *rt = skb_rtable(skb);
1464         struct inet_peer *peer;
1465         unsigned long now;
1466         struct net *net;
1467         bool send;
1468         int code;
1469
1470         net = dev_net(rt->dst.dev);
1471         if (!IN_DEV_FORWARD(in_dev)) {
1472                 switch (rt->dst.error) {
1473                 case EHOSTUNREACH:
1474                         IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
1475                         break;
1476
1477                 case ENETUNREACH:
1478                         IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
1479                         break;
1480                 }
1481                 goto out;
1482         }
1483
1484         switch (rt->dst.error) {
1485         case EINVAL:
1486         default:
1487                 goto out;
1488         case EHOSTUNREACH:
1489                 code = ICMP_HOST_UNREACH;
1490                 break;
1491         case ENETUNREACH:
1492                 code = ICMP_NET_UNREACH;
1493                 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
1494                 break;
1495         case EACCES:
1496                 code = ICMP_PKT_FILTERED;
1497                 break;
1498         }
1499
1500         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
1501
1502         send = true;
1503         if (peer) {
1504                 now = jiffies;
1505                 peer->rate_tokens += now - peer->rate_last;
1506                 if (peer->rate_tokens > ip_rt_error_burst)
1507                         peer->rate_tokens = ip_rt_error_burst;
1508                 peer->rate_last = now;
1509                 if (peer->rate_tokens >= ip_rt_error_cost)
1510                         peer->rate_tokens -= ip_rt_error_cost;
1511                 else
1512                         send = false;
1513                 inet_putpeer(peer);
1514         }
1515         if (send)
1516                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1517
1518 out:    kfree_skb(skb);
1519         return 0;
1520 }
1521
1522 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1523 {
1524         struct rtable *rt = (struct rtable *) dst;
1525
1526         dst_confirm(dst);
1527
1528         if (mtu < ip_rt_min_pmtu)
1529                 mtu = ip_rt_min_pmtu;
1530
1531         rt->rt_pmtu = mtu;
1532         dst_set_expires(&rt->dst, ip_rt_mtu_expires);
1533 }
1534
1535 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1536                       int oif, u32 mark, u8 protocol, int flow_flags)
1537 {
1538         const struct iphdr *iph = (const struct iphdr *)skb->data;
1539         struct flowi4 fl4;
1540         struct rtable *rt;
1541
1542         flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
1543                            protocol, flow_flags,
1544                            iph->daddr, iph->saddr, 0, 0);
1545         rt = __ip_route_output_key(net, &fl4);
1546         if (!IS_ERR(rt)) {
1547                 ip_rt_update_pmtu(&rt->dst, mtu);
1548                 ip_rt_put(rt);
1549         }
1550 }
1551 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1552
1553 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1554 {
1555         const struct inet_sock *inet = inet_sk(sk);
1556
1557         return ipv4_update_pmtu(skb, sock_net(sk), mtu,
1558                                 sk->sk_bound_dev_if, sk->sk_mark,
1559                                 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
1560                                 inet_sk_flowi_flags(sk));
1561 }
1562 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1563
1564 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1565 {
1566         struct rtable *rt = (struct rtable *) dst;
1567
1568         if (rt_is_expired(rt))
1569                 return NULL;
1570         return dst;
1571 }
1572
1573 static void ipv4_dst_destroy(struct dst_entry *dst)
1574 {
1575         struct rtable *rt = (struct rtable *) dst;
1576
1577         if (rt->fi) {
1578                 fib_info_put(rt->fi);
1579                 rt->fi = NULL;
1580         }
1581 }
1582
1583
1584 static void ipv4_link_failure(struct sk_buff *skb)
1585 {
1586         struct rtable *rt;
1587
1588         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1589
1590         rt = skb_rtable(skb);
1591         if (rt)
1592                 dst_set_expires(&rt->dst, 0);
1593 }
1594
1595 static int ip_rt_bug(struct sk_buff *skb)
1596 {
1597         pr_debug("%s: %pI4 -> %pI4, %s\n",
1598                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1599                  skb->dev ? skb->dev->name : "?");
1600         kfree_skb(skb);
1601         WARN_ON(1);
1602         return 0;
1603 }
1604
1605 /*
1606    We do not cache source address of outgoing interface,
1607    because it is used only by IP RR, TS and SRR options,
1608    so that it out of fast path.
1609
1610    BTW remember: "addr" is allowed to be not aligned
1611    in IP options!
1612  */
1613
1614 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1615 {
1616         __be32 src;
1617
1618         if (rt_is_output_route(rt))
1619                 src = ip_hdr(skb)->saddr;
1620         else {
1621                 struct fib_result res;
1622                 struct flowi4 fl4;
1623                 struct iphdr *iph;
1624
1625                 iph = ip_hdr(skb);
1626
1627                 memset(&fl4, 0, sizeof(fl4));
1628                 fl4.daddr = iph->daddr;
1629                 fl4.saddr = iph->saddr;
1630                 fl4.flowi4_tos = RT_TOS(iph->tos);
1631                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1632                 fl4.flowi4_iif = skb->dev->ifindex;
1633                 fl4.flowi4_mark = skb->mark;
1634
1635                 rcu_read_lock();
1636                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1637                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1638                 else
1639                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1640                                         RT_SCOPE_UNIVERSE);
1641                 rcu_read_unlock();
1642         }
1643         memcpy(addr, &src, 4);
1644 }
1645
1646 #ifdef CONFIG_IP_ROUTE_CLASSID
1647 static void set_class_tag(struct rtable *rt, u32 tag)
1648 {
1649         if (!(rt->dst.tclassid & 0xFFFF))
1650                 rt->dst.tclassid |= tag & 0xFFFF;
1651         if (!(rt->dst.tclassid & 0xFFFF0000))
1652                 rt->dst.tclassid |= tag & 0xFFFF0000;
1653 }
1654 #endif
1655
1656 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1657 {
1658         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1659
1660         if (advmss == 0) {
1661                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1662                                ip_rt_min_advmss);
1663                 if (advmss > 65535 - 40)
1664                         advmss = 65535 - 40;
1665         }
1666         return advmss;
1667 }
1668
1669 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1670 {
1671         const struct rtable *rt = (const struct rtable *) dst;
1672         unsigned int mtu = rt->rt_pmtu;
1673
1674         if (mtu && time_after_eq(jiffies, rt->dst.expires))
1675                 mtu = 0;
1676
1677         if (!mtu)
1678                 mtu = dst_metric_raw(dst, RTAX_MTU);
1679
1680         if (mtu && rt_is_output_route(rt))
1681                 return mtu;
1682
1683         mtu = dst->dev->mtu;
1684
1685         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1686
1687                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1688                         mtu = 576;
1689         }
1690
1691         if (mtu > IP_MAX_MTU)
1692                 mtu = IP_MAX_MTU;
1693
1694         return mtu;
1695 }
1696
1697 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1698                             struct fib_info *fi)
1699 {
1700         if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1701                 rt->fi = fi;
1702                 atomic_inc(&fi->fib_clntref);
1703         }
1704         dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1705 }
1706
1707 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1708                            const struct fib_result *res,
1709                            struct fib_info *fi, u16 type, u32 itag)
1710 {
1711         if (fi) {
1712                 if (FIB_RES_GW(*res) &&
1713                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1714                         rt->rt_gateway = FIB_RES_GW(*res);
1715                 rt_init_metrics(rt, fl4, fi);
1716 #ifdef CONFIG_IP_ROUTE_CLASSID
1717                 rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1718 #endif
1719         }
1720
1721 #ifdef CONFIG_IP_ROUTE_CLASSID
1722 #ifdef CONFIG_IP_MULTIPLE_TABLES
1723         set_class_tag(rt, fib_rules_tclass(res));
1724 #endif
1725         set_class_tag(rt, itag);
1726 #endif
1727 }
1728
1729 static struct rtable *rt_dst_alloc(struct net_device *dev,
1730                                    bool nopolicy, bool noxfrm)
1731 {
1732         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1733                          DST_HOST |
1734                          (nopolicy ? DST_NOPOLICY : 0) |
1735                          (noxfrm ? DST_NOXFRM : 0));
1736 }
1737
1738 /* called in rcu_read_lock() section */
1739 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1740                                 u8 tos, struct net_device *dev, int our)
1741 {
1742         unsigned int hash;
1743         struct rtable *rth;
1744         struct in_device *in_dev = __in_dev_get_rcu(dev);
1745         u32 itag = 0;
1746         int err;
1747
1748         /* Primary sanity checks. */
1749
1750         if (in_dev == NULL)
1751                 return -EINVAL;
1752
1753         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1754             skb->protocol != htons(ETH_P_IP))
1755                 goto e_inval;
1756
1757         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1758                 if (ipv4_is_loopback(saddr))
1759                         goto e_inval;
1760
1761         if (ipv4_is_zeronet(saddr)) {
1762                 if (!ipv4_is_local_multicast(daddr))
1763                         goto e_inval;
1764         } else {
1765                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1766                                           in_dev, &itag);
1767                 if (err < 0)
1768                         goto e_err;
1769         }
1770         rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1771                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1772         if (!rth)
1773                 goto e_nobufs;
1774
1775 #ifdef CONFIG_IP_ROUTE_CLASSID
1776         rth->dst.tclassid = itag;
1777 #endif
1778         rth->dst.output = ip_rt_bug;
1779
1780         rth->rt_key_dst = daddr;
1781         rth->rt_key_src = saddr;
1782         rth->rt_genid   = rt_genid(dev_net(dev));
1783         rth->rt_flags   = RTCF_MULTICAST;
1784         rth->rt_type    = RTN_MULTICAST;
1785         rth->rt_key_tos = tos;
1786         rth->rt_dst     = daddr;
1787         rth->rt_src     = saddr;
1788         rth->rt_route_iif = dev->ifindex;
1789         rth->rt_iif     = dev->ifindex;
1790         rth->rt_oif     = 0;
1791         rth->rt_mark    = skb->mark;
1792         rth->rt_pmtu    = 0;
1793         rth->rt_gateway = daddr;
1794         rth->fi = NULL;
1795         if (our) {
1796                 rth->dst.input= ip_local_deliver;
1797                 rth->rt_flags |= RTCF_LOCAL;
1798         }
1799
1800 #ifdef CONFIG_IP_MROUTE
1801         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1802                 rth->dst.input = ip_mr_input;
1803 #endif
1804         RT_CACHE_STAT_INC(in_slow_mc);
1805
1806         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1807         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
1808         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
1809
1810 e_nobufs:
1811         return -ENOBUFS;
1812 e_inval:
1813         return -EINVAL;
1814 e_err:
1815         return err;
1816 }
1817
1818
1819 static void ip_handle_martian_source(struct net_device *dev,
1820                                      struct in_device *in_dev,
1821                                      struct sk_buff *skb,
1822                                      __be32 daddr,
1823                                      __be32 saddr)
1824 {
1825         RT_CACHE_STAT_INC(in_martian_src);
1826 #ifdef CONFIG_IP_ROUTE_VERBOSE
1827         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1828                 /*
1829                  *      RFC1812 recommendation, if source is martian,
1830                  *      the only hint is MAC header.
1831                  */
1832                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1833                         &daddr, &saddr, dev->name);
1834                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1835                         print_hex_dump(KERN_WARNING, "ll header: ",
1836                                        DUMP_PREFIX_OFFSET, 16, 1,
1837                                        skb_mac_header(skb),
1838                                        dev->hard_header_len, true);
1839                 }
1840         }
1841 #endif
1842 }
1843
1844 /* called in rcu_read_lock() section */
1845 static int __mkroute_input(struct sk_buff *skb,
1846                            const struct fib_result *res,
1847                            struct in_device *in_dev,
1848                            __be32 daddr, __be32 saddr, u32 tos,
1849                            struct rtable **result)
1850 {
1851         struct rtable *rth;
1852         int err;
1853         struct in_device *out_dev;
1854         unsigned int flags = 0;
1855         u32 itag;
1856
1857         /* get a working reference to the output device */
1858         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1859         if (out_dev == NULL) {
1860                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1861                 return -EINVAL;
1862         }
1863
1864
1865         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1866                                   in_dev->dev, in_dev, &itag);
1867         if (err < 0) {
1868                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1869                                          saddr);
1870
1871                 goto cleanup;
1872         }
1873
1874         if (err)
1875                 flags |= RTCF_DIRECTSRC;
1876
1877         if (out_dev == in_dev && err &&
1878             (IN_DEV_SHARED_MEDIA(out_dev) ||
1879              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1880                 flags |= RTCF_DOREDIRECT;
1881
1882         if (skb->protocol != htons(ETH_P_IP)) {
1883                 /* Not IP (i.e. ARP). Do not create route, if it is
1884                  * invalid for proxy arp. DNAT routes are always valid.
1885                  *
1886                  * Proxy arp feature have been extended to allow, ARP
1887                  * replies back to the same interface, to support
1888                  * Private VLAN switch technologies. See arp.c.
1889                  */
1890                 if (out_dev == in_dev &&
1891                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1892                         err = -EINVAL;
1893                         goto cleanup;
1894                 }
1895         }
1896
1897         rth = rt_dst_alloc(out_dev->dev,
1898                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1899                            IN_DEV_CONF_GET(out_dev, NOXFRM));
1900         if (!rth) {
1901                 err = -ENOBUFS;
1902                 goto cleanup;
1903         }
1904
1905         rth->rt_key_dst = daddr;
1906         rth->rt_key_src = saddr;
1907         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1908         rth->rt_flags = flags;
1909         rth->rt_type = res->type;
1910         rth->rt_key_tos = tos;
1911         rth->rt_dst     = daddr;
1912         rth->rt_src     = saddr;
1913         rth->rt_route_iif = in_dev->dev->ifindex;
1914         rth->rt_iif     = in_dev->dev->ifindex;
1915         rth->rt_oif     = 0;
1916         rth->rt_mark    = skb->mark;
1917         rth->rt_pmtu    = 0;
1918         rth->rt_gateway = daddr;
1919         rth->fi = NULL;
1920
1921         rth->dst.input = ip_forward;
1922         rth->dst.output = ip_output;
1923
1924         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
1925
1926         *result = rth;
1927         err = 0;
1928  cleanup:
1929         return err;
1930 }
1931
1932 static int ip_mkroute_input(struct sk_buff *skb,
1933                             struct fib_result *res,
1934                             const struct flowi4 *fl4,
1935                             struct in_device *in_dev,
1936                             __be32 daddr, __be32 saddr, u32 tos)
1937 {
1938         struct rtable *rth = NULL;
1939         int err;
1940         unsigned int hash;
1941
1942 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1943         if (res->fi && res->fi->fib_nhs > 1)
1944                 fib_select_multipath(res);
1945 #endif
1946
1947         /* create a routing cache entry */
1948         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1949         if (err)
1950                 return err;
1951
1952         /* put it into the cache */
1953         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
1954                        rt_genid(dev_net(rth->dst.dev)));
1955         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
1956         if (IS_ERR(rth))
1957                 return PTR_ERR(rth);
1958         return 0;
1959 }
1960
1961 /*
1962  *      NOTE. We drop all the packets that has local source
1963  *      addresses, because every properly looped back packet
1964  *      must have correct destination already attached by output routine.
1965  *
1966  *      Such approach solves two big problems:
1967  *      1. Not simplex devices are handled properly.
1968  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1969  *      called with rcu_read_lock()
1970  */
1971
1972 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1973                                u8 tos, struct net_device *dev)
1974 {
1975         struct fib_result res;
1976         struct in_device *in_dev = __in_dev_get_rcu(dev);
1977         struct flowi4   fl4;
1978         unsigned int    flags = 0;
1979         u32             itag = 0;
1980         struct rtable   *rth;
1981         unsigned int    hash;
1982         int             err = -EINVAL;
1983         struct net    *net = dev_net(dev);
1984
1985         /* IP on this device is disabled. */
1986
1987         if (!in_dev)
1988                 goto out;
1989
1990         /* Check for the most weird martians, which can be not detected
1991            by fib_lookup.
1992          */
1993
1994         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1995                 goto martian_source;
1996
1997         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1998                 goto brd_input;
1999
2000         /* Accept zero addresses only to limited broadcast;
2001          * I even do not know to fix it or not. Waiting for complains :-)
2002          */
2003         if (ipv4_is_zeronet(saddr))
2004                 goto martian_source;
2005
2006         if (ipv4_is_zeronet(daddr))
2007                 goto martian_destination;
2008
2009         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
2010                 if (ipv4_is_loopback(daddr))
2011                         goto martian_destination;
2012
2013                 if (ipv4_is_loopback(saddr))
2014                         goto martian_source;
2015         }
2016
2017         /*
2018          *      Now we are ready to route packet.
2019          */
2020         fl4.flowi4_oif = 0;
2021         fl4.flowi4_iif = dev->ifindex;
2022         fl4.flowi4_mark = skb->mark;
2023         fl4.flowi4_tos = tos;
2024         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2025         fl4.daddr = daddr;
2026         fl4.saddr = saddr;
2027         err = fib_lookup(net, &fl4, &res);
2028         if (err != 0)
2029                 goto no_route;
2030
2031         RT_CACHE_STAT_INC(in_slow_tot);
2032
2033         if (res.type == RTN_BROADCAST)
2034                 goto brd_input;
2035
2036         if (res.type == RTN_LOCAL) {
2037                 err = fib_validate_source(skb, saddr, daddr, tos,
2038                                           net->loopback_dev->ifindex,
2039                                           dev, in_dev, &itag);
2040                 if (err < 0)
2041                         goto martian_source_keep_err;
2042                 if (err)
2043                         flags |= RTCF_DIRECTSRC;
2044                 goto local_input;
2045         }
2046
2047         if (!IN_DEV_FORWARD(in_dev))
2048                 goto no_route;
2049         if (res.type != RTN_UNICAST)
2050                 goto martian_destination;
2051
2052         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2053 out:    return err;
2054
2055 brd_input:
2056         if (skb->protocol != htons(ETH_P_IP))
2057                 goto e_inval;
2058
2059         if (!ipv4_is_zeronet(saddr)) {
2060                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2061                                           in_dev, &itag);
2062                 if (err < 0)
2063                         goto martian_source_keep_err;
2064                 if (err)
2065                         flags |= RTCF_DIRECTSRC;
2066         }
2067         flags |= RTCF_BROADCAST;
2068         res.type = RTN_BROADCAST;
2069         RT_CACHE_STAT_INC(in_brd);
2070
2071 local_input:
2072         rth = rt_dst_alloc(net->loopback_dev,
2073                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2074         if (!rth)
2075                 goto e_nobufs;
2076
2077         rth->dst.input= ip_local_deliver;
2078         rth->dst.output= ip_rt_bug;
2079 #ifdef CONFIG_IP_ROUTE_CLASSID
2080         rth->dst.tclassid = itag;
2081 #endif
2082
2083         rth->rt_key_dst = daddr;
2084         rth->rt_key_src = saddr;
2085         rth->rt_genid = rt_genid(net);
2086         rth->rt_flags   = flags|RTCF_LOCAL;
2087         rth->rt_type    = res.type;
2088         rth->rt_key_tos = tos;
2089         rth->rt_dst     = daddr;
2090         rth->rt_src     = saddr;
2091         rth->rt_route_iif = dev->ifindex;
2092         rth->rt_iif     = dev->ifindex;
2093         rth->rt_oif     = 0;
2094         rth->rt_mark    = skb->mark;
2095         rth->rt_pmtu    = 0;
2096         rth->rt_gateway = daddr;
2097         rth->fi = NULL;
2098         if (res.type == RTN_UNREACHABLE) {
2099                 rth->dst.input= ip_error;
2100                 rth->dst.error= -err;
2101                 rth->rt_flags   &= ~RTCF_LOCAL;
2102         }
2103         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2104         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2105         err = 0;
2106         if (IS_ERR(rth))
2107                 err = PTR_ERR(rth);
2108         goto out;
2109
2110 no_route:
2111         RT_CACHE_STAT_INC(in_no_route);
2112         res.type = RTN_UNREACHABLE;
2113         if (err == -ESRCH)
2114                 err = -ENETUNREACH;
2115         goto local_input;
2116
2117         /*
2118          *      Do not cache martian addresses: they should be logged (RFC1812)
2119          */
2120 martian_destination:
2121         RT_CACHE_STAT_INC(in_martian_dst);
2122 #ifdef CONFIG_IP_ROUTE_VERBOSE
2123         if (IN_DEV_LOG_MARTIANS(in_dev))
2124                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2125                                      &daddr, &saddr, dev->name);
2126 #endif
2127
2128 e_inval:
2129         err = -EINVAL;
2130         goto out;
2131
2132 e_nobufs:
2133         err = -ENOBUFS;
2134         goto out;
2135
2136 martian_source:
2137         err = -EINVAL;
2138 martian_source_keep_err:
2139         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2140         goto out;
2141 }
2142
2143 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2144                            u8 tos, struct net_device *dev, bool noref)
2145 {
2146         struct rtable   *rth;
2147         unsigned int    hash;
2148         int iif = dev->ifindex;
2149         struct net *net;
2150         int res;
2151
2152         net = dev_net(dev);
2153
2154         rcu_read_lock();
2155
2156         if (!rt_caching(net))
2157                 goto skip_cache;
2158
2159         tos &= IPTOS_RT_MASK;
2160         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2161
2162         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2163              rth = rcu_dereference(rth->dst.rt_next)) {
2164                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2165                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2166                      (rth->rt_route_iif ^ iif) |
2167                      (rth->rt_key_tos ^ tos)) == 0 &&
2168                     rth->rt_mark == skb->mark &&
2169                     net_eq(dev_net(rth->dst.dev), net) &&
2170                     !rt_is_expired(rth)) {
2171                         if (noref) {
2172                                 dst_use_noref(&rth->dst, jiffies);
2173                                 skb_dst_set_noref(skb, &rth->dst);
2174                         } else {
2175                                 dst_use(&rth->dst, jiffies);
2176                                 skb_dst_set(skb, &rth->dst);
2177                         }
2178                         RT_CACHE_STAT_INC(in_hit);
2179                         rcu_read_unlock();
2180                         return 0;
2181                 }
2182                 RT_CACHE_STAT_INC(in_hlist_search);
2183         }
2184
2185 skip_cache:
2186         /* Multicast recognition logic is moved from route cache to here.
2187            The problem was that too many Ethernet cards have broken/missing
2188            hardware multicast filters :-( As result the host on multicasting
2189            network acquires a lot of useless route cache entries, sort of
2190            SDR messages from all the world. Now we try to get rid of them.
2191            Really, provided software IP multicast filter is organized
2192            reasonably (at least, hashed), it does not result in a slowdown
2193            comparing with route cache reject entries.
2194            Note, that multicast routers are not affected, because
2195            route cache entry is created eventually.
2196          */
2197         if (ipv4_is_multicast(daddr)) {
2198                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2199
2200                 if (in_dev) {
2201                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2202                                                   ip_hdr(skb)->protocol);
2203                         if (our
2204 #ifdef CONFIG_IP_MROUTE
2205                                 ||
2206                             (!ipv4_is_local_multicast(daddr) &&
2207                              IN_DEV_MFORWARD(in_dev))
2208 #endif
2209                            ) {
2210                                 int res = ip_route_input_mc(skb, daddr, saddr,
2211                                                             tos, dev, our);
2212                                 rcu_read_unlock();
2213                                 return res;
2214                         }
2215                 }
2216                 rcu_read_unlock();
2217                 return -EINVAL;
2218         }
2219         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2220         rcu_read_unlock();
2221         return res;
2222 }
2223 EXPORT_SYMBOL(ip_route_input_common);
2224
2225 /* called with rcu_read_lock() */
2226 static struct rtable *__mkroute_output(const struct fib_result *res,
2227                                        const struct flowi4 *fl4,
2228                                        __be32 orig_daddr, __be32 orig_saddr,
2229                                        int orig_oif, __u8 orig_rtos,
2230                                        struct net_device *dev_out,
2231                                        unsigned int flags)
2232 {
2233         struct fib_info *fi = res->fi;
2234         struct in_device *in_dev;
2235         u16 type = res->type;
2236         struct rtable *rth;
2237
2238         in_dev = __in_dev_get_rcu(dev_out);
2239         if (!in_dev)
2240                 return ERR_PTR(-EINVAL);
2241
2242         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2243                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2244                         return ERR_PTR(-EINVAL);
2245
2246         if (ipv4_is_lbcast(fl4->daddr))
2247                 type = RTN_BROADCAST;
2248         else if (ipv4_is_multicast(fl4->daddr))
2249                 type = RTN_MULTICAST;
2250         else if (ipv4_is_zeronet(fl4->daddr))
2251                 return ERR_PTR(-EINVAL);
2252
2253         if (dev_out->flags & IFF_LOOPBACK)
2254                 flags |= RTCF_LOCAL;
2255
2256         if (type == RTN_BROADCAST) {
2257                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2258                 fi = NULL;
2259         } else if (type == RTN_MULTICAST) {
2260                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2261                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2262                                      fl4->flowi4_proto))
2263                         flags &= ~RTCF_LOCAL;
2264                 /* If multicast route do not exist use
2265                  * default one, but do not gateway in this case.
2266                  * Yes, it is hack.
2267                  */
2268                 if (fi && res->prefixlen < 4)
2269                         fi = NULL;
2270         }
2271
2272         rth = rt_dst_alloc(dev_out,
2273                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2274                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2275         if (!rth)
2276                 return ERR_PTR(-ENOBUFS);
2277
2278         rth->dst.output = ip_output;
2279
2280         rth->rt_key_dst = orig_daddr;
2281         rth->rt_key_src = orig_saddr;
2282         rth->rt_genid = rt_genid(dev_net(dev_out));
2283         rth->rt_flags   = flags;
2284         rth->rt_type    = type;
2285         rth->rt_key_tos = orig_rtos;
2286         rth->rt_dst     = fl4->daddr;
2287         rth->rt_src     = fl4->saddr;
2288         rth->rt_route_iif = 0;
2289         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2290         rth->rt_oif     = orig_oif;
2291         rth->rt_mark    = fl4->flowi4_mark;
2292         rth->rt_pmtu    = 0;
2293         rth->rt_gateway = fl4->daddr;
2294         rth->fi = NULL;
2295
2296         RT_CACHE_STAT_INC(out_slow_tot);
2297
2298         if (flags & RTCF_LOCAL)
2299                 rth->dst.input = ip_local_deliver;
2300         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2301                 if (flags & RTCF_LOCAL &&
2302                     !(dev_out->flags & IFF_LOOPBACK)) {
2303                         rth->dst.output = ip_mc_output;
2304                         RT_CACHE_STAT_INC(out_slow_mc);
2305                 }
2306 #ifdef CONFIG_IP_MROUTE
2307                 if (type == RTN_MULTICAST) {
2308                         if (IN_DEV_MFORWARD(in_dev) &&
2309                             !ipv4_is_local_multicast(fl4->daddr)) {
2310                                 rth->dst.input = ip_mr_input;
2311                                 rth->dst.output = ip_mc_output;
2312                         }
2313                 }
2314 #endif
2315         }
2316
2317         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2318
2319         if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE)
2320                 rth->dst.flags |= DST_NOCACHE;
2321
2322         return rth;
2323 }
2324
2325 /*
2326  * Major route resolver routine.
2327  * called with rcu_read_lock();
2328  */
2329
2330 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2331 {
2332         struct net_device *dev_out = NULL;
2333         __u8 tos = RT_FL_TOS(fl4);
2334         unsigned int flags = 0;
2335         struct fib_result res;
2336         struct rtable *rth;
2337         __be32 orig_daddr;
2338         __be32 orig_saddr;
2339         int orig_oif;
2340
2341         res.fi          = NULL;
2342         res.table       = NULL;
2343 #ifdef CONFIG_IP_MULTIPLE_TABLES
2344         res.r           = NULL;
2345 #endif
2346
2347         orig_daddr = fl4->daddr;
2348         orig_saddr = fl4->saddr;
2349         orig_oif = fl4->flowi4_oif;
2350
2351         fl4->flowi4_iif = net->loopback_dev->ifindex;
2352         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2353         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2354                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2355
2356         rcu_read_lock();
2357         if (fl4->saddr) {
2358                 rth = ERR_PTR(-EINVAL);
2359                 if (ipv4_is_multicast(fl4->saddr) ||
2360                     ipv4_is_lbcast(fl4->saddr) ||
2361                     ipv4_is_zeronet(fl4->saddr))
2362                         goto out;
2363
2364                 /* I removed check for oif == dev_out->oif here.
2365                    It was wrong for two reasons:
2366                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2367                       is assigned to multiple interfaces.
2368                    2. Moreover, we are allowed to send packets with saddr
2369                       of another iface. --ANK
2370                  */
2371
2372                 if (fl4->flowi4_oif == 0 &&
2373                     (ipv4_is_multicast(fl4->daddr) ||
2374                      ipv4_is_lbcast(fl4->daddr))) {
2375                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2376                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2377                         if (dev_out == NULL)
2378                                 goto out;
2379
2380                         /* Special hack: user can direct multicasts
2381                            and limited broadcast via necessary interface
2382                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2383                            This hack is not just for fun, it allows
2384                            vic,vat and friends to work.
2385                            They bind socket to loopback, set ttl to zero
2386                            and expect that it will work.
2387                            From the viewpoint of routing cache they are broken,
2388                            because we are not allowed to build multicast path
2389                            with loopback source addr (look, routing cache
2390                            cannot know, that ttl is zero, so that packet
2391                            will not leave this host and route is valid).
2392                            Luckily, this hack is good workaround.
2393                          */
2394
2395                         fl4->flowi4_oif = dev_out->ifindex;
2396                         goto make_route;
2397                 }
2398
2399                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2400                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2401                         if (!__ip_dev_find(net, fl4->saddr, false))
2402                                 goto out;
2403                 }
2404         }
2405
2406
2407         if (fl4->flowi4_oif) {
2408                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2409                 rth = ERR_PTR(-ENODEV);
2410                 if (dev_out == NULL)
2411                         goto out;
2412
2413                 /* RACE: Check return value of inet_select_addr instead. */
2414                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2415                         rth = ERR_PTR(-ENETUNREACH);
2416                         goto out;
2417                 }
2418                 if (ipv4_is_local_multicast(fl4->daddr) ||
2419                     ipv4_is_lbcast(fl4->daddr)) {
2420                         if (!fl4->saddr)
2421                                 fl4->saddr = inet_select_addr(dev_out, 0,
2422                                                               RT_SCOPE_LINK);
2423                         goto make_route;
2424                 }
2425                 if (fl4->saddr) {
2426                         if (ipv4_is_multicast(fl4->daddr))
2427                                 fl4->saddr = inet_select_addr(dev_out, 0,
2428                                                               fl4->flowi4_scope);
2429                         else if (!fl4->daddr)
2430                                 fl4->saddr = inet_select_addr(dev_out, 0,
2431                                                               RT_SCOPE_HOST);
2432                 }
2433         }
2434
2435         if (!fl4->daddr) {
2436                 fl4->daddr = fl4->saddr;
2437                 if (!fl4->daddr)
2438                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2439                 dev_out = net->loopback_dev;
2440                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2441                 res.type = RTN_LOCAL;
2442                 flags |= RTCF_LOCAL;
2443                 goto make_route;
2444         }
2445
2446         if (fib_lookup(net, fl4, &res)) {
2447                 res.fi = NULL;
2448                 res.table = NULL;
2449                 if (fl4->flowi4_oif) {
2450                         /* Apparently, routing tables are wrong. Assume,
2451                            that the destination is on link.
2452
2453                            WHY? DW.
2454                            Because we are allowed to send to iface
2455                            even if it has NO routes and NO assigned
2456                            addresses. When oif is specified, routing
2457                            tables are looked up with only one purpose:
2458                            to catch if destination is gatewayed, rather than
2459                            direct. Moreover, if MSG_DONTROUTE is set,
2460                            we send packet, ignoring both routing tables
2461                            and ifaddr state. --ANK
2462
2463
2464                            We could make it even if oif is unknown,
2465                            likely IPv6, but we do not.
2466                          */
2467
2468                         if (fl4->saddr == 0)
2469                                 fl4->saddr = inet_select_addr(dev_out, 0,
2470                                                               RT_SCOPE_LINK);
2471                         res.type = RTN_UNICAST;
2472                         goto make_route;
2473                 }
2474                 rth = ERR_PTR(-ENETUNREACH);
2475                 goto out;
2476         }
2477
2478         if (res.type == RTN_LOCAL) {
2479                 if (!fl4->saddr) {
2480                         if (res.fi->fib_prefsrc)
2481                                 fl4->saddr = res.fi->fib_prefsrc;
2482                         else
2483                                 fl4->saddr = fl4->daddr;
2484                 }
2485                 dev_out = net->loopback_dev;
2486                 fl4->flowi4_oif = dev_out->ifindex;
2487                 res.fi = NULL;
2488                 flags |= RTCF_LOCAL;
2489                 goto make_route;
2490         }
2491
2492 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2493         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2494                 fib_select_multipath(&res);
2495         else
2496 #endif
2497         if (!res.prefixlen &&
2498             res.table->tb_num_default > 1 &&
2499             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2500                 fib_select_default(&res);
2501
2502         if (!fl4->saddr)
2503                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2504
2505         dev_out = FIB_RES_DEV(res);
2506         fl4->flowi4_oif = dev_out->ifindex;
2507
2508
2509 make_route:
2510         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2511                                tos, dev_out, flags);
2512         if (!IS_ERR(rth)) {
2513                 unsigned int hash;
2514
2515                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2516                                rt_genid(dev_net(dev_out)));
2517                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2518         }
2519
2520 out:
2521         rcu_read_unlock();
2522         return rth;
2523 }
2524
2525 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2526 {
2527         struct rtable *rth;
2528         unsigned int hash;
2529
2530         if (!rt_caching(net))
2531                 goto slow_output;
2532
2533         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2534
2535         rcu_read_lock_bh();
2536         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2537                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2538                 if (rth->rt_key_dst == flp4->daddr &&
2539                     rth->rt_key_src == flp4->saddr &&
2540                     rt_is_output_route(rth) &&
2541                     rth->rt_oif == flp4->flowi4_oif &&
2542                     rth->rt_mark == flp4->flowi4_mark &&
2543                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2544                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2545                     net_eq(dev_net(rth->dst.dev), net) &&
2546                     !rt_is_expired(rth)) {
2547                         dst_use(&rth->dst, jiffies);
2548                         RT_CACHE_STAT_INC(out_hit);
2549                         rcu_read_unlock_bh();
2550                         if (!flp4->saddr)
2551                                 flp4->saddr = rth->rt_src;
2552                         if (!flp4->daddr)
2553                                 flp4->daddr = rth->rt_dst;
2554                         return rth;
2555                 }
2556                 RT_CACHE_STAT_INC(out_hlist_search);
2557         }
2558         rcu_read_unlock_bh();
2559
2560 slow_output:
2561         return ip_route_output_slow(net, flp4);
2562 }
2563 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2564
2565 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2566 {
2567         return NULL;
2568 }
2569
2570 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2571 {
2572         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2573
2574         return mtu ? : dst->dev->mtu;
2575 }
2576
2577 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2578 {
2579 }
2580
2581 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2582                                           unsigned long old)
2583 {
2584         return NULL;
2585 }
2586
2587 static struct dst_ops ipv4_dst_blackhole_ops = {
2588         .family                 =       AF_INET,
2589         .protocol               =       cpu_to_be16(ETH_P_IP),
2590         .destroy                =       ipv4_dst_destroy,
2591         .check                  =       ipv4_blackhole_dst_check,
2592         .mtu                    =       ipv4_blackhole_mtu,
2593         .default_advmss         =       ipv4_default_advmss,
2594         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2595         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2596         .neigh_lookup           =       ipv4_neigh_lookup,
2597 };
2598
2599 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2600 {
2601         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2602         struct rtable *ort = (struct rtable *) dst_orig;
2603
2604         if (rt) {
2605                 struct dst_entry *new = &rt->dst;
2606
2607                 new->__use = 1;
2608                 new->input = dst_discard;
2609                 new->output = dst_discard;
2610
2611                 new->dev = ort->dst.dev;
2612                 if (new->dev)
2613                         dev_hold(new->dev);
2614
2615                 rt->rt_key_dst = ort->rt_key_dst;
2616                 rt->rt_key_src = ort->rt_key_src;
2617                 rt->rt_key_tos = ort->rt_key_tos;
2618                 rt->rt_route_iif = ort->rt_route_iif;
2619                 rt->rt_iif = ort->rt_iif;
2620                 rt->rt_oif = ort->rt_oif;
2621                 rt->rt_mark = ort->rt_mark;
2622                 rt->rt_pmtu = ort->rt_pmtu;
2623
2624                 rt->rt_genid = rt_genid(net);
2625                 rt->rt_flags = ort->rt_flags;
2626                 rt->rt_type = ort->rt_type;
2627                 rt->rt_dst = ort->rt_dst;
2628                 rt->rt_src = ort->rt_src;
2629                 rt->rt_gateway = ort->rt_gateway;
2630                 rt->fi = ort->fi;
2631                 if (rt->fi)
2632                         atomic_inc(&rt->fi->fib_clntref);
2633
2634                 dst_free(new);
2635         }
2636
2637         dst_release(dst_orig);
2638
2639         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2640 }
2641
2642 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2643                                     struct sock *sk)
2644 {
2645         struct rtable *rt = __ip_route_output_key(net, flp4);
2646
2647         if (IS_ERR(rt))
2648                 return rt;
2649
2650         if (flp4->flowi4_proto)
2651                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2652                                                    flowi4_to_flowi(flp4),
2653                                                    sk, 0);
2654
2655         return rt;
2656 }
2657 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2658
2659 static int rt_fill_info(struct net *net,
2660                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2661                         int nowait, unsigned int flags)
2662 {
2663         struct rtable *rt = skb_rtable(skb);
2664         struct rtmsg *r;
2665         struct nlmsghdr *nlh;
2666         unsigned long expires = 0;
2667         u32 error;
2668
2669         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2670         if (nlh == NULL)
2671                 return -EMSGSIZE;
2672
2673         r = nlmsg_data(nlh);
2674         r->rtm_family    = AF_INET;
2675         r->rtm_dst_len  = 32;
2676         r->rtm_src_len  = 0;
2677         r->rtm_tos      = rt->rt_key_tos;
2678         r->rtm_table    = RT_TABLE_MAIN;
2679         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2680                 goto nla_put_failure;
2681         r->rtm_type     = rt->rt_type;
2682         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2683         r->rtm_protocol = RTPROT_UNSPEC;
2684         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2685         if (rt->rt_flags & RTCF_NOTIFY)
2686                 r->rtm_flags |= RTM_F_NOTIFY;
2687
2688         if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
2689                 goto nla_put_failure;
2690         if (rt->rt_key_src) {
2691                 r->rtm_src_len = 32;
2692                 if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
2693                         goto nla_put_failure;
2694         }
2695         if (rt->dst.dev &&
2696             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2697                 goto nla_put_failure;
2698 #ifdef CONFIG_IP_ROUTE_CLASSID
2699         if (rt->dst.tclassid &&
2700             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2701                 goto nla_put_failure;
2702 #endif
2703         if (!rt_is_input_route(rt) &&
2704             rt->rt_src != rt->rt_key_src) {
2705                 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
2706                         goto nla_put_failure;
2707         }
2708         if (rt->rt_dst != rt->rt_gateway &&
2709             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2710                 goto nla_put_failure;
2711
2712         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2713                 goto nla_put_failure;
2714
2715         if (rt->rt_mark &&
2716             nla_put_be32(skb, RTA_MARK, rt->rt_mark))
2717                 goto nla_put_failure;
2718
2719         error = rt->dst.error;
2720         expires = rt->dst.expires;
2721         if (expires) {
2722                 if (time_before(jiffies, expires))
2723                         expires -= jiffies;
2724                 else
2725                         expires = 0;
2726         }
2727
2728         if (rt_is_input_route(rt)) {
2729 #ifdef CONFIG_IP_MROUTE
2730                 __be32 dst = rt->rt_dst;
2731
2732                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2733                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2734                         int err = ipmr_get_route(net, skb,
2735                                                  rt->rt_src, rt->rt_dst,
2736                                                  r, nowait);
2737                         if (err <= 0) {
2738                                 if (!nowait) {
2739                                         if (err == 0)
2740                                                 return 0;
2741                                         goto nla_put_failure;
2742                                 } else {
2743                                         if (err == -EMSGSIZE)
2744                                                 goto nla_put_failure;
2745                                         error = err;
2746                                 }
2747                         }
2748                 } else
2749 #endif
2750                         if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2751                                 goto nla_put_failure;
2752         }
2753
2754         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2755                 goto nla_put_failure;
2756
2757         return nlmsg_end(skb, nlh);
2758
2759 nla_put_failure:
2760         nlmsg_cancel(skb, nlh);
2761         return -EMSGSIZE;
2762 }
2763
2764 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2765 {
2766         struct net *net = sock_net(in_skb->sk);
2767         struct rtmsg *rtm;
2768         struct nlattr *tb[RTA_MAX+1];
2769         struct rtable *rt = NULL;
2770         __be32 dst = 0;
2771         __be32 src = 0;
2772         u32 iif;
2773         int err;
2774         int mark;
2775         struct sk_buff *skb;
2776
2777         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2778         if (err < 0)
2779                 goto errout;
2780
2781         rtm = nlmsg_data(nlh);
2782
2783         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2784         if (skb == NULL) {
2785                 err = -ENOBUFS;
2786                 goto errout;
2787         }
2788
2789         /* Reserve room for dummy headers, this skb can pass
2790            through good chunk of routing engine.
2791          */
2792         skb_reset_mac_header(skb);
2793         skb_reset_network_header(skb);
2794
2795         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2796         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2797         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2798
2799         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2800         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2801         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2802         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2803
2804         if (iif) {
2805                 struct net_device *dev;
2806
2807                 dev = __dev_get_by_index(net, iif);
2808                 if (dev == NULL) {
2809                         err = -ENODEV;
2810                         goto errout_free;
2811                 }
2812
2813                 skb->protocol   = htons(ETH_P_IP);
2814                 skb->dev        = dev;
2815                 skb->mark       = mark;
2816                 local_bh_disable();
2817                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2818                 local_bh_enable();
2819
2820                 rt = skb_rtable(skb);
2821                 if (err == 0 && rt->dst.error)
2822                         err = -rt->dst.error;
2823         } else {
2824                 struct flowi4 fl4 = {
2825                         .daddr = dst,
2826                         .saddr = src,
2827                         .flowi4_tos = rtm->rtm_tos,
2828                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2829                         .flowi4_mark = mark,
2830                 };
2831                 rt = ip_route_output_key(net, &fl4);
2832
2833                 err = 0;
2834                 if (IS_ERR(rt))
2835                         err = PTR_ERR(rt);
2836         }
2837
2838         if (err)
2839                 goto errout_free;
2840
2841         skb_dst_set(skb, &rt->dst);
2842         if (rtm->rtm_flags & RTM_F_NOTIFY)
2843                 rt->rt_flags |= RTCF_NOTIFY;
2844
2845         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2846                            RTM_NEWROUTE, 0, 0);
2847         if (err <= 0)
2848                 goto errout_free;
2849
2850         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2851 errout:
2852         return err;
2853
2854 errout_free:
2855         kfree_skb(skb);
2856         goto errout;
2857 }
2858
2859 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2860 {
2861         struct rtable *rt;
2862         int h, s_h;
2863         int idx, s_idx;
2864         struct net *net;
2865
2866         net = sock_net(skb->sk);
2867
2868         s_h = cb->args[0];
2869         if (s_h < 0)
2870                 s_h = 0;
2871         s_idx = idx = cb->args[1];
2872         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
2873                 if (!rt_hash_table[h].chain)
2874                         continue;
2875                 rcu_read_lock_bh();
2876                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
2877                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
2878                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
2879                                 continue;
2880                         if (rt_is_expired(rt))
2881                                 continue;
2882                         skb_dst_set_noref(skb, &rt->dst);
2883                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
2884                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2885                                          1, NLM_F_MULTI) <= 0) {
2886                                 skb_dst_drop(skb);
2887                                 rcu_read_unlock_bh();
2888                                 goto done;
2889                         }
2890                         skb_dst_drop(skb);
2891                 }
2892                 rcu_read_unlock_bh();
2893         }
2894
2895 done:
2896         cb->args[0] = h;
2897         cb->args[1] = idx;
2898         return skb->len;
2899 }
2900
2901 void ip_rt_multicast_event(struct in_device *in_dev)
2902 {
2903         rt_cache_flush(dev_net(in_dev->dev), 0);
2904 }
2905
2906 #ifdef CONFIG_SYSCTL
2907 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2908                                         void __user *buffer,
2909                                         size_t *lenp, loff_t *ppos)
2910 {
2911         if (write) {
2912                 int flush_delay;
2913                 ctl_table ctl;
2914                 struct net *net;
2915
2916                 memcpy(&ctl, __ctl, sizeof(ctl));
2917                 ctl.data = &flush_delay;
2918                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
2919
2920                 net = (struct net *)__ctl->extra1;
2921                 rt_cache_flush(net, flush_delay);
2922                 return 0;
2923         }
2924
2925         return -EINVAL;
2926 }
2927
2928 static ctl_table ipv4_route_table[] = {
2929         {
2930                 .procname       = "gc_thresh",
2931                 .data           = &ipv4_dst_ops.gc_thresh,
2932                 .maxlen         = sizeof(int),
2933                 .mode           = 0644,
2934                 .proc_handler   = proc_dointvec,
2935         },
2936         {
2937                 .procname       = "max_size",
2938                 .data           = &ip_rt_max_size,
2939                 .maxlen         = sizeof(int),
2940                 .mode           = 0644,
2941                 .proc_handler   = proc_dointvec,
2942         },
2943         {
2944                 /*  Deprecated. Use gc_min_interval_ms */
2945
2946                 .procname       = "gc_min_interval",
2947                 .data           = &ip_rt_gc_min_interval,
2948                 .maxlen         = sizeof(int),
2949                 .mode           = 0644,
2950                 .proc_handler   = proc_dointvec_jiffies,
2951         },
2952         {
2953                 .procname       = "gc_min_interval_ms",
2954                 .data           = &ip_rt_gc_min_interval,
2955                 .maxlen         = sizeof(int),
2956                 .mode           = 0644,
2957                 .proc_handler   = proc_dointvec_ms_jiffies,
2958         },
2959         {
2960                 .procname       = "gc_timeout",
2961                 .data           = &ip_rt_gc_timeout,
2962                 .maxlen         = sizeof(int),
2963                 .mode           = 0644,
2964                 .proc_handler   = proc_dointvec_jiffies,
2965         },
2966         {
2967                 .procname       = "gc_interval",
2968                 .data           = &ip_rt_gc_interval,
2969                 .maxlen         = sizeof(int),
2970                 .mode           = 0644,
2971                 .proc_handler   = proc_dointvec_jiffies,
2972         },
2973         {
2974                 .procname       = "redirect_load",
2975                 .data           = &ip_rt_redirect_load,
2976                 .maxlen         = sizeof(int),
2977                 .mode           = 0644,
2978                 .proc_handler   = proc_dointvec,
2979         },
2980         {
2981                 .procname       = "redirect_number",
2982                 .data           = &ip_rt_redirect_number,
2983                 .maxlen         = sizeof(int),
2984                 .mode           = 0644,
2985                 .proc_handler   = proc_dointvec,
2986         },
2987         {
2988                 .procname       = "redirect_silence",
2989                 .data           = &ip_rt_redirect_silence,
2990                 .maxlen         = sizeof(int),
2991                 .mode           = 0644,
2992                 .proc_handler   = proc_dointvec,
2993         },
2994         {
2995                 .procname       = "error_cost",
2996                 .data           = &ip_rt_error_cost,
2997                 .maxlen         = sizeof(int),
2998                 .mode           = 0644,
2999                 .proc_handler   = proc_dointvec,
3000         },
3001         {
3002                 .procname       = "error_burst",
3003                 .data           = &ip_rt_error_burst,
3004                 .maxlen         = sizeof(int),
3005                 .mode           = 0644,
3006                 .proc_handler   = proc_dointvec,
3007         },
3008         {
3009                 .procname       = "gc_elasticity",
3010                 .data           = &ip_rt_gc_elasticity,
3011                 .maxlen         = sizeof(int),
3012                 .mode           = 0644,
3013                 .proc_handler   = proc_dointvec,
3014         },
3015         {
3016                 .procname       = "mtu_expires",
3017                 .data           = &ip_rt_mtu_expires,
3018                 .maxlen         = sizeof(int),
3019                 .mode           = 0644,
3020                 .proc_handler   = proc_dointvec_jiffies,
3021         },
3022         {
3023                 .procname       = "min_pmtu",
3024                 .data           = &ip_rt_min_pmtu,
3025                 .maxlen         = sizeof(int),
3026                 .mode           = 0644,
3027                 .proc_handler   = proc_dointvec,
3028         },
3029         {
3030                 .procname       = "min_adv_mss",
3031                 .data           = &ip_rt_min_advmss,
3032                 .maxlen         = sizeof(int),
3033                 .mode           = 0644,
3034                 .proc_handler   = proc_dointvec,
3035         },
3036         { }
3037 };
3038
3039 static struct ctl_table ipv4_route_flush_table[] = {
3040         {
3041                 .procname       = "flush",
3042                 .maxlen         = sizeof(int),
3043                 .mode           = 0200,
3044                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3045         },
3046         { },
3047 };
3048
3049 static __net_init int sysctl_route_net_init(struct net *net)
3050 {
3051         struct ctl_table *tbl;
3052
3053         tbl = ipv4_route_flush_table;
3054         if (!net_eq(net, &init_net)) {
3055                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3056                 if (tbl == NULL)
3057                         goto err_dup;
3058         }
3059         tbl[0].extra1 = net;
3060
3061         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3062         if (net->ipv4.route_hdr == NULL)
3063                 goto err_reg;
3064         return 0;
3065
3066 err_reg:
3067         if (tbl != ipv4_route_flush_table)
3068                 kfree(tbl);
3069 err_dup:
3070         return -ENOMEM;
3071 }
3072
3073 static __net_exit void sysctl_route_net_exit(struct net *net)
3074 {
3075         struct ctl_table *tbl;
3076
3077         tbl = net->ipv4.route_hdr->ctl_table_arg;
3078         unregister_net_sysctl_table(net->ipv4.route_hdr);
3079         BUG_ON(tbl == ipv4_route_flush_table);
3080         kfree(tbl);
3081 }
3082
3083 static __net_initdata struct pernet_operations sysctl_route_ops = {
3084         .init = sysctl_route_net_init,
3085         .exit = sysctl_route_net_exit,
3086 };
3087 #endif
3088
3089 static __net_init int rt_genid_init(struct net *net)
3090 {
3091         get_random_bytes(&net->ipv4.rt_genid,
3092                          sizeof(net->ipv4.rt_genid));
3093         get_random_bytes(&net->ipv4.dev_addr_genid,
3094                          sizeof(net->ipv4.dev_addr_genid));
3095         return 0;
3096 }
3097
3098 static __net_initdata struct pernet_operations rt_genid_ops = {
3099         .init = rt_genid_init,
3100 };
3101
3102 static int __net_init ipv4_inetpeer_init(struct net *net)
3103 {
3104         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3105
3106         if (!bp)
3107                 return -ENOMEM;
3108         inet_peer_base_init(bp);
3109         net->ipv4.peers = bp;
3110         return 0;
3111 }
3112
3113 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3114 {
3115         struct inet_peer_base *bp = net->ipv4.peers;
3116
3117         net->ipv4.peers = NULL;
3118         inetpeer_invalidate_tree(bp);
3119         kfree(bp);
3120 }
3121
3122 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3123         .init   =       ipv4_inetpeer_init,
3124         .exit   =       ipv4_inetpeer_exit,
3125 };
3126
3127 #ifdef CONFIG_IP_ROUTE_CLASSID
3128 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3129 #endif /* CONFIG_IP_ROUTE_CLASSID */
3130
3131 static __initdata unsigned long rhash_entries;
3132 static int __init set_rhash_entries(char *str)
3133 {
3134         ssize_t ret;
3135
3136         if (!str)
3137                 return 0;
3138
3139         ret = kstrtoul(str, 0, &rhash_entries);
3140         if (ret)
3141                 return 0;
3142
3143         return 1;
3144 }
3145 __setup("rhash_entries=", set_rhash_entries);
3146
3147 int __init ip_rt_init(void)
3148 {
3149         int rc = 0;
3150
3151 #ifdef CONFIG_IP_ROUTE_CLASSID
3152         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3153         if (!ip_rt_acct)
3154                 panic("IP: failed to allocate ip_rt_acct\n");
3155 #endif
3156
3157         ipv4_dst_ops.kmem_cachep =
3158                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3159                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3160
3161         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3162
3163         if (dst_entries_init(&ipv4_dst_ops) < 0)
3164                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3165
3166         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3167                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3168
3169         rt_hash_table = (struct rt_hash_bucket *)
3170                 alloc_large_system_hash("IP route cache",
3171                                         sizeof(struct rt_hash_bucket),
3172                                         rhash_entries,
3173                                         (totalram_pages >= 128 * 1024) ?
3174                                         15 : 17,
3175                                         0,
3176                                         &rt_hash_log,
3177                                         &rt_hash_mask,
3178                                         0,
3179                                         rhash_entries ? 0 : 512 * 1024);
3180         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3181         rt_hash_lock_init();
3182
3183         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3184         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3185
3186         devinet_init();
3187         ip_fib_init();
3188
3189         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3190         expires_ljiffies = jiffies;
3191         schedule_delayed_work(&expires_work,
3192                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3193
3194         if (ip_rt_proc_init())
3195                 pr_err("Unable to create route proc files\n");
3196 #ifdef CONFIG_XFRM
3197         xfrm_init();
3198         xfrm4_init(ip_rt_max_size);
3199 #endif
3200         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3201
3202 #ifdef CONFIG_SYSCTL
3203         register_pernet_subsys(&sysctl_route_ops);
3204 #endif
3205         register_pernet_subsys(&rt_genid_ops);
3206         register_pernet_subsys(&ipv4_inetpeer_ops);
3207         return rc;
3208 }
3209
3210 #ifdef CONFIG_SYSCTL
3211 /*
3212  * We really need to sanitize the damn ipv4 init order, then all
3213  * this nonsense will go away.
3214  */
3215 void __init ip_static_sysctl_init(void)
3216 {
3217         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3218 }
3219 #endif