net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #define pr_fmt(fmt) "IPv4: " fmt
  66
  67 #include <linux/module.h>
  68 #include <asm/uaccess.h>
  69 #include <linux/bitops.h>
  70 #include <linux/types.h>
  71 #include <linux/kernel.h>
  72 #include <linux/mm.h>
  73 #include <linux/bootmem.h>
  74 #include <linux/string.h>
  75 #include <linux/socket.h>
  76 #include <linux/sockios.h>
  77 #include <linux/errno.h>
  78 #include <linux/in.h>
  79 #include <linux/inet.h>
  80 #include <linux/netdevice.h>
  81 #include <linux/proc_fs.h>
  82 #include <linux/init.h>
  83 #include <linux/workqueue.h>
  84 #include <linux/skbuff.h>
  85 #include <linux/inetdevice.h>
  86 #include <linux/igmp.h>
  87 #include <linux/pkt_sched.h>
  88 #include <linux/mroute.h>
  89 #include <linux/netfilter_ipv4.h>
  90 #include <linux/random.h>
  91 #include <linux/jhash.h>
  92 #include <linux/rcupdate.h>
  93 #include <linux/times.h>
  94 #include <linux/slab.h>
  95 #include <linux/prefetch.h>
  96 #include <net/dst.h>
  97 #include <net/net_namespace.h>
  98 #include <net/protocol.h>
  99 #include <net/ip.h>
 100 #include <net/route.h>
 101 #include <net/inetpeer.h>
 102 #include <net/sock.h>
 103 #include <net/ip_fib.h>
 104 #include <net/arp.h>
 105 #include <net/tcp.h>
 106 #include <net/icmp.h>
 107 #include <net/xfrm.h>
 108 #include <net/netevent.h>
 109 #include <net/rtnetlink.h>
 110 #ifdef CONFIG_SYSCTL
 111 #include <linux/sysctl.h>
 112 #include <linux/kmemleak.h>
 113 #endif
 114 #include <net/secure_seq.h>
 115
 116 #define RT_FL_TOS(oldflp4) \
 117         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 118
 119 #define IP_MAX_MTU      0xFFF0
 120
 121 #define RT_GC_TIMEOUT (300*HZ)
 122
 123 static int ip_rt_max_size;
 124 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 125 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
 126 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
 127 static int ip_rt_redirect_number __read_mostly  = 9;
 128 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 129 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 130 static int ip_rt_error_cost __read_mostly       = HZ;
 131 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 132 static int ip_rt_gc_elasticity __read_mostly    = 8;
 133 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 134 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 135 static int ip_rt_min_advmss __read_mostly       = 256;
 136 static int rt_chain_length_max __read_mostly    = 20;
 137
 138 static struct delayed_work expires_work;
 139 static unsigned long expires_ljiffies;
 140
 141 /*
 142  *      Interface to generic destination cache.
 143  */
 144
 145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 146 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 147 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 148 static void              ipv4_dst_destroy(struct dst_entry *dst);
 149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 150 static void              ipv4_link_failure(struct sk_buff *skb);
 151 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 152 static int rt_garbage_collect(struct dst_ops *ops);
 153
 154 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 155                             int how)
 156 {
 157 }
 158
 159 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 160 {
 161         WARN_ON(1);
 162         return NULL;
 163 }
 164
 165 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 166                                            struct sk_buff *skb,
 167                                            const void *daddr);
 168
 169 static struct dst_ops ipv4_dst_ops = {
 170         .family =               AF_INET,
 171         .protocol =             cpu_to_be16(ETH_P_IP),
 172         .gc =                   rt_garbage_collect,
 173         .check =                ipv4_dst_check,
 174         .default_advmss =       ipv4_default_advmss,
 175         .mtu =                  ipv4_mtu,
 176         .cow_metrics =          ipv4_cow_metrics,
 177         .destroy =              ipv4_dst_destroy,
 178         .ifdown =               ipv4_dst_ifdown,
 179         .negative_advice =      ipv4_negative_advice,
 180         .link_failure =         ipv4_link_failure,
 181         .update_pmtu =          ip_rt_update_pmtu,
 182         .local_out =            __ip_local_out,
 183         .neigh_lookup =         ipv4_neigh_lookup,
 184 };
 185
 186 #define ECN_OR_COST(class)      TC_PRIO_##class
 187
 188 const __u8 ip_tos2prio[16] = {
 189         TC_PRIO_BESTEFFORT,
 190         ECN_OR_COST(BESTEFFORT),
 191         TC_PRIO_BESTEFFORT,
 192         ECN_OR_COST(BESTEFFORT),
 193         TC_PRIO_BULK,
 194         ECN_OR_COST(BULK),
 195         TC_PRIO_BULK,
 196         ECN_OR_COST(BULK),
 197         TC_PRIO_INTERACTIVE,
 198         ECN_OR_COST(INTERACTIVE),
 199         TC_PRIO_INTERACTIVE,
 200         ECN_OR_COST(INTERACTIVE),
 201         TC_PRIO_INTERACTIVE_BULK,
 202         ECN_OR_COST(INTERACTIVE_BULK),
 203         TC_PRIO_INTERACTIVE_BULK,
 204         ECN_OR_COST(INTERACTIVE_BULK)
 205 };
 206 EXPORT_SYMBOL(ip_tos2prio);
 207
 208 /*
 209  * Route cache.
 210  */
 211
 212 /* The locking scheme is rather straight forward:
 213  *
 214  * 1) Read-Copy Update protects the buckets of the central route hash.
 215  * 2) Only writers remove entries, and they hold the lock
 216  *    as they look at rtable reference counts.
 217  * 3) Only readers acquire references to rtable entries,
 218  *    they do so with atomic increments and with the
 219  *    lock held.
 220  */
 221
 222 struct rt_hash_bucket {
 223         struct rtable __rcu     *chain;
 224 };
 225
 226 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 227         defined(CONFIG_PROVE_LOCKING)
 228 /*
 229  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 230  * The size of this table is a power of two and depends on the number of CPUS.
 231  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 232  */
 233 #ifdef CONFIG_LOCKDEP
 234 # define RT_HASH_LOCK_SZ        256
 235 #else
 236 # if NR_CPUS >= 32
 237 #  define RT_HASH_LOCK_SZ       4096
 238 # elif NR_CPUS >= 16
 239 #  define RT_HASH_LOCK_SZ       2048
 240 # elif NR_CPUS >= 8
 241 #  define RT_HASH_LOCK_SZ       1024
 242 # elif NR_CPUS >= 4
 243 #  define RT_HASH_LOCK_SZ       512
 244 # else
 245 #  define RT_HASH_LOCK_SZ       256
 246 # endif
 247 #endif
 248
 249 static spinlock_t       *rt_hash_locks;
 250 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 251
 252 static __init void rt_hash_lock_init(void)
 253 {
 254         int i;
 255
 256         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 257                         GFP_KERNEL);
 258         if (!rt_hash_locks)
 259                 panic("IP: failed to allocate rt_hash_locks\n");
 260
 261         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 262                 spin_lock_init(&rt_hash_locks[i]);
 263 }
 264 #else
 265 # define rt_hash_lock_addr(slot) NULL
 266
 267 static inline void rt_hash_lock_init(void)
 268 {
 269 }
 270 #endif
 271
 272 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
 273 static unsigned int             rt_hash_mask __read_mostly;
 274 static unsigned int             rt_hash_log  __read_mostly;
 275
 276 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 277 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 278
 279 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 280                                    int genid)
 281 {
 282         return jhash_3words((__force u32)daddr, (__force u32)saddr,
 283                             idx, genid)
 284                 & rt_hash_mask;
 285 }
 286
 287 static inline int rt_genid(struct net *net)
 288 {
 289         return atomic_read(&net->ipv4.rt_genid);
 290 }
 291
 292 #ifdef CONFIG_PROC_FS
 293 struct rt_cache_iter_state {
 294         struct seq_net_private p;
 295         int bucket;
 296         int genid;
 297 };
 298
 299 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 300 {
 301         struct rt_cache_iter_state *st = seq->private;
 302         struct rtable *r = NULL;
 303
 304         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 305                 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
 306                         continue;
 307                 rcu_read_lock_bh();
 308                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 309                 while (r) {
 310                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
 311                             r->rt_genid == st->genid)
 312                                 return r;
 313                         r = rcu_dereference_bh(r->dst.rt_next);
 314                 }
 315                 rcu_read_unlock_bh();
 316         }
 317         return r;
 318 }
 319
 320 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 321                                           struct rtable *r)
 322 {
 323         struct rt_cache_iter_state *st = seq->private;
 324
 325         r = rcu_dereference_bh(r->dst.rt_next);
 326         while (!r) {
 327                 rcu_read_unlock_bh();
 328                 do {
 329                         if (--st->bucket < 0)
 330                                 return NULL;
 331                 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
 332                 rcu_read_lock_bh();
 333                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 334         }
 335         return r;
 336 }
 337
 338 static struct rtable *rt_cache_get_next(struct seq_file *seq,
 339                                         struct rtable *r)
 340 {
 341         struct rt_cache_iter_state *st = seq->private;
 342         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 343                 if (dev_net(r->dst.dev) != seq_file_net(seq))
 344                         continue;
 345                 if (r->rt_genid == st->genid)
 346                         break;
 347         }
 348         return r;
 349 }
 350
 351 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 352 {
 353         struct rtable *r = rt_cache_get_first(seq);
 354
 355         if (r)
 356                 while (pos && (r = rt_cache_get_next(seq, r)))
 357                         --pos;
 358         return pos ? NULL : r;
 359 }
 360
 361 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 362 {
 363         struct rt_cache_iter_state *st = seq->private;
 364         if (*pos)
 365                 return rt_cache_get_idx(seq, *pos - 1);
 366         st->genid = rt_genid(seq_file_net(seq));
 367         return SEQ_START_TOKEN;
 368 }
 369
 370 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 371 {
 372         struct rtable *r;
 373
 374         if (v == SEQ_START_TOKEN)
 375                 r = rt_cache_get_first(seq);
 376         else
 377                 r = rt_cache_get_next(seq, v);
 378         ++*pos;
 379         return r;
 380 }
 381
 382 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 383 {
 384         if (v && v != SEQ_START_TOKEN)
 385                 rcu_read_unlock_bh();
 386 }
 387
 388 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 389 {
 390         if (v == SEQ_START_TOKEN)
 391                 seq_printf(seq, "%-127s\n",
 392                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 393                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 394                            "HHUptod\tSpecDst");
 395         else {
 396                 struct rtable *r = v;
 397                 int len;
 398
 399                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
 400                            "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 401                            r->dst.dev ? r->dst.dev->name : "*",
 402                            (__force u32)r->rt_dst,
 403                            (__force u32)r->rt_gateway,
 404                            r->rt_flags, atomic_read(&r->dst.__refcnt),
 405                            r->dst.__use, 0, (__force u32)r->rt_src,
 406                            dst_metric_advmss(&r->dst) + 40,
 407                            dst_metric(&r->dst, RTAX_WINDOW), 0,
 408                            r->rt_key_tos,
 409                            -1, 0, 0, &len);
 410
 411                 seq_printf(seq, "%*s\n", 127 - len, "");
 412         }
 413         return 0;
 414 }
 415
 416 static const struct seq_operations rt_cache_seq_ops = {
 417         .start  = rt_cache_seq_start,
 418         .next   = rt_cache_seq_next,
 419         .stop   = rt_cache_seq_stop,
 420         .show   = rt_cache_seq_show,
 421 };
 422
 423 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 424 {
 425         return seq_open_net(inode, file, &rt_cache_seq_ops,
 426                         sizeof(struct rt_cache_iter_state));
 427 }
 428
 429 static const struct file_operations rt_cache_seq_fops = {
 430         .owner   = THIS_MODULE,
 431         .open    = rt_cache_seq_open,
 432         .read    = seq_read,
 433         .llseek  = seq_lseek,
 434         .release = seq_release_net,
 435 };
 436
 437
 438 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 439 {
 440         int cpu;
 441
 442         if (*pos == 0)
 443                 return SEQ_START_TOKEN;
 444
 445         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 446                 if (!cpu_possible(cpu))
 447                         continue;
 448                 *pos = cpu+1;
 449                 return &per_cpu(rt_cache_stat, cpu);
 450         }
 451         return NULL;
 452 }
 453
 454 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 455 {
 456         int cpu;
 457
 458         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 459                 if (!cpu_possible(cpu))
 460                         continue;
 461                 *pos = cpu+1;
 462                 return &per_cpu(rt_cache_stat, cpu);
 463         }
 464         return NULL;
 465
 466 }
 467
 468 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 469 {
 470
 471 }
 472
 473 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 474 {
 475         struct rt_cache_stat *st = v;
 476
 477         if (v == SEQ_START_TOKEN) {
 478                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 479                 return 0;
 480         }
 481
 482         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 483                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 484                    dst_entries_get_slow(&ipv4_dst_ops),
 485                    st->in_hit,
 486                    st->in_slow_tot,
 487                    st->in_slow_mc,
 488                    st->in_no_route,
 489                    st->in_brd,
 490                    st->in_martian_dst,
 491                    st->in_martian_src,
 492
 493                    st->out_hit,
 494                    st->out_slow_tot,
 495                    st->out_slow_mc,
 496
 497                    st->gc_total,
 498                    st->gc_ignored,
 499                    st->gc_goal_miss,
 500                    st->gc_dst_overflow,
 501                    st->in_hlist_search,
 502                    st->out_hlist_search
 503                 );
 504         return 0;
 505 }
 506
 507 static const struct seq_operations rt_cpu_seq_ops = {
 508         .start  = rt_cpu_seq_start,
 509         .next   = rt_cpu_seq_next,
 510         .stop   = rt_cpu_seq_stop,
 511         .show   = rt_cpu_seq_show,
 512 };
 513
 514
 515 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 516 {
 517         return seq_open(file, &rt_cpu_seq_ops);
 518 }
 519
 520 static const struct file_operations rt_cpu_seq_fops = {
 521         .owner   = THIS_MODULE,
 522         .open    = rt_cpu_seq_open,
 523         .read    = seq_read,
 524         .llseek  = seq_lseek,
 525         .release = seq_release,
 526 };
 527
 528 #ifdef CONFIG_IP_ROUTE_CLASSID
 529 static int rt_acct_proc_show(struct seq_file *m, void *v)
 530 {
 531         struct ip_rt_acct *dst, *src;
 532         unsigned int i, j;
 533
 534         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 535         if (!dst)
 536                 return -ENOMEM;
 537
 538         for_each_possible_cpu(i) {
 539                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 540                 for (j = 0; j < 256; j++) {
 541                         dst[j].o_bytes   += src[j].o_bytes;
 542                         dst[j].o_packets += src[j].o_packets;
 543                         dst[j].i_bytes   += src[j].i_bytes;
 544                         dst[j].i_packets += src[j].i_packets;
 545                 }
 546         }
 547
 548         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 549         kfree(dst);
 550         return 0;
 551 }
 552
 553 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 554 {
 555         return single_open(file, rt_acct_proc_show, NULL);
 556 }
 557
 558 static const struct file_operations rt_acct_proc_fops = {
 559         .owner          = THIS_MODULE,
 560         .open           = rt_acct_proc_open,
 561         .read           = seq_read,
 562         .llseek         = seq_lseek,
 563         .release        = single_release,
 564 };
 565 #endif
 566
 567 static int __net_init ip_rt_do_proc_init(struct net *net)
 568 {
 569         struct proc_dir_entry *pde;
 570
 571         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 572                         &rt_cache_seq_fops);
 573         if (!pde)
 574                 goto err1;
 575
 576         pde = proc_create("rt_cache", S_IRUGO,
 577                           net->proc_net_stat, &rt_cpu_seq_fops);
 578         if (!pde)
 579                 goto err2;
 580
 581 #ifdef CONFIG_IP_ROUTE_CLASSID
 582         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 583         if (!pde)
 584                 goto err3;
 585 #endif
 586         return 0;
 587
 588 #ifdef CONFIG_IP_ROUTE_CLASSID
 589 err3:
 590         remove_proc_entry("rt_cache", net->proc_net_stat);
 591 #endif
 592 err2:
 593         remove_proc_entry("rt_cache", net->proc_net);
 594 err1:
 595         return -ENOMEM;
 596 }
 597
 598 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 599 {
 600         remove_proc_entry("rt_cache", net->proc_net_stat);
 601         remove_proc_entry("rt_cache", net->proc_net);
 602 #ifdef CONFIG_IP_ROUTE_CLASSID
 603         remove_proc_entry("rt_acct", net->proc_net);
 604 #endif
 605 }
 606
 607 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 608         .init = ip_rt_do_proc_init,
 609         .exit = ip_rt_do_proc_exit,
 610 };
 611
 612 static int __init ip_rt_proc_init(void)
 613 {
 614         return register_pernet_subsys(&ip_rt_proc_ops);
 615 }
 616
 617 #else
 618 static inline int ip_rt_proc_init(void)
 619 {
 620         return 0;
 621 }
 622 #endif /* CONFIG_PROC_FS */
 623
 624 static inline void rt_free(struct rtable *rt)
 625 {
 626         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 627 }
 628
 629 static inline void rt_drop(struct rtable *rt)
 630 {
 631         ip_rt_put(rt);
 632         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 633 }
 634
 635 static inline int rt_fast_clean(struct rtable *rth)
 636 {
 637         /* Kill broadcast/multicast entries very aggresively, if they
 638            collide in hash table with more useful entries */
 639         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 640                 rt_is_input_route(rth) && rth->dst.rt_next;
 641 }
 642
 643 static inline int rt_valuable(struct rtable *rth)
 644 {
 645         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 646                 rth->dst.expires;
 647 }
 648
 649 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 650 {
 651         unsigned long age;
 652         int ret = 0;
 653
 654         if (atomic_read(&rth->dst.__refcnt))
 655                 goto out;
 656
 657         age = jiffies - rth->dst.lastuse;
 658         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 659             (age <= tmo2 && rt_valuable(rth)))
 660                 goto out;
 661         ret = 1;
 662 out:    return ret;
 663 }
 664
 665 /* Bits of score are:
 666  * 31: very valuable
 667  * 30: not quite useless
 668  * 29..0: usage counter
 669  */
 670 static inline u32 rt_score(struct rtable *rt)
 671 {
 672         u32 score = jiffies - rt->dst.lastuse;
 673
 674         score = ~score & ~(3<<30);
 675
 676         if (rt_valuable(rt))
 677                 score |= (1<<31);
 678
 679         if (rt_is_output_route(rt) ||
 680             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 681                 score |= (1<<30);
 682
 683         return score;
 684 }
 685
 686 static inline bool rt_caching(const struct net *net)
 687 {
 688         return net->ipv4.current_rt_cache_rebuild_count <=
 689                 net->ipv4.sysctl_rt_cache_rebuild_count;
 690 }
 691
 692 static inline bool compare_hash_inputs(const struct rtable *rt1,
 693                                        const struct rtable *rt2)
 694 {
 695         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 696                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 697                 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
 698 }
 699
 700 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
 701 {
 702         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 703                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 704                 (rt1->rt_mark ^ rt2->rt_mark) |
 705                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
 706                 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
 707                 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
 708 }
 709
 710 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 711 {
 712         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
 713 }
 714
 715 static inline int rt_is_expired(struct rtable *rth)
 716 {
 717         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 718 }
 719
 720 /*
 721  * Perform a full scan of hash table and free all entries.
 722  * Can be called by a softirq or a process.
 723  * In the later case, we want to be reschedule if necessary
 724  */
 725 static void rt_do_flush(struct net *net, int process_context)
 726 {
 727         unsigned int i;
 728         struct rtable *rth, *next;
 729
 730         for (i = 0; i <= rt_hash_mask; i++) {
 731                 struct rtable __rcu **pprev;
 732                 struct rtable *list;
 733
 734                 if (process_context && need_resched())
 735                         cond_resched();
 736                 rth = rcu_access_pointer(rt_hash_table[i].chain);
 737                 if (!rth)
 738                         continue;
 739
 740                 spin_lock_bh(rt_hash_lock_addr(i));
 741
 742                 list = NULL;
 743                 pprev = &rt_hash_table[i].chain;
 744                 rth = rcu_dereference_protected(*pprev,
 745                         lockdep_is_held(rt_hash_lock_addr(i)));
 746
 747                 while (rth) {
 748                         next = rcu_dereference_protected(rth->dst.rt_next,
 749                                 lockdep_is_held(rt_hash_lock_addr(i)));
 750
 751                         if (!net ||
 752                             net_eq(dev_net(rth->dst.dev), net)) {
 753                                 rcu_assign_pointer(*pprev, next);
 754                                 rcu_assign_pointer(rth->dst.rt_next, list);
 755                                 list = rth;
 756                         } else {
 757                                 pprev = &rth->dst.rt_next;
 758                         }
 759                         rth = next;
 760                 }
 761
 762                 spin_unlock_bh(rt_hash_lock_addr(i));
 763
 764                 for (; list; list = next) {
 765                         next = rcu_dereference_protected(list->dst.rt_next, 1);
 766                         rt_free(list);
 767                 }
 768         }
 769 }
 770
 771 /*
 772  * While freeing expired entries, we compute average chain length
 773  * and standard deviation, using fixed-point arithmetic.
 774  * This to have an estimation of rt_chain_length_max
 775  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 776  * We use 3 bits for frational part, and 29 (or 61) for magnitude.
 777  */
 778
 779 #define FRACT_BITS 3
 780 #define ONE (1UL << FRACT_BITS)
 781
 782 /*
 783  * Given a hash chain and an item in this hash chain,
 784  * find if a previous entry has the same hash_inputs
 785  * (but differs on tos, mark or oif)
 786  * Returns 0 if an alias is found.
 787  * Returns ONE if rth has no alias before itself.
 788  */
 789 static int has_noalias(const struct rtable *head, const struct rtable *rth)
 790 {
 791         const struct rtable *aux = head;
 792
 793         while (aux != rth) {
 794                 if (compare_hash_inputs(aux, rth))
 795                         return 0;
 796                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
 797         }
 798         return ONE;
 799 }
 800
 801 static void rt_check_expire(void)
 802 {
 803         static unsigned int rover;
 804         unsigned int i = rover, goal;
 805         struct rtable *rth;
 806         struct rtable __rcu **rthp;
 807         unsigned long samples = 0;
 808         unsigned long sum = 0, sum2 = 0;
 809         unsigned long delta;
 810         u64 mult;
 811
 812         delta = jiffies - expires_ljiffies;
 813         expires_ljiffies = jiffies;
 814         mult = ((u64)delta) << rt_hash_log;
 815         if (ip_rt_gc_timeout > 1)
 816                 do_div(mult, ip_rt_gc_timeout);
 817         goal = (unsigned int)mult;
 818         if (goal > rt_hash_mask)
 819                 goal = rt_hash_mask + 1;
 820         for (; goal > 0; goal--) {
 821                 unsigned long tmo = ip_rt_gc_timeout;
 822                 unsigned long length;
 823
 824                 i = (i + 1) & rt_hash_mask;
 825                 rthp = &rt_hash_table[i].chain;
 826
 827                 if (need_resched())
 828                         cond_resched();
 829
 830                 samples++;
 831
 832                 if (rcu_dereference_raw(*rthp) == NULL)
 833                         continue;
 834                 length = 0;
 835                 spin_lock_bh(rt_hash_lock_addr(i));
 836                 while ((rth = rcu_dereference_protected(*rthp,
 837                                         lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
 838                         prefetch(rth->dst.rt_next);
 839                         if (rt_is_expired(rth) ||
 840                             rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
 841                                 *rthp = rth->dst.rt_next;
 842                                 rt_free(rth);
 843                                 continue;
 844                         }
 845
 846                         /* We only count entries on a chain with equal
 847                          * hash inputs once so that entries for
 848                          * different QOS levels, and other non-hash
 849                          * input attributes don't unfairly skew the
 850                          * length computation
 851                          */
 852                         tmo >>= 1;
 853                         rthp = &rth->dst.rt_next;
 854                         length += has_noalias(rt_hash_table[i].chain, rth);
 855                 }
 856                 spin_unlock_bh(rt_hash_lock_addr(i));
 857                 sum += length;
 858                 sum2 += length*length;
 859         }
 860         if (samples) {
 861                 unsigned long avg = sum / samples;
 862                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
 863                 rt_chain_length_max = max_t(unsigned long,
 864                                         ip_rt_gc_elasticity,
 865                                         (avg + 4*sd) >> FRACT_BITS);
 866         }
 867         rover = i;
 868 }
 869
 870 /*
 871  * rt_worker_func() is run in process context.
 872  * we call rt_check_expire() to scan part of the hash table
 873  */
 874 static void rt_worker_func(struct work_struct *work)
 875 {
 876         rt_check_expire();
 877         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 878 }
 879
 880 /*
 881  * Perturbation of rt_genid by a small quantity [1..256]
 882  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
 883  * many times (2^24) without giving recent rt_genid.
 884  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
 885  */
 886 static void rt_cache_invalidate(struct net *net)
 887 {
 888         unsigned char shuffle;
 889
 890         get_random_bytes(&shuffle, sizeof(shuffle));
 891         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 892 }
 893
 894 /*
 895  * delay < 0  : invalidate cache (fast : entries will be deleted later)
 896  * delay >= 0 : invalidate & flush cache (can be long)
 897  */
 898 void rt_cache_flush(struct net *net, int delay)
 899 {
 900         rt_cache_invalidate(net);
 901         if (delay >= 0)
 902                 rt_do_flush(net, !in_softirq());
 903 }
 904
 905 /* Flush previous cache invalidated entries from the cache */
 906 void rt_cache_flush_batch(struct net *net)
 907 {
 908         rt_do_flush(net, !in_softirq());
 909 }
 910
 911 static void rt_emergency_hash_rebuild(struct net *net)
 912 {
 913         net_warn_ratelimited("Route hash chain too long!\n");
 914         rt_cache_invalidate(net);
 915 }
 916
 917 /*
 918    Short description of GC goals.
 919
 920    We want to build algorithm, which will keep routing cache
 921    at some equilibrium point, when number of aged off entries
 922    is kept approximately equal to newly generated ones.
 923
 924    Current expiration strength is variable "expire".
 925    We try to adjust it dynamically, so that if networking
 926    is idle expires is large enough to keep enough of warm entries,
 927    and when load increases it reduces to limit cache size.
 928  */
 929
 930 static int rt_garbage_collect(struct dst_ops *ops)
 931 {
 932         static unsigned long expire = RT_GC_TIMEOUT;
 933         static unsigned long last_gc;
 934         static int rover;
 935         static int equilibrium;
 936         struct rtable *rth;
 937         struct rtable __rcu **rthp;
 938         unsigned long now = jiffies;
 939         int goal;
 940         int entries = dst_entries_get_fast(&ipv4_dst_ops);
 941
 942         /*
 943          * Garbage collection is pretty expensive,
 944          * do not make it too frequently.
 945          */
 946
 947         RT_CACHE_STAT_INC(gc_total);
 948
 949         if (now - last_gc < ip_rt_gc_min_interval &&
 950             entries < ip_rt_max_size) {
 951                 RT_CACHE_STAT_INC(gc_ignored);
 952                 goto out;
 953         }
 954
 955         entries = dst_entries_get_slow(&ipv4_dst_ops);
 956         /* Calculate number of entries, which we want to expire now. */
 957         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
 958         if (goal <= 0) {
 959                 if (equilibrium < ipv4_dst_ops.gc_thresh)
 960                         equilibrium = ipv4_dst_ops.gc_thresh;
 961                 goal = entries - equilibrium;
 962                 if (goal > 0) {
 963                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 964                         goal = entries - equilibrium;
 965                 }
 966         } else {
 967                 /* We are in dangerous area. Try to reduce cache really
 968                  * aggressively.
 969                  */
 970                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 971                 equilibrium = entries - goal;
 972         }
 973
 974         if (now - last_gc >= ip_rt_gc_min_interval)
 975                 last_gc = now;
 976
 977         if (goal <= 0) {
 978                 equilibrium += goal;
 979                 goto work_done;
 980         }
 981
 982         do {
 983                 int i, k;
 984
 985                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
 986                         unsigned long tmo = expire;
 987
 988                         k = (k + 1) & rt_hash_mask;
 989                         rthp = &rt_hash_table[k].chain;
 990                         spin_lock_bh(rt_hash_lock_addr(k));
 991                         while ((rth = rcu_dereference_protected(*rthp,
 992                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
 993                                 if (!rt_is_expired(rth) &&
 994                                         !rt_may_expire(rth, tmo, expire)) {
 995                                         tmo >>= 1;
 996                                         rthp = &rth->dst.rt_next;
 997                                         continue;
 998                                 }
 999                                 *rthp = rth->dst.rt_next;
1000                                 rt_free(rth);
1001                                 goal--;
1002                         }
1003                         spin_unlock_bh(rt_hash_lock_addr(k));
1004                         if (goal <= 0)
1005                                 break;
1006                 }
1007                 rover = k;
1008
1009                 if (goal <= 0)
1010                         goto work_done;
1011
1012                 /* Goal is not achieved. We stop process if:
1013
1014                    - if expire reduced to zero. Otherwise, expire is halfed.
1015                    - if table is not full.
1016                    - if we are called from interrupt.
1017                    - jiffies check is just fallback/debug loop breaker.
1018                      We will not spin here for long time in any case.
1019                  */
1020
1021                 RT_CACHE_STAT_INC(gc_goal_miss);
1022
1023                 if (expire == 0)
1024                         break;
1025
1026                 expire >>= 1;
1027
1028                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1029                         goto out;
1030         } while (!in_softirq() && time_before_eq(jiffies, now));
1031
1032         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1033                 goto out;
1034         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1035                 goto out;
1036         net_warn_ratelimited("dst cache overflow\n");
1037         RT_CACHE_STAT_INC(gc_dst_overflow);
1038         return 1;
1039
1040 work_done:
1041         expire += ip_rt_gc_min_interval;
1042         if (expire > ip_rt_gc_timeout ||
1043             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1044             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1045                 expire = ip_rt_gc_timeout;
1046 out:    return 0;
1047 }
1048
1049 /*
1050  * Returns number of entries in a hash chain that have different hash_inputs
1051  */
1052 static int slow_chain_length(const struct rtable *head)
1053 {
1054         int length = 0;
1055         const struct rtable *rth = head;
1056
1057         while (rth) {
1058                 length += has_noalias(head, rth);
1059                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1060         }
1061         return length >> FRACT_BITS;
1062 }
1063
1064 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
1065                                            struct sk_buff *skb,
1066                                            const void *daddr)
1067 {
1068         struct net_device *dev = dst->dev;
1069         const __be32 *pkey = daddr;
1070         const struct rtable *rt;
1071         struct neighbour *n;
1072
1073         rt = (const struct rtable *) dst;
1074         if (rt->rt_gateway)
1075                 pkey = (const __be32 *) &rt->rt_gateway;
1076         else if (skb)
1077                 pkey = &ip_hdr(skb)->daddr;
1078
1079         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
1080         if (n)
1081                 return n;
1082         return neigh_create(&arp_tbl, pkey, dev);
1083 }
1084
1085 static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
1086                                      struct sk_buff *skb, int ifindex)
1087 {
1088         struct rtable   *rth, *cand;
1089         struct rtable __rcu **rthp, **candp;
1090         unsigned long   now;
1091         u32             min_score;
1092         int             chain_length;
1093
1094 restart:
1095         chain_length = 0;
1096         min_score = ~(u32)0;
1097         cand = NULL;
1098         candp = NULL;
1099         now = jiffies;
1100
1101         if (!rt_caching(dev_net(rt->dst.dev)) || (rt->dst.flags & DST_NOCACHE)) {
1102                 /*
1103                  * If we're not caching, just tell the caller we
1104                  * were successful and don't touch the route.  The
1105                  * caller hold the sole reference to the cache entry, and
1106                  * it will be released when the caller is done with it.
1107                  * If we drop it here, the callers have no way to resolve routes
1108                  * when we're not caching.  Instead, just point *rp at rt, so
1109                  * the caller gets a single use out of the route
1110                  * Note that we do rt_free on this new route entry, so that
1111                  * once its refcount hits zero, we are still able to reap it
1112                  * (Thanks Alexey)
1113                  * Note: To avoid expensive rcu stuff for this uncached dst,
1114                  * we set DST_NOCACHE so that dst_release() can free dst without
1115                  * waiting a grace period.
1116                  */
1117
1118                 rt->dst.flags |= DST_NOCACHE;
1119                 goto skip_hashing;
1120         }
1121
1122         rthp = &rt_hash_table[hash].chain;
1123
1124         spin_lock_bh(rt_hash_lock_addr(hash));
1125         while ((rth = rcu_dereference_protected(*rthp,
1126                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1127                 if (rt_is_expired(rth)) {
1128                         *rthp = rth->dst.rt_next;
1129                         rt_free(rth);
1130                         continue;
1131                 }
1132                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1133                         /* Put it first */
1134                         *rthp = rth->dst.rt_next;
1135                         /*
1136                          * Since lookup is lockfree, the deletion
1137                          * must be visible to another weakly ordered CPU before
1138                          * the insertion at the start of the hash chain.
1139                          */
1140                         rcu_assign_pointer(rth->dst.rt_next,
1141                                            rt_hash_table[hash].chain);
1142                         /*
1143                          * Since lookup is lockfree, the update writes
1144                          * must be ordered for consistency on SMP.
1145                          */
1146                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1147
1148                         dst_use(&rth->dst, now);
1149                         spin_unlock_bh(rt_hash_lock_addr(hash));
1150
1151                         rt_drop(rt);
1152                         if (skb)
1153                                 skb_dst_set(skb, &rth->dst);
1154                         return rth;
1155                 }
1156
1157                 if (!atomic_read(&rth->dst.__refcnt)) {
1158                         u32 score = rt_score(rth);
1159
1160                         if (score <= min_score) {
1161                                 cand = rth;
1162                                 candp = rthp;
1163                                 min_score = score;
1164                         }
1165                 }
1166
1167                 chain_length++;
1168
1169                 rthp = &rth->dst.rt_next;
1170         }
1171
1172         if (cand) {
1173                 /* ip_rt_gc_elasticity used to be average length of chain
1174                  * length, when exceeded gc becomes really aggressive.
1175                  *
1176                  * The second limit is less certain. At the moment it allows
1177                  * only 2 entries per bucket. We will see.
1178                  */
1179                 if (chain_length > ip_rt_gc_elasticity) {
1180                         *candp = cand->dst.rt_next;
1181                         rt_free(cand);
1182                 }
1183         } else {
1184                 if (chain_length > rt_chain_length_max &&
1185                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1186                         struct net *net = dev_net(rt->dst.dev);
1187                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1188                         if (!rt_caching(net)) {
1189                                 pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
1190                                         rt->dst.dev->name, num);
1191                         }
1192                         rt_emergency_hash_rebuild(net);
1193                         spin_unlock_bh(rt_hash_lock_addr(hash));
1194
1195                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1196                                         ifindex, rt_genid(net));
1197                         goto restart;
1198                 }
1199         }
1200
1201         rt->dst.rt_next = rt_hash_table[hash].chain;
1202
1203         /*
1204          * Since lookup is lockfree, we must make sure
1205          * previous writes to rt are committed to memory
1206          * before making rt visible to other CPUS.
1207          */
1208         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1209
1210         spin_unlock_bh(rt_hash_lock_addr(hash));
1211
1212 skip_hashing:
1213         if (skb)
1214                 skb_dst_set(skb, &rt->dst);
1215         return rt;
1216 }
1217
1218 /*
1219  * Peer allocation may fail only in serious out-of-memory conditions.  However
1220  * we still can generate some output.
1221  * Random ID selection looks a bit dangerous because we have no chances to
1222  * select ID being unique in a reasonable period of time.
1223  * But broken packet identifier may be better than no packet at all.
1224  */
1225 static void ip_select_fb_ident(struct iphdr *iph)
1226 {
1227         static DEFINE_SPINLOCK(ip_fb_id_lock);
1228         static u32 ip_fallback_id;
1229         u32 salt;
1230
1231         spin_lock_bh(&ip_fb_id_lock);
1232         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1233         iph->id = htons(salt & 0xFFFF);
1234         ip_fallback_id = salt;
1235         spin_unlock_bh(&ip_fb_id_lock);
1236 }
1237
1238 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1239 {
1240         struct net *net = dev_net(dst->dev);
1241         struct inet_peer *peer;
1242
1243         peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
1244         if (peer) {
1245                 iph->id = htons(inet_getid(peer, more));
1246                 inet_putpeer(peer);
1247                 return;
1248         }
1249
1250         ip_select_fb_ident(iph);
1251 }
1252 EXPORT_SYMBOL(__ip_select_ident);
1253
1254 static void rt_del(unsigned int hash, struct rtable *rt)
1255 {
1256         struct rtable __rcu **rthp;
1257         struct rtable *aux;
1258
1259         rthp = &rt_hash_table[hash].chain;
1260         spin_lock_bh(rt_hash_lock_addr(hash));
1261         ip_rt_put(rt);
1262         while ((aux = rcu_dereference_protected(*rthp,
1263                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1264                 if (aux == rt || rt_is_expired(aux)) {
1265                         *rthp = aux->dst.rt_next;
1266                         rt_free(aux);
1267                         continue;
1268                 }
1269                 rthp = &aux->dst.rt_next;
1270         }
1271         spin_unlock_bh(rt_hash_lock_addr(hash));
1272 }
1273
1274 /* called in rcu_read_lock() section */
1275 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1276                     __be32 saddr, struct net_device *dev)
1277 {
1278         int s, i;
1279         struct in_device *in_dev = __in_dev_get_rcu(dev);
1280         __be32 skeys[2] = { saddr, 0 };
1281         int    ikeys[2] = { dev->ifindex, 0 };
1282         struct net *net;
1283
1284         if (!in_dev)
1285                 return;
1286
1287         net = dev_net(dev);
1288         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1289             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1290             ipv4_is_zeronet(new_gw))
1291                 goto reject_redirect;
1292
1293         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1294                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1295                         goto reject_redirect;
1296                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1297                         goto reject_redirect;
1298         } else {
1299                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1300                         goto reject_redirect;
1301         }
1302
1303         for (s = 0; s < 2; s++) {
1304                 for (i = 0; i < 2; i++) {
1305                         unsigned int hash;
1306                         struct rtable __rcu **rthp;
1307                         struct rtable *rt;
1308
1309                         hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1310
1311                         rthp = &rt_hash_table[hash].chain;
1312
1313                         while ((rt = rcu_dereference(*rthp)) != NULL) {
1314                                 struct neighbour *n;
1315
1316                                 rthp = &rt->dst.rt_next;
1317
1318                                 if (rt->rt_key_dst != daddr ||
1319                                     rt->rt_key_src != skeys[s] ||
1320                                     rt->rt_oif != ikeys[i] ||
1321                                     rt_is_input_route(rt) ||
1322                                     rt_is_expired(rt) ||
1323                                     !net_eq(dev_net(rt->dst.dev), net) ||
1324                                     rt->dst.error ||
1325                                     rt->dst.dev != dev ||
1326                                     rt->rt_gateway != old_gw)
1327                                         continue;
1328
1329                                 n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
1330                                 if (n) {
1331                                         if (!(n->nud_state & NUD_VALID)) {
1332                                                 neigh_event_send(n, NULL);
1333                                         } else {
1334                                                 rt->rt_gateway = new_gw;
1335                                                 rt->rt_flags |= RTCF_REDIRECTED;
1336                                                 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1337                                         }
1338                                         neigh_release(n);
1339                                 }
1340                         }
1341                 }
1342         }
1343         return;
1344
1345 reject_redirect:
1346 #ifdef CONFIG_IP_ROUTE_VERBOSE
1347         if (IN_DEV_LOG_MARTIANS(in_dev))
1348                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
1349                                      "  Advised path = %pI4 -> %pI4\n",
1350                                      &old_gw, dev->name, &new_gw,
1351                                      &saddr, &daddr);
1352 #endif
1353         ;
1354 }
1355
1356 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1357 {
1358         struct rtable *rt = (struct rtable *)dst;
1359         struct dst_entry *ret = dst;
1360
1361         if (rt) {
1362                 if (dst->obsolete > 0) {
1363                         ip_rt_put(rt);
1364                         ret = NULL;
1365                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1366                            rt->dst.expires) {
1367                         unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1368                                                 rt->rt_oif,
1369                                                 rt_genid(dev_net(dst->dev)));
1370                         rt_del(hash, rt);
1371                         ret = NULL;
1372                 }
1373         }
1374         return ret;
1375 }
1376
1377 /*
1378  * Algorithm:
1379  *      1. The first ip_rt_redirect_number redirects are sent
1380  *         with exponential backoff, then we stop sending them at all,
1381  *         assuming that the host ignores our redirects.
1382  *      2. If we did not see packets requiring redirects
1383  *         during ip_rt_redirect_silence, we assume that the host
1384  *         forgot redirected route and start to send redirects again.
1385  *
1386  * This algorithm is much cheaper and more intelligent than dumb load limiting
1387  * in icmp.c.
1388  *
1389  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1390  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1391  */
1392
1393 void ip_rt_send_redirect(struct sk_buff *skb)
1394 {
1395         struct rtable *rt = skb_rtable(skb);
1396         struct in_device *in_dev;
1397         struct inet_peer *peer;
1398         struct net *net;
1399         int log_martians;
1400
1401         rcu_read_lock();
1402         in_dev = __in_dev_get_rcu(rt->dst.dev);
1403         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1404                 rcu_read_unlock();
1405                 return;
1406         }
1407         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1408         rcu_read_unlock();
1409
1410         net = dev_net(rt->dst.dev);
1411         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
1412         if (!peer) {
1413                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1414                 return;
1415         }
1416
1417         /* No redirected packets during ip_rt_redirect_silence;
1418          * reset the algorithm.
1419          */
1420         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1421                 peer->rate_tokens = 0;
1422
1423         /* Too many ignored redirects; do not send anything
1424          * set dst.rate_last to the last seen redirected packet.
1425          */
1426         if (peer->rate_tokens >= ip_rt_redirect_number) {
1427                 peer->rate_last = jiffies;
1428                 goto out_put_peer;
1429         }
1430
1431         /* Check for load limit; set rate_last to the latest sent
1432          * redirect.
1433          */
1434         if (peer->rate_tokens == 0 ||
1435             time_after(jiffies,
1436                        (peer->rate_last +
1437                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1438                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1439                 peer->rate_last = jiffies;
1440                 ++peer->rate_tokens;
1441 #ifdef CONFIG_IP_ROUTE_VERBOSE
1442                 if (log_martians &&
1443                     peer->rate_tokens == ip_rt_redirect_number)
1444                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
1445                                              &ip_hdr(skb)->saddr, rt->rt_iif,
1446                                              &rt->rt_dst, &rt->rt_gateway);
1447 #endif
1448         }
1449 out_put_peer:
1450         inet_putpeer(peer);
1451 }
1452
1453 static int ip_error(struct sk_buff *skb)
1454 {
1455         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
1456         struct rtable *rt = skb_rtable(skb);
1457         struct inet_peer *peer;
1458         unsigned long now;
1459         struct net *net;
1460         bool send;
1461         int code;
1462
1463         net = dev_net(rt->dst.dev);
1464         if (!IN_DEV_FORWARD(in_dev)) {
1465                 switch (rt->dst.error) {
1466                 case EHOSTUNREACH:
1467                         IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
1468                         break;
1469
1470                 case ENETUNREACH:
1471                         IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
1472                         break;
1473                 }
1474                 goto out;
1475         }
1476
1477         switch (rt->dst.error) {
1478         case EINVAL:
1479         default:
1480                 goto out;
1481         case EHOSTUNREACH:
1482                 code = ICMP_HOST_UNREACH;
1483                 break;
1484         case ENETUNREACH:
1485                 code = ICMP_NET_UNREACH;
1486                 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
1487                 break;
1488         case EACCES:
1489                 code = ICMP_PKT_FILTERED;
1490                 break;
1491         }
1492
1493         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
1494
1495         send = true;
1496         if (peer) {
1497                 now = jiffies;
1498                 peer->rate_tokens += now - peer->rate_last;
1499                 if (peer->rate_tokens > ip_rt_error_burst)
1500                         peer->rate_tokens = ip_rt_error_burst;
1501                 peer->rate_last = now;
1502                 if (peer->rate_tokens >= ip_rt_error_cost)
1503                         peer->rate_tokens -= ip_rt_error_cost;
1504                 else
1505                         send = false;
1506                 inet_putpeer(peer);
1507         }
1508         if (send)
1509                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1510
1511 out:    kfree_skb(skb);
1512         return 0;
1513 }
1514
1515 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1516 {
1517         struct rtable *rt = (struct rtable *) dst;
1518
1519         dst_confirm(dst);
1520
1521         if (mtu < ip_rt_min_pmtu)
1522                 mtu = ip_rt_min_pmtu;
1523
1524         rt->rt_pmtu = mtu;
1525         dst_set_expires(&rt->dst, ip_rt_mtu_expires);
1526 }
1527
1528 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1529                       int oif, u32 mark, u8 protocol, int flow_flags)
1530 {
1531         const struct iphdr *iph = (const struct iphdr *)skb->data;
1532         struct flowi4 fl4;
1533         struct rtable *rt;
1534
1535         flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
1536                            protocol, flow_flags,
1537                            iph->daddr, iph->saddr, 0, 0);
1538         rt = __ip_route_output_key(net, &fl4);
1539         if (!IS_ERR(rt)) {
1540                 ip_rt_update_pmtu(&rt->dst, mtu);
1541                 ip_rt_put(rt);
1542         }
1543 }
1544 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1545
1546 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1547 {
1548         const struct inet_sock *inet = inet_sk(sk);
1549
1550         return ipv4_update_pmtu(skb, sock_net(sk), mtu,
1551                                 sk->sk_bound_dev_if, sk->sk_mark,
1552                                 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
1553                                 inet_sk_flowi_flags(sk));
1554 }
1555 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1556
1557 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1558 {
1559         struct rtable *rt = (struct rtable *) dst;
1560
1561         if (rt_is_expired(rt))
1562                 return NULL;
1563         return dst;
1564 }
1565
1566 static void ipv4_dst_destroy(struct dst_entry *dst)
1567 {
1568         struct rtable *rt = (struct rtable *) dst;
1569
1570         if (rt->fi) {
1571                 fib_info_put(rt->fi);
1572                 rt->fi = NULL;
1573         }
1574 }
1575
1576
1577 static void ipv4_link_failure(struct sk_buff *skb)
1578 {
1579         struct rtable *rt;
1580
1581         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1582
1583         rt = skb_rtable(skb);
1584         if (rt)
1585                 dst_set_expires(&rt->dst, 0);
1586 }
1587
1588 static int ip_rt_bug(struct sk_buff *skb)
1589 {
1590         pr_debug("%s: %pI4 -> %pI4, %s\n",
1591                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1592                  skb->dev ? skb->dev->name : "?");
1593         kfree_skb(skb);
1594         WARN_ON(1);
1595         return 0;
1596 }
1597
1598 /*
1599    We do not cache source address of outgoing interface,
1600    because it is used only by IP RR, TS and SRR options,
1601    so that it out of fast path.
1602
1603    BTW remember: "addr" is allowed to be not aligned
1604    in IP options!
1605  */
1606
1607 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1608 {
1609         __be32 src;
1610
1611         if (rt_is_output_route(rt))
1612                 src = ip_hdr(skb)->saddr;
1613         else {
1614                 struct fib_result res;
1615                 struct flowi4 fl4;
1616                 struct iphdr *iph;
1617
1618                 iph = ip_hdr(skb);
1619
1620                 memset(&fl4, 0, sizeof(fl4));
1621                 fl4.daddr = iph->daddr;
1622                 fl4.saddr = iph->saddr;
1623                 fl4.flowi4_tos = RT_TOS(iph->tos);
1624                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1625                 fl4.flowi4_iif = skb->dev->ifindex;
1626                 fl4.flowi4_mark = skb->mark;
1627
1628                 rcu_read_lock();
1629                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1630                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1631                 else
1632                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1633                                         RT_SCOPE_UNIVERSE);
1634                 rcu_read_unlock();
1635         }
1636         memcpy(addr, &src, 4);
1637 }
1638
1639 #ifdef CONFIG_IP_ROUTE_CLASSID
1640 static void set_class_tag(struct rtable *rt, u32 tag)
1641 {
1642         if (!(rt->dst.tclassid & 0xFFFF))
1643                 rt->dst.tclassid |= tag & 0xFFFF;
1644         if (!(rt->dst.tclassid & 0xFFFF0000))
1645                 rt->dst.tclassid |= tag & 0xFFFF0000;
1646 }
1647 #endif
1648
1649 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1650 {
1651         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1652
1653         if (advmss == 0) {
1654                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1655                                ip_rt_min_advmss);
1656                 if (advmss > 65535 - 40)
1657                         advmss = 65535 - 40;
1658         }
1659         return advmss;
1660 }
1661
1662 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1663 {
1664         const struct rtable *rt = (const struct rtable *) dst;
1665         unsigned int mtu = rt->rt_pmtu;
1666
1667         if (mtu && time_after_eq(jiffies, rt->dst.expires))
1668                 mtu = 0;
1669
1670         if (!mtu)
1671                 mtu = dst_metric_raw(dst, RTAX_MTU);
1672
1673         if (mtu && rt_is_output_route(rt))
1674                 return mtu;
1675
1676         mtu = dst->dev->mtu;
1677
1678         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1679
1680                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1681                         mtu = 576;
1682         }
1683
1684         if (mtu > IP_MAX_MTU)
1685                 mtu = IP_MAX_MTU;
1686
1687         return mtu;
1688 }
1689
1690 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1691                             struct fib_info *fi)
1692 {
1693         if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1694                 rt->fi = fi;
1695                 atomic_inc(&fi->fib_clntref);
1696         }
1697         dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1698 }
1699
1700 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1701                            const struct fib_result *res,
1702                            struct fib_info *fi, u16 type, u32 itag)
1703 {
1704         if (fi) {
1705                 if (FIB_RES_GW(*res) &&
1706                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1707                         rt->rt_gateway = FIB_RES_GW(*res);
1708                 rt_init_metrics(rt, fl4, fi);
1709 #ifdef CONFIG_IP_ROUTE_CLASSID
1710                 rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1711 #endif
1712         }
1713
1714 #ifdef CONFIG_IP_ROUTE_CLASSID
1715 #ifdef CONFIG_IP_MULTIPLE_TABLES
1716         set_class_tag(rt, fib_rules_tclass(res));
1717 #endif
1718         set_class_tag(rt, itag);
1719 #endif
1720 }
1721
1722 static struct rtable *rt_dst_alloc(struct net_device *dev,
1723                                    bool nopolicy, bool noxfrm)
1724 {
1725         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1726                          DST_HOST |
1727                          (nopolicy ? DST_NOPOLICY : 0) |
1728                          (noxfrm ? DST_NOXFRM : 0));
1729 }
1730
1731 /* called in rcu_read_lock() section */
1732 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1733                                 u8 tos, struct net_device *dev, int our)
1734 {
1735         unsigned int hash;
1736         struct rtable *rth;
1737         struct in_device *in_dev = __in_dev_get_rcu(dev);
1738         u32 itag = 0;
1739         int err;
1740
1741         /* Primary sanity checks. */
1742
1743         if (in_dev == NULL)
1744                 return -EINVAL;
1745
1746         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1747             skb->protocol != htons(ETH_P_IP))
1748                 goto e_inval;
1749
1750         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1751                 if (ipv4_is_loopback(saddr))
1752                         goto e_inval;
1753
1754         if (ipv4_is_zeronet(saddr)) {
1755                 if (!ipv4_is_local_multicast(daddr))
1756                         goto e_inval;
1757         } else {
1758                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1759                                           in_dev, &itag);
1760                 if (err < 0)
1761                         goto e_err;
1762         }
1763         rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1764                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1765         if (!rth)
1766                 goto e_nobufs;
1767
1768 #ifdef CONFIG_IP_ROUTE_CLASSID
1769         rth->dst.tclassid = itag;
1770 #endif
1771         rth->dst.output = ip_rt_bug;
1772
1773         rth->rt_key_dst = daddr;
1774         rth->rt_key_src = saddr;
1775         rth->rt_genid   = rt_genid(dev_net(dev));
1776         rth->rt_flags   = RTCF_MULTICAST;
1777         rth->rt_type    = RTN_MULTICAST;
1778         rth->rt_key_tos = tos;
1779         rth->rt_dst     = daddr;
1780         rth->rt_src     = saddr;
1781         rth->rt_route_iif = dev->ifindex;
1782         rth->rt_iif     = dev->ifindex;
1783         rth->rt_oif     = 0;
1784         rth->rt_mark    = skb->mark;
1785         rth->rt_pmtu    = 0;
1786         rth->rt_gateway = daddr;
1787         rth->fi = NULL;
1788         if (our) {
1789                 rth->dst.input= ip_local_deliver;
1790                 rth->rt_flags |= RTCF_LOCAL;
1791         }
1792
1793 #ifdef CONFIG_IP_MROUTE
1794         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1795                 rth->dst.input = ip_mr_input;
1796 #endif
1797         RT_CACHE_STAT_INC(in_slow_mc);
1798
1799         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1800         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
1801         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
1802
1803 e_nobufs:
1804         return -ENOBUFS;
1805 e_inval:
1806         return -EINVAL;
1807 e_err:
1808         return err;
1809 }
1810
1811
1812 static void ip_handle_martian_source(struct net_device *dev,
1813                                      struct in_device *in_dev,
1814                                      struct sk_buff *skb,
1815                                      __be32 daddr,
1816                                      __be32 saddr)
1817 {
1818         RT_CACHE_STAT_INC(in_martian_src);
1819 #ifdef CONFIG_IP_ROUTE_VERBOSE
1820         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1821                 /*
1822                  *      RFC1812 recommendation, if source is martian,
1823                  *      the only hint is MAC header.
1824                  */
1825                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1826                         &daddr, &saddr, dev->name);
1827                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1828                         print_hex_dump(KERN_WARNING, "ll header: ",
1829                                        DUMP_PREFIX_OFFSET, 16, 1,
1830                                        skb_mac_header(skb),
1831                                        dev->hard_header_len, true);
1832                 }
1833         }
1834 #endif
1835 }
1836
1837 /* called in rcu_read_lock() section */
1838 static int __mkroute_input(struct sk_buff *skb,
1839                            const struct fib_result *res,
1840                            struct in_device *in_dev,
1841                            __be32 daddr, __be32 saddr, u32 tos,
1842                            struct rtable **result)
1843 {
1844         struct rtable *rth;
1845         int err;
1846         struct in_device *out_dev;
1847         unsigned int flags = 0;
1848         u32 itag;
1849
1850         /* get a working reference to the output device */
1851         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1852         if (out_dev == NULL) {
1853                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1854                 return -EINVAL;
1855         }
1856
1857
1858         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1859                                   in_dev->dev, in_dev, &itag);
1860         if (err < 0) {
1861                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1862                                          saddr);
1863
1864                 goto cleanup;
1865         }
1866
1867         if (err)
1868                 flags |= RTCF_DIRECTSRC;
1869
1870         if (out_dev == in_dev && err &&
1871             (IN_DEV_SHARED_MEDIA(out_dev) ||
1872              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1873                 flags |= RTCF_DOREDIRECT;
1874
1875         if (skb->protocol != htons(ETH_P_IP)) {
1876                 /* Not IP (i.e. ARP). Do not create route, if it is
1877                  * invalid for proxy arp. DNAT routes are always valid.
1878                  *
1879                  * Proxy arp feature have been extended to allow, ARP
1880                  * replies back to the same interface, to support
1881                  * Private VLAN switch technologies. See arp.c.
1882                  */
1883                 if (out_dev == in_dev &&
1884                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1885                         err = -EINVAL;
1886                         goto cleanup;
1887                 }
1888         }
1889
1890         rth = rt_dst_alloc(out_dev->dev,
1891                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1892                            IN_DEV_CONF_GET(out_dev, NOXFRM));
1893         if (!rth) {
1894                 err = -ENOBUFS;
1895                 goto cleanup;
1896         }
1897
1898         rth->rt_key_dst = daddr;
1899         rth->rt_key_src = saddr;
1900         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1901         rth->rt_flags = flags;
1902         rth->rt_type = res->type;
1903         rth->rt_key_tos = tos;
1904         rth->rt_dst     = daddr;
1905         rth->rt_src     = saddr;
1906         rth->rt_route_iif = in_dev->dev->ifindex;
1907         rth->rt_iif     = in_dev->dev->ifindex;
1908         rth->rt_oif     = 0;
1909         rth->rt_mark    = skb->mark;
1910         rth->rt_pmtu    = 0;
1911         rth->rt_gateway = daddr;
1912         rth->fi = NULL;
1913
1914         rth->dst.input = ip_forward;
1915         rth->dst.output = ip_output;
1916
1917         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
1918
1919         *result = rth;
1920         err = 0;
1921  cleanup:
1922         return err;
1923 }
1924
1925 static int ip_mkroute_input(struct sk_buff *skb,
1926                             struct fib_result *res,
1927                             const struct flowi4 *fl4,
1928                             struct in_device *in_dev,
1929                             __be32 daddr, __be32 saddr, u32 tos)
1930 {
1931         struct rtable *rth = NULL;
1932         int err;
1933         unsigned int hash;
1934
1935 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1936         if (res->fi && res->fi->fib_nhs > 1)
1937                 fib_select_multipath(res);
1938 #endif
1939
1940         /* create a routing cache entry */
1941         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1942         if (err)
1943                 return err;
1944
1945         /* put it into the cache */
1946         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
1947                        rt_genid(dev_net(rth->dst.dev)));
1948         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
1949         if (IS_ERR(rth))
1950                 return PTR_ERR(rth);
1951         return 0;
1952 }
1953
1954 /*
1955  *      NOTE. We drop all the packets that has local source
1956  *      addresses, because every properly looped back packet
1957  *      must have correct destination already attached by output routine.
1958  *
1959  *      Such approach solves two big problems:
1960  *      1. Not simplex devices are handled properly.
1961  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1962  *      called with rcu_read_lock()
1963  */
1964
1965 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1966                                u8 tos, struct net_device *dev)
1967 {
1968         struct fib_result res;
1969         struct in_device *in_dev = __in_dev_get_rcu(dev);
1970         struct flowi4   fl4;
1971         unsigned int    flags = 0;
1972         u32             itag = 0;
1973         struct rtable   *rth;
1974         unsigned int    hash;
1975         int             err = -EINVAL;
1976         struct net    *net = dev_net(dev);
1977
1978         /* IP on this device is disabled. */
1979
1980         if (!in_dev)
1981                 goto out;
1982
1983         /* Check for the most weird martians, which can be not detected
1984            by fib_lookup.
1985          */
1986
1987         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1988                 goto martian_source;
1989
1990         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1991                 goto brd_input;
1992
1993         /* Accept zero addresses only to limited broadcast;
1994          * I even do not know to fix it or not. Waiting for complains :-)
1995          */
1996         if (ipv4_is_zeronet(saddr))
1997                 goto martian_source;
1998
1999         if (ipv4_is_zeronet(daddr))
2000                 goto martian_destination;
2001
2002         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
2003                 if (ipv4_is_loopback(daddr))
2004                         goto martian_destination;
2005
2006                 if (ipv4_is_loopback(saddr))
2007                         goto martian_source;
2008         }
2009
2010         /*
2011          *      Now we are ready to route packet.
2012          */
2013         fl4.flowi4_oif = 0;
2014         fl4.flowi4_iif = dev->ifindex;
2015         fl4.flowi4_mark = skb->mark;
2016         fl4.flowi4_tos = tos;
2017         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2018         fl4.daddr = daddr;
2019         fl4.saddr = saddr;
2020         err = fib_lookup(net, &fl4, &res);
2021         if (err != 0)
2022                 goto no_route;
2023
2024         RT_CACHE_STAT_INC(in_slow_tot);
2025
2026         if (res.type == RTN_BROADCAST)
2027                 goto brd_input;
2028
2029         if (res.type == RTN_LOCAL) {
2030                 err = fib_validate_source(skb, saddr, daddr, tos,
2031                                           net->loopback_dev->ifindex,
2032                                           dev, in_dev, &itag);
2033                 if (err < 0)
2034                         goto martian_source_keep_err;
2035                 if (err)
2036                         flags |= RTCF_DIRECTSRC;
2037                 goto local_input;
2038         }
2039
2040         if (!IN_DEV_FORWARD(in_dev))
2041                 goto no_route;
2042         if (res.type != RTN_UNICAST)
2043                 goto martian_destination;
2044
2045         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2046 out:    return err;
2047
2048 brd_input:
2049         if (skb->protocol != htons(ETH_P_IP))
2050                 goto e_inval;
2051
2052         if (!ipv4_is_zeronet(saddr)) {
2053                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2054                                           in_dev, &itag);
2055                 if (err < 0)
2056                         goto martian_source_keep_err;
2057                 if (err)
2058                         flags |= RTCF_DIRECTSRC;
2059         }
2060         flags |= RTCF_BROADCAST;
2061         res.type = RTN_BROADCAST;
2062         RT_CACHE_STAT_INC(in_brd);
2063
2064 local_input:
2065         rth = rt_dst_alloc(net->loopback_dev,
2066                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2067         if (!rth)
2068                 goto e_nobufs;
2069
2070         rth->dst.input= ip_local_deliver;
2071         rth->dst.output= ip_rt_bug;
2072 #ifdef CONFIG_IP_ROUTE_CLASSID
2073         rth->dst.tclassid = itag;
2074 #endif
2075
2076         rth->rt_key_dst = daddr;
2077         rth->rt_key_src = saddr;
2078         rth->rt_genid = rt_genid(net);
2079         rth->rt_flags   = flags|RTCF_LOCAL;
2080         rth->rt_type    = res.type;
2081         rth->rt_key_tos = tos;
2082         rth->rt_dst     = daddr;
2083         rth->rt_src     = saddr;
2084         rth->rt_route_iif = dev->ifindex;
2085         rth->rt_iif     = dev->ifindex;
2086         rth->rt_oif     = 0;
2087         rth->rt_mark    = skb->mark;
2088         rth->rt_pmtu    = 0;
2089         rth->rt_gateway = daddr;
2090         rth->fi = NULL;
2091         if (res.type == RTN_UNREACHABLE) {
2092                 rth->dst.input= ip_error;
2093                 rth->dst.error= -err;
2094                 rth->rt_flags   &= ~RTCF_LOCAL;
2095         }
2096         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2097         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2098         err = 0;
2099         if (IS_ERR(rth))
2100                 err = PTR_ERR(rth);
2101         goto out;
2102
2103 no_route:
2104         RT_CACHE_STAT_INC(in_no_route);
2105         res.type = RTN_UNREACHABLE;
2106         if (err == -ESRCH)
2107                 err = -ENETUNREACH;
2108         goto local_input;
2109
2110         /*
2111          *      Do not cache martian addresses: they should be logged (RFC1812)
2112          */
2113 martian_destination:
2114         RT_CACHE_STAT_INC(in_martian_dst);
2115 #ifdef CONFIG_IP_ROUTE_VERBOSE
2116         if (IN_DEV_LOG_MARTIANS(in_dev))
2117                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2118                                      &daddr, &saddr, dev->name);
2119 #endif
2120
2121 e_inval:
2122         err = -EINVAL;
2123         goto out;
2124
2125 e_nobufs:
2126         err = -ENOBUFS;
2127         goto out;
2128
2129 martian_source:
2130         err = -EINVAL;
2131 martian_source_keep_err:
2132         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2133         goto out;
2134 }
2135
2136 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2137                            u8 tos, struct net_device *dev, bool noref)
2138 {
2139         struct rtable   *rth;
2140         unsigned int    hash;
2141         int iif = dev->ifindex;
2142         struct net *net;
2143         int res;
2144
2145         net = dev_net(dev);
2146
2147         rcu_read_lock();
2148
2149         if (!rt_caching(net))
2150                 goto skip_cache;
2151
2152         tos &= IPTOS_RT_MASK;
2153         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2154
2155         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2156              rth = rcu_dereference(rth->dst.rt_next)) {
2157                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2158                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2159                      (rth->rt_route_iif ^ iif) |
2160                      (rth->rt_key_tos ^ tos)) == 0 &&
2161                     rth->rt_mark == skb->mark &&
2162                     net_eq(dev_net(rth->dst.dev), net) &&
2163                     !rt_is_expired(rth)) {
2164                         if (noref) {
2165                                 dst_use_noref(&rth->dst, jiffies);
2166                                 skb_dst_set_noref(skb, &rth->dst);
2167                         } else {
2168                                 dst_use(&rth->dst, jiffies);
2169                                 skb_dst_set(skb, &rth->dst);
2170                         }
2171                         RT_CACHE_STAT_INC(in_hit);
2172                         rcu_read_unlock();
2173                         return 0;
2174                 }
2175                 RT_CACHE_STAT_INC(in_hlist_search);
2176         }
2177
2178 skip_cache:
2179         /* Multicast recognition logic is moved from route cache to here.
2180            The problem was that too many Ethernet cards have broken/missing
2181            hardware multicast filters :-( As result the host on multicasting
2182            network acquires a lot of useless route cache entries, sort of
2183            SDR messages from all the world. Now we try to get rid of them.
2184            Really, provided software IP multicast filter is organized
2185            reasonably (at least, hashed), it does not result in a slowdown
2186            comparing with route cache reject entries.
2187            Note, that multicast routers are not affected, because
2188            route cache entry is created eventually.
2189          */
2190         if (ipv4_is_multicast(daddr)) {
2191                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2192
2193                 if (in_dev) {
2194                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2195                                                   ip_hdr(skb)->protocol);
2196                         if (our
2197 #ifdef CONFIG_IP_MROUTE
2198                                 ||
2199                             (!ipv4_is_local_multicast(daddr) &&
2200                              IN_DEV_MFORWARD(in_dev))
2201 #endif
2202                            ) {
2203                                 int res = ip_route_input_mc(skb, daddr, saddr,
2204                                                             tos, dev, our);
2205                                 rcu_read_unlock();
2206                                 return res;
2207                         }
2208                 }
2209                 rcu_read_unlock();
2210                 return -EINVAL;
2211         }
2212         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2213         rcu_read_unlock();
2214         return res;
2215 }
2216 EXPORT_SYMBOL(ip_route_input_common);
2217
2218 /* called with rcu_read_lock() */
2219 static struct rtable *__mkroute_output(const struct fib_result *res,
2220                                        const struct flowi4 *fl4,
2221                                        __be32 orig_daddr, __be32 orig_saddr,
2222                                        int orig_oif, __u8 orig_rtos,
2223                                        struct net_device *dev_out,
2224                                        unsigned int flags)
2225 {
2226         struct fib_info *fi = res->fi;
2227         struct in_device *in_dev;
2228         u16 type = res->type;
2229         struct rtable *rth;
2230
2231         in_dev = __in_dev_get_rcu(dev_out);
2232         if (!in_dev)
2233                 return ERR_PTR(-EINVAL);
2234
2235         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2236                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2237                         return ERR_PTR(-EINVAL);
2238
2239         if (ipv4_is_lbcast(fl4->daddr))
2240                 type = RTN_BROADCAST;
2241         else if (ipv4_is_multicast(fl4->daddr))
2242                 type = RTN_MULTICAST;
2243         else if (ipv4_is_zeronet(fl4->daddr))
2244                 return ERR_PTR(-EINVAL);
2245
2246         if (dev_out->flags & IFF_LOOPBACK)
2247                 flags |= RTCF_LOCAL;
2248
2249         if (type == RTN_BROADCAST) {
2250                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2251                 fi = NULL;
2252         } else if (type == RTN_MULTICAST) {
2253                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2254                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2255                                      fl4->flowi4_proto))
2256                         flags &= ~RTCF_LOCAL;
2257                 /* If multicast route do not exist use
2258                  * default one, but do not gateway in this case.
2259                  * Yes, it is hack.
2260                  */
2261                 if (fi && res->prefixlen < 4)
2262                         fi = NULL;
2263         }
2264
2265         rth = rt_dst_alloc(dev_out,
2266                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2267                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2268         if (!rth)
2269                 return ERR_PTR(-ENOBUFS);
2270
2271         rth->dst.output = ip_output;
2272
2273         rth->rt_key_dst = orig_daddr;
2274         rth->rt_key_src = orig_saddr;
2275         rth->rt_genid = rt_genid(dev_net(dev_out));
2276         rth->rt_flags   = flags;
2277         rth->rt_type    = type;
2278         rth->rt_key_tos = orig_rtos;
2279         rth->rt_dst     = fl4->daddr;
2280         rth->rt_src     = fl4->saddr;
2281         rth->rt_route_iif = 0;
2282         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2283         rth->rt_oif     = orig_oif;
2284         rth->rt_mark    = fl4->flowi4_mark;
2285         rth->rt_pmtu    = 0;
2286         rth->rt_gateway = fl4->daddr;
2287         rth->fi = NULL;
2288
2289         RT_CACHE_STAT_INC(out_slow_tot);
2290
2291         if (flags & RTCF_LOCAL)
2292                 rth->dst.input = ip_local_deliver;
2293         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2294                 if (flags & RTCF_LOCAL &&
2295                     !(dev_out->flags & IFF_LOOPBACK)) {
2296                         rth->dst.output = ip_mc_output;
2297                         RT_CACHE_STAT_INC(out_slow_mc);
2298                 }
2299 #ifdef CONFIG_IP_MROUTE
2300                 if (type == RTN_MULTICAST) {
2301                         if (IN_DEV_MFORWARD(in_dev) &&
2302                             !ipv4_is_local_multicast(fl4->daddr)) {
2303                                 rth->dst.input = ip_mr_input;
2304                                 rth->dst.output = ip_mc_output;
2305                         }
2306                 }
2307 #endif
2308         }
2309
2310         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2311
2312         if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE)
2313                 rth->dst.flags |= DST_NOCACHE;
2314
2315         return rth;
2316 }
2317
2318 /*
2319  * Major route resolver routine.
2320  * called with rcu_read_lock();
2321  */
2322
2323 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2324 {
2325         struct net_device *dev_out = NULL;
2326         __u8 tos = RT_FL_TOS(fl4);
2327         unsigned int flags = 0;
2328         struct fib_result res;
2329         struct rtable *rth;
2330         __be32 orig_daddr;
2331         __be32 orig_saddr;
2332         int orig_oif;
2333
2334         res.fi          = NULL;
2335         res.table       = NULL;
2336 #ifdef CONFIG_IP_MULTIPLE_TABLES
2337         res.r           = NULL;
2338 #endif
2339
2340         orig_daddr = fl4->daddr;
2341         orig_saddr = fl4->saddr;
2342         orig_oif = fl4->flowi4_oif;
2343
2344         fl4->flowi4_iif = net->loopback_dev->ifindex;
2345         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2346         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2347                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2348
2349         rcu_read_lock();
2350         if (fl4->saddr) {
2351                 rth = ERR_PTR(-EINVAL);
2352                 if (ipv4_is_multicast(fl4->saddr) ||
2353                     ipv4_is_lbcast(fl4->saddr) ||
2354                     ipv4_is_zeronet(fl4->saddr))
2355                         goto out;
2356
2357                 /* I removed check for oif == dev_out->oif here.
2358                    It was wrong for two reasons:
2359                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2360                       is assigned to multiple interfaces.
2361                    2. Moreover, we are allowed to send packets with saddr
2362                       of another iface. --ANK
2363                  */
2364
2365                 if (fl4->flowi4_oif == 0 &&
2366                     (ipv4_is_multicast(fl4->daddr) ||
2367                      ipv4_is_lbcast(fl4->daddr))) {
2368                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2369                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2370                         if (dev_out == NULL)
2371                                 goto out;
2372
2373                         /* Special hack: user can direct multicasts
2374                            and limited broadcast via necessary interface
2375                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2376                            This hack is not just for fun, it allows
2377                            vic,vat and friends to work.
2378                            They bind socket to loopback, set ttl to zero
2379                            and expect that it will work.
2380                            From the viewpoint of routing cache they are broken,
2381                            because we are not allowed to build multicast path
2382                            with loopback source addr (look, routing cache
2383                            cannot know, that ttl is zero, so that packet
2384                            will not leave this host and route is valid).
2385                            Luckily, this hack is good workaround.
2386                          */
2387
2388                         fl4->flowi4_oif = dev_out->ifindex;
2389                         goto make_route;
2390                 }
2391
2392                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2393                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2394                         if (!__ip_dev_find(net, fl4->saddr, false))
2395                                 goto out;
2396                 }
2397         }
2398
2399
2400         if (fl4->flowi4_oif) {
2401                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2402                 rth = ERR_PTR(-ENODEV);
2403                 if (dev_out == NULL)
2404                         goto out;
2405
2406                 /* RACE: Check return value of inet_select_addr instead. */
2407                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2408                         rth = ERR_PTR(-ENETUNREACH);
2409                         goto out;
2410                 }
2411                 if (ipv4_is_local_multicast(fl4->daddr) ||
2412                     ipv4_is_lbcast(fl4->daddr)) {
2413                         if (!fl4->saddr)
2414                                 fl4->saddr = inet_select_addr(dev_out, 0,
2415                                                               RT_SCOPE_LINK);
2416                         goto make_route;
2417                 }
2418                 if (fl4->saddr) {
2419                         if (ipv4_is_multicast(fl4->daddr))
2420                                 fl4->saddr = inet_select_addr(dev_out, 0,
2421                                                               fl4->flowi4_scope);
2422                         else if (!fl4->daddr)
2423                                 fl4->saddr = inet_select_addr(dev_out, 0,
2424                                                               RT_SCOPE_HOST);
2425                 }
2426         }
2427
2428         if (!fl4->daddr) {
2429                 fl4->daddr = fl4->saddr;
2430                 if (!fl4->daddr)
2431                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2432                 dev_out = net->loopback_dev;
2433                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2434                 res.type = RTN_LOCAL;
2435                 flags |= RTCF_LOCAL;
2436                 goto make_route;
2437         }
2438
2439         if (fib_lookup(net, fl4, &res)) {
2440                 res.fi = NULL;
2441                 res.table = NULL;
2442                 if (fl4->flowi4_oif) {
2443                         /* Apparently, routing tables are wrong. Assume,
2444                            that the destination is on link.
2445
2446                            WHY? DW.
2447                            Because we are allowed to send to iface
2448                            even if it has NO routes and NO assigned
2449                            addresses. When oif is specified, routing
2450                            tables are looked up with only one purpose:
2451                            to catch if destination is gatewayed, rather than
2452                            direct. Moreover, if MSG_DONTROUTE is set,
2453                            we send packet, ignoring both routing tables
2454                            and ifaddr state. --ANK
2455
2456
2457                            We could make it even if oif is unknown,
2458                            likely IPv6, but we do not.
2459                          */
2460
2461                         if (fl4->saddr == 0)
2462                                 fl4->saddr = inet_select_addr(dev_out, 0,
2463                                                               RT_SCOPE_LINK);
2464                         res.type = RTN_UNICAST;
2465                         goto make_route;
2466                 }
2467                 rth = ERR_PTR(-ENETUNREACH);
2468                 goto out;
2469         }
2470
2471         if (res.type == RTN_LOCAL) {
2472                 if (!fl4->saddr) {
2473                         if (res.fi->fib_prefsrc)
2474                                 fl4->saddr = res.fi->fib_prefsrc;
2475                         else
2476                                 fl4->saddr = fl4->daddr;
2477                 }
2478                 dev_out = net->loopback_dev;
2479                 fl4->flowi4_oif = dev_out->ifindex;
2480                 res.fi = NULL;
2481                 flags |= RTCF_LOCAL;
2482                 goto make_route;
2483         }
2484
2485 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2486         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2487                 fib_select_multipath(&res);
2488         else
2489 #endif
2490         if (!res.prefixlen &&
2491             res.table->tb_num_default > 1 &&
2492             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2493                 fib_select_default(&res);
2494
2495         if (!fl4->saddr)
2496                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2497
2498         dev_out = FIB_RES_DEV(res);
2499         fl4->flowi4_oif = dev_out->ifindex;
2500
2501
2502 make_route:
2503         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2504                                tos, dev_out, flags);
2505         if (!IS_ERR(rth)) {
2506                 unsigned int hash;
2507
2508                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2509                                rt_genid(dev_net(dev_out)));
2510                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2511         }
2512
2513 out:
2514         rcu_read_unlock();
2515         return rth;
2516 }
2517
2518 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2519 {
2520         struct rtable *rth;
2521         unsigned int hash;
2522
2523         if (!rt_caching(net))
2524                 goto slow_output;
2525
2526         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2527
2528         rcu_read_lock_bh();
2529         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2530                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2531                 if (rth->rt_key_dst == flp4->daddr &&
2532                     rth->rt_key_src == flp4->saddr &&
2533                     rt_is_output_route(rth) &&
2534                     rth->rt_oif == flp4->flowi4_oif &&
2535                     rth->rt_mark == flp4->flowi4_mark &&
2536                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2537                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2538                     net_eq(dev_net(rth->dst.dev), net) &&
2539                     !rt_is_expired(rth)) {
2540                         dst_use(&rth->dst, jiffies);
2541                         RT_CACHE_STAT_INC(out_hit);
2542                         rcu_read_unlock_bh();
2543                         if (!flp4->saddr)
2544                                 flp4->saddr = rth->rt_src;
2545                         if (!flp4->daddr)
2546                                 flp4->daddr = rth->rt_dst;
2547                         return rth;
2548                 }
2549                 RT_CACHE_STAT_INC(out_hlist_search);
2550         }
2551         rcu_read_unlock_bh();
2552
2553 slow_output:
2554         return ip_route_output_slow(net, flp4);
2555 }
2556 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2557
2558 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2559 {
2560         return NULL;
2561 }
2562
2563 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2564 {
2565         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2566
2567         return mtu ? : dst->dev->mtu;
2568 }
2569
2570 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2571 {
2572 }
2573
2574 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2575                                           unsigned long old)
2576 {
2577         return NULL;
2578 }
2579
2580 static struct dst_ops ipv4_dst_blackhole_ops = {
2581         .family                 =       AF_INET,
2582         .protocol               =       cpu_to_be16(ETH_P_IP),
2583         .destroy                =       ipv4_dst_destroy,
2584         .check                  =       ipv4_blackhole_dst_check,
2585         .mtu                    =       ipv4_blackhole_mtu,
2586         .default_advmss         =       ipv4_default_advmss,
2587         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2588         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2589         .neigh_lookup           =       ipv4_neigh_lookup,
2590 };
2591
2592 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2593 {
2594         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2595         struct rtable *ort = (struct rtable *) dst_orig;
2596
2597         if (rt) {
2598                 struct dst_entry *new = &rt->dst;
2599
2600                 new->__use = 1;
2601                 new->input = dst_discard;
2602                 new->output = dst_discard;
2603
2604                 new->dev = ort->dst.dev;
2605                 if (new->dev)
2606                         dev_hold(new->dev);
2607
2608                 rt->rt_key_dst = ort->rt_key_dst;
2609                 rt->rt_key_src = ort->rt_key_src;
2610                 rt->rt_key_tos = ort->rt_key_tos;
2611                 rt->rt_route_iif = ort->rt_route_iif;
2612                 rt->rt_iif = ort->rt_iif;
2613                 rt->rt_oif = ort->rt_oif;
2614                 rt->rt_mark = ort->rt_mark;
2615                 rt->rt_pmtu = ort->rt_pmtu;
2616
2617                 rt->rt_genid = rt_genid(net);
2618                 rt->rt_flags = ort->rt_flags;
2619                 rt->rt_type = ort->rt_type;
2620                 rt->rt_dst = ort->rt_dst;
2621                 rt->rt_src = ort->rt_src;
2622                 rt->rt_gateway = ort->rt_gateway;
2623                 rt->fi = ort->fi;
2624                 if (rt->fi)
2625                         atomic_inc(&rt->fi->fib_clntref);
2626
2627                 dst_free(new);
2628         }
2629
2630         dst_release(dst_orig);
2631
2632         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2633 }
2634
2635 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2636                                     struct sock *sk)
2637 {
2638         struct rtable *rt = __ip_route_output_key(net, flp4);
2639
2640         if (IS_ERR(rt))
2641                 return rt;
2642
2643         if (flp4->flowi4_proto)
2644                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2645                                                    flowi4_to_flowi(flp4),
2646                                                    sk, 0);
2647
2648         return rt;
2649 }
2650 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2651
2652 static int rt_fill_info(struct net *net,
2653                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2654                         int nowait, unsigned int flags)
2655 {
2656         struct rtable *rt = skb_rtable(skb);
2657         struct rtmsg *r;
2658         struct nlmsghdr *nlh;
2659         unsigned long expires = 0;
2660         u32 error;
2661
2662         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2663         if (nlh == NULL)
2664                 return -EMSGSIZE;
2665
2666         r = nlmsg_data(nlh);
2667         r->rtm_family    = AF_INET;
2668         r->rtm_dst_len  = 32;
2669         r->rtm_src_len  = 0;
2670         r->rtm_tos      = rt->rt_key_tos;
2671         r->rtm_table    = RT_TABLE_MAIN;
2672         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2673                 goto nla_put_failure;
2674         r->rtm_type     = rt->rt_type;
2675         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2676         r->rtm_protocol = RTPROT_UNSPEC;
2677         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2678         if (rt->rt_flags & RTCF_NOTIFY)
2679                 r->rtm_flags |= RTM_F_NOTIFY;
2680
2681         if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
2682                 goto nla_put_failure;
2683         if (rt->rt_key_src) {
2684                 r->rtm_src_len = 32;
2685                 if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
2686                         goto nla_put_failure;
2687         }
2688         if (rt->dst.dev &&
2689             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2690                 goto nla_put_failure;
2691 #ifdef CONFIG_IP_ROUTE_CLASSID
2692         if (rt->dst.tclassid &&
2693             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2694                 goto nla_put_failure;
2695 #endif
2696         if (!rt_is_input_route(rt) &&
2697             rt->rt_src != rt->rt_key_src) {
2698                 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
2699                         goto nla_put_failure;
2700         }
2701         if (rt->rt_dst != rt->rt_gateway &&
2702             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2703                 goto nla_put_failure;
2704
2705         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2706                 goto nla_put_failure;
2707
2708         if (rt->rt_mark &&
2709             nla_put_be32(skb, RTA_MARK, rt->rt_mark))
2710                 goto nla_put_failure;
2711
2712         error = rt->dst.error;
2713         expires = rt->dst.expires;
2714         if (expires) {
2715                 if (time_before(jiffies, expires))
2716                         expires -= jiffies;
2717                 else
2718                         expires = 0;
2719         }
2720
2721         if (rt_is_input_route(rt)) {
2722 #ifdef CONFIG_IP_MROUTE
2723                 __be32 dst = rt->rt_dst;
2724
2725                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2726                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2727                         int err = ipmr_get_route(net, skb,
2728                                                  rt->rt_src, rt->rt_dst,
2729                                                  r, nowait);
2730                         if (err <= 0) {
2731                                 if (!nowait) {
2732                                         if (err == 0)
2733                                                 return 0;
2734                                         goto nla_put_failure;
2735                                 } else {
2736                                         if (err == -EMSGSIZE)
2737                                                 goto nla_put_failure;
2738                                         error = err;
2739                                 }
2740                         }
2741                 } else
2742 #endif
2743                         if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2744                                 goto nla_put_failure;
2745         }
2746
2747         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2748                 goto nla_put_failure;
2749
2750         return nlmsg_end(skb, nlh);
2751
2752 nla_put_failure:
2753         nlmsg_cancel(skb, nlh);
2754         return -EMSGSIZE;
2755 }
2756
2757 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2758 {
2759         struct net *net = sock_net(in_skb->sk);
2760         struct rtmsg *rtm;
2761         struct nlattr *tb[RTA_MAX+1];
2762         struct rtable *rt = NULL;
2763         __be32 dst = 0;
2764         __be32 src = 0;
2765         u32 iif;
2766         int err;
2767         int mark;
2768         struct sk_buff *skb;
2769
2770         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2771         if (err < 0)
2772                 goto errout;
2773
2774         rtm = nlmsg_data(nlh);
2775
2776         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2777         if (skb == NULL) {
2778                 err = -ENOBUFS;
2779                 goto errout;
2780         }
2781
2782         /* Reserve room for dummy headers, this skb can pass
2783            through good chunk of routing engine.
2784          */
2785         skb_reset_mac_header(skb);
2786         skb_reset_network_header(skb);
2787
2788         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2789         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2790         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2791
2792         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2793         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2794         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2795         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2796
2797         if (iif) {
2798                 struct net_device *dev;
2799
2800                 dev = __dev_get_by_index(net, iif);
2801                 if (dev == NULL) {
2802                         err = -ENODEV;
2803                         goto errout_free;
2804                 }
2805
2806                 skb->protocol   = htons(ETH_P_IP);
2807                 skb->dev        = dev;
2808                 skb->mark       = mark;
2809                 local_bh_disable();
2810                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2811                 local_bh_enable();
2812
2813                 rt = skb_rtable(skb);
2814                 if (err == 0 && rt->dst.error)
2815                         err = -rt->dst.error;
2816         } else {
2817                 struct flowi4 fl4 = {
2818                         .daddr = dst,
2819                         .saddr = src,
2820                         .flowi4_tos = rtm->rtm_tos,
2821                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2822                         .flowi4_mark = mark,
2823                 };
2824                 rt = ip_route_output_key(net, &fl4);
2825
2826                 err = 0;
2827                 if (IS_ERR(rt))
2828                         err = PTR_ERR(rt);
2829         }
2830
2831         if (err)
2832                 goto errout_free;
2833
2834         skb_dst_set(skb, &rt->dst);
2835         if (rtm->rtm_flags & RTM_F_NOTIFY)
2836                 rt->rt_flags |= RTCF_NOTIFY;
2837
2838         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2839                            RTM_NEWROUTE, 0, 0);
2840         if (err <= 0)
2841                 goto errout_free;
2842
2843         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2844 errout:
2845         return err;
2846
2847 errout_free:
2848         kfree_skb(skb);
2849         goto errout;
2850 }
2851
2852 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2853 {
2854         struct rtable *rt;
2855         int h, s_h;
2856         int idx, s_idx;
2857         struct net *net;
2858
2859         net = sock_net(skb->sk);
2860
2861         s_h = cb->args[0];
2862         if (s_h < 0)
2863                 s_h = 0;
2864         s_idx = idx = cb->args[1];
2865         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
2866                 if (!rt_hash_table[h].chain)
2867                         continue;
2868                 rcu_read_lock_bh();
2869                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
2870                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
2871                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
2872                                 continue;
2873                         if (rt_is_expired(rt))
2874                                 continue;
2875                         skb_dst_set_noref(skb, &rt->dst);
2876                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
2877                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2878                                          1, NLM_F_MULTI) <= 0) {
2879                                 skb_dst_drop(skb);
2880                                 rcu_read_unlock_bh();
2881                                 goto done;
2882                         }
2883                         skb_dst_drop(skb);
2884                 }
2885                 rcu_read_unlock_bh();
2886         }
2887
2888 done:
2889         cb->args[0] = h;
2890         cb->args[1] = idx;
2891         return skb->len;
2892 }
2893
2894 void ip_rt_multicast_event(struct in_device *in_dev)
2895 {
2896         rt_cache_flush(dev_net(in_dev->dev), 0);
2897 }
2898
2899 #ifdef CONFIG_SYSCTL
2900 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2901                                         void __user *buffer,
2902                                         size_t *lenp, loff_t *ppos)
2903 {
2904         if (write) {
2905                 int flush_delay;
2906                 ctl_table ctl;
2907                 struct net *net;
2908
2909                 memcpy(&ctl, __ctl, sizeof(ctl));
2910                 ctl.data = &flush_delay;
2911                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
2912
2913                 net = (struct net *)__ctl->extra1;
2914                 rt_cache_flush(net, flush_delay);
2915                 return 0;
2916         }
2917
2918         return -EINVAL;
2919 }
2920
2921 static ctl_table ipv4_route_table[] = {
2922         {
2923                 .procname       = "gc_thresh",
2924                 .data           = &ipv4_dst_ops.gc_thresh,
2925                 .maxlen         = sizeof(int),
2926                 .mode           = 0644,
2927                 .proc_handler   = proc_dointvec,
2928         },
2929         {
2930                 .procname       = "max_size",
2931                 .data           = &ip_rt_max_size,
2932                 .maxlen         = sizeof(int),
2933                 .mode           = 0644,
2934                 .proc_handler   = proc_dointvec,
2935         },
2936         {
2937                 /*  Deprecated. Use gc_min_interval_ms */
2938
2939                 .procname       = "gc_min_interval",
2940                 .data           = &ip_rt_gc_min_interval,
2941                 .maxlen         = sizeof(int),
2942                 .mode           = 0644,
2943                 .proc_handler   = proc_dointvec_jiffies,
2944         },
2945         {
2946                 .procname       = "gc_min_interval_ms",
2947                 .data           = &ip_rt_gc_min_interval,
2948                 .maxlen         = sizeof(int),
2949                 .mode           = 0644,
2950                 .proc_handler   = proc_dointvec_ms_jiffies,
2951         },
2952         {
2953                 .procname       = "gc_timeout",
2954                 .data           = &ip_rt_gc_timeout,
2955                 .maxlen         = sizeof(int),
2956                 .mode           = 0644,
2957                 .proc_handler   = proc_dointvec_jiffies,
2958         },
2959         {
2960                 .procname       = "gc_interval",
2961                 .data           = &ip_rt_gc_interval,
2962                 .maxlen         = sizeof(int),
2963                 .mode           = 0644,
2964                 .proc_handler   = proc_dointvec_jiffies,
2965         },
2966         {
2967                 .procname       = "redirect_load",
2968                 .data           = &ip_rt_redirect_load,
2969                 .maxlen         = sizeof(int),
2970                 .mode           = 0644,
2971                 .proc_handler   = proc_dointvec,
2972         },
2973         {
2974                 .procname       = "redirect_number",
2975                 .data           = &ip_rt_redirect_number,
2976                 .maxlen         = sizeof(int),
2977                 .mode           = 0644,
2978                 .proc_handler   = proc_dointvec,
2979         },
2980         {
2981                 .procname       = "redirect_silence",
2982                 .data           = &ip_rt_redirect_silence,
2983                 .maxlen         = sizeof(int),
2984                 .mode           = 0644,
2985                 .proc_handler   = proc_dointvec,
2986         },
2987         {
2988                 .procname       = "error_cost",
2989                 .data           = &ip_rt_error_cost,
2990                 .maxlen         = sizeof(int),
2991                 .mode           = 0644,
2992                 .proc_handler   = proc_dointvec,
2993         },
2994         {
2995                 .procname       = "error_burst",
2996                 .data           = &ip_rt_error_burst,
2997                 .maxlen         = sizeof(int),
2998                 .mode           = 0644,
2999                 .proc_handler   = proc_dointvec,
3000         },
3001         {
3002                 .procname       = "gc_elasticity",
3003                 .data           = &ip_rt_gc_elasticity,
3004                 .maxlen         = sizeof(int),
3005                 .mode           = 0644,
3006                 .proc_handler   = proc_dointvec,
3007         },
3008         {
3009                 .procname       = "mtu_expires",
3010                 .data           = &ip_rt_mtu_expires,
3011                 .maxlen         = sizeof(int),
3012                 .mode           = 0644,
3013                 .proc_handler   = proc_dointvec_jiffies,
3014         },
3015         {
3016                 .procname       = "min_pmtu",
3017                 .data           = &ip_rt_min_pmtu,
3018                 .maxlen         = sizeof(int),
3019                 .mode           = 0644,
3020                 .proc_handler   = proc_dointvec,
3021         },
3022         {
3023                 .procname       = "min_adv_mss",
3024                 .data           = &ip_rt_min_advmss,
3025                 .maxlen         = sizeof(int),
3026                 .mode           = 0644,
3027                 .proc_handler   = proc_dointvec,
3028         },
3029         { }
3030 };
3031
3032 static struct ctl_table ipv4_route_flush_table[] = {
3033         {
3034                 .procname       = "flush",
3035                 .maxlen         = sizeof(int),
3036                 .mode           = 0200,
3037                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3038         },
3039         { },
3040 };
3041
3042 static __net_init int sysctl_route_net_init(struct net *net)
3043 {
3044         struct ctl_table *tbl;
3045
3046         tbl = ipv4_route_flush_table;
3047         if (!net_eq(net, &init_net)) {
3048                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3049                 if (tbl == NULL)
3050                         goto err_dup;
3051         }
3052         tbl[0].extra1 = net;
3053
3054         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3055         if (net->ipv4.route_hdr == NULL)
3056                 goto err_reg;
3057         return 0;
3058
3059 err_reg:
3060         if (tbl != ipv4_route_flush_table)
3061                 kfree(tbl);
3062 err_dup:
3063         return -ENOMEM;
3064 }
3065
3066 static __net_exit void sysctl_route_net_exit(struct net *net)
3067 {
3068         struct ctl_table *tbl;
3069
3070         tbl = net->ipv4.route_hdr->ctl_table_arg;
3071         unregister_net_sysctl_table(net->ipv4.route_hdr);
3072         BUG_ON(tbl == ipv4_route_flush_table);
3073         kfree(tbl);
3074 }
3075
3076 static __net_initdata struct pernet_operations sysctl_route_ops = {
3077         .init = sysctl_route_net_init,
3078         .exit = sysctl_route_net_exit,
3079 };
3080 #endif
3081
3082 static __net_init int rt_genid_init(struct net *net)
3083 {
3084         get_random_bytes(&net->ipv4.rt_genid,
3085                          sizeof(net->ipv4.rt_genid));
3086         get_random_bytes(&net->ipv4.dev_addr_genid,
3087                          sizeof(net->ipv4.dev_addr_genid));
3088         return 0;
3089 }
3090
3091 static __net_initdata struct pernet_operations rt_genid_ops = {
3092         .init = rt_genid_init,
3093 };
3094
3095 static int __net_init ipv4_inetpeer_init(struct net *net)
3096 {
3097         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3098
3099         if (!bp)
3100                 return -ENOMEM;
3101         inet_peer_base_init(bp);
3102         net->ipv4.peers = bp;
3103         return 0;
3104 }
3105
3106 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3107 {
3108         struct inet_peer_base *bp = net->ipv4.peers;
3109
3110         net->ipv4.peers = NULL;
3111         inetpeer_invalidate_tree(bp);
3112         kfree(bp);
3113 }
3114
3115 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3116         .init   =       ipv4_inetpeer_init,
3117         .exit   =       ipv4_inetpeer_exit,
3118 };
3119
3120 #ifdef CONFIG_IP_ROUTE_CLASSID
3121 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3122 #endif /* CONFIG_IP_ROUTE_CLASSID */
3123
3124 static __initdata unsigned long rhash_entries;
3125 static int __init set_rhash_entries(char *str)
3126 {
3127         ssize_t ret;
3128
3129         if (!str)
3130                 return 0;
3131
3132         ret = kstrtoul(str, 0, &rhash_entries);
3133         if (ret)
3134                 return 0;
3135
3136         return 1;
3137 }
3138 __setup("rhash_entries=", set_rhash_entries);
3139
3140 int __init ip_rt_init(void)
3141 {
3142         int rc = 0;
3143
3144 #ifdef CONFIG_IP_ROUTE_CLASSID
3145         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3146         if (!ip_rt_acct)
3147                 panic("IP: failed to allocate ip_rt_acct\n");
3148 #endif
3149
3150         ipv4_dst_ops.kmem_cachep =
3151                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3152                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3153
3154         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3155
3156         if (dst_entries_init(&ipv4_dst_ops) < 0)
3157                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3158
3159         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3160                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3161
3162         rt_hash_table = (struct rt_hash_bucket *)
3163                 alloc_large_system_hash("IP route cache",
3164                                         sizeof(struct rt_hash_bucket),
3165                                         rhash_entries,
3166                                         (totalram_pages >= 128 * 1024) ?
3167                                         15 : 17,
3168                                         0,
3169                                         &rt_hash_log,
3170                                         &rt_hash_mask,
3171                                         0,
3172                                         rhash_entries ? 0 : 512 * 1024);
3173         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3174         rt_hash_lock_init();
3175
3176         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3177         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3178
3179         devinet_init();
3180         ip_fib_init();
3181
3182         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3183         expires_ljiffies = jiffies;
3184         schedule_delayed_work(&expires_work,
3185                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3186
3187         if (ip_rt_proc_init())
3188                 pr_err("Unable to create route proc files\n");
3189 #ifdef CONFIG_XFRM
3190         xfrm_init();
3191         xfrm4_init(ip_rt_max_size);
3192 #endif
3193         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3194
3195 #ifdef CONFIG_SYSCTL
3196         register_pernet_subsys(&sysctl_route_ops);
3197 #endif
3198         register_pernet_subsys(&rt_genid_ops);
3199         register_pernet_subsys(&ipv4_inetpeer_ops);
3200         return rc;
3201 }
3202
3203 #ifdef CONFIG_SYSCTL
3204 /*
3205  * We really need to sanitize the damn ipv4 init order, then all
3206  * this nonsense will go away.
3207  */
3208 void __init ip_static_sysctl_init(void)
3209 {
3210         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3211 }
3212 #endif