1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/bootmem.h>
74 #include <linux/string.h>
75 #include <linux/socket.h>
76 #include <linux/sockios.h>
77 #include <linux/errno.h>
78 #include <linux/in.h>
79 #include <linux/inet.h>
80 #include <linux/netdevice.h>
81 #include <linux/proc_fs.h>
82 #include <linux/init.h>
83 #include <linux/workqueue.h>
84 #include <linux/skbuff.h>
85 #include <linux/inetdevice.h>
86 #include <linux/igmp.h>
87 #include <linux/pkt_sched.h>
88 #include <linux/mroute.h>
89 #include <linux/netfilter_ipv4.h>
90 #include <linux/random.h>
91 #include <linux/jhash.h>
92 #include <linux/rcupdate.h>
93 #include <linux/times.h>
94 #include <linux/slab.h>
95 #include <linux/prefetch.h>
96 #include <net/dst.h>
97 #include <net/net_namespace.h>
98 #include <net/protocol.h>
99 #include <net/ip.h>
100 #include <net/route.h>
101 #include <net/inetpeer.h>
102 #include <net/sock.h>
103 #include <net/ip_fib.h>
104 #include <net/arp.h>
105 #include <net/tcp.h>
106 #include <net/icmp.h>
107 #include <net/xfrm.h>
108 #include <net/netevent.h>
109 #include <net/rtnetlink.h>
110 #ifdef CONFIG_SYSCTL
111 #include <linux/sysctl.h>
112 #include <linux/kmemleak.h>
113 #endif
114 #include <net/secure_seq.h>
115
116 #define RT_FL_TOS(oldflp4) \
117         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
118
119 #define IP_MAX_MTU      0xFFF0
120
121 #define RT_GC_TIMEOUT (300*HZ)
122
123 static int ip_rt_max_size;
124 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
125 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
126 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
127 static int ip_rt_redirect_number __read_mostly  = 9;
128 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
129 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
130 static int ip_rt_error_cost __read_mostly       = HZ;
131 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
132 static int ip_rt_gc_elasticity __read_mostly    = 8;
133 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
134 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
135 static int ip_rt_min_advmss __read_mostly       = 256;
136 static int rt_chain_length_max __read_mostly    = 20;
137
138 static struct delayed_work expires_work;
139 static unsigned long expires_ljiffies;
140
141 /*
142  *      Interface to generic destination cache.
143  */
144
145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
146 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
147 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
148 static void              ipv4_dst_destroy(struct dst_entry *dst);
149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150 static void              ipv4_link_failure(struct sk_buff *skb);
151 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
152                                            struct sk_buff *skb, u32 mtu);
153 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
154                                         struct sk_buff *skb);
155 static int rt_garbage_collect(struct dst_ops *ops);
156
157 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
158                             int how)
159 {
160 }
161
162 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
163 {
164         WARN_ON(1);
165         return NULL;
166 }
167
168 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
169                                            struct sk_buff *skb,
170                                            const void *daddr);
171
172 static struct dst_ops ipv4_dst_ops = {
173         .family =               AF_INET,
174         .protocol =             cpu_to_be16(ETH_P_IP),
175         .gc =                   rt_garbage_collect,
176         .check =                ipv4_dst_check,
177         .default_advmss =       ipv4_default_advmss,
178         .mtu =                  ipv4_mtu,
179         .cow_metrics =          ipv4_cow_metrics,
180         .destroy =              ipv4_dst_destroy,
181         .ifdown =               ipv4_dst_ifdown,
182         .negative_advice =      ipv4_negative_advice,
183         .link_failure =         ipv4_link_failure,
184         .update_pmtu =          ip_rt_update_pmtu,
185         .redirect =             ip_do_redirect,
186         .local_out =            __ip_local_out,
187         .neigh_lookup =         ipv4_neigh_lookup,
188 };
189
190 #define ECN_OR_COST(class)      TC_PRIO_##class
191
192 const __u8 ip_tos2prio[16] = {
193         TC_PRIO_BESTEFFORT,
194         ECN_OR_COST(BESTEFFORT),
195         TC_PRIO_BESTEFFORT,
196         ECN_OR_COST(BESTEFFORT),
197         TC_PRIO_BULK,
198         ECN_OR_COST(BULK),
199         TC_PRIO_BULK,
200         ECN_OR_COST(BULK),
201         TC_PRIO_INTERACTIVE,
202         ECN_OR_COST(INTERACTIVE),
203         TC_PRIO_INTERACTIVE,
204         ECN_OR_COST(INTERACTIVE),
205         TC_PRIO_INTERACTIVE_BULK,
206         ECN_OR_COST(INTERACTIVE_BULK),
207         TC_PRIO_INTERACTIVE_BULK,
208         ECN_OR_COST(INTERACTIVE_BULK)
209 };
210 EXPORT_SYMBOL(ip_tos2prio);
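/*
 * Illustrative note (added for clarity; the reference to another header is an
 * assumption about this tree): rt_tos2priority() in include/net/route.h
 * indexes this table with IPTOS_TOS(tos) >> 1.  For example, a TOS byte of
 * IPTOS_LOWDELAY (0x10) selects ip_tos2prio[8], i.e. TC_PRIO_INTERACTIVE,
 * while the default TOS of 0x00 selects ip_tos2prio[0], i.e.
 * TC_PRIO_BESTEFFORT.
 */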
211
212 /*
213  * Route cache.
214  */
215
216 /* The locking scheme is rather straightforward:
217  *
218  * 1) Read-Copy Update protects the buckets of the central route hash.
219  * 2) Only writers remove entries, and they hold the lock
220  *    as they look at rtable reference counts.
221  * 3) Only readers acquire references to rtable entries,
222  *    they do so with atomic increments and with the
223  *    lock held.
224  */
225
226 struct rt_hash_bucket {
227         struct rtable __rcu     *chain;
228 };
229
230 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
231         defined(CONFIG_PROVE_LOCKING)
232 /*
233  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
234  * The size of this table is a power of two and depends on the number of CPUs.
235  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
236  */
237 #ifdef CONFIG_LOCKDEP
238 # define RT_HASH_LOCK_SZ        256
239 #else
240 # if NR_CPUS >= 32
241 #  define RT_HASH_LOCK_SZ       4096
242 # elif NR_CPUS >= 16
243 #  define RT_HASH_LOCK_SZ       2048
244 # elif NR_CPUS >= 8
245 #  define RT_HASH_LOCK_SZ       1024
246 # elif NR_CPUS >= 4
247 #  define RT_HASH_LOCK_SZ       512
248 # else
249 #  define RT_HASH_LOCK_SZ       256
250 # endif
251 #endif
252
253 static spinlock_t       *rt_hash_locks;
254 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
255
256 static __init void rt_hash_lock_init(void)
257 {
258         int i;
259
260         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
261                         GFP_KERNEL);
262         if (!rt_hash_locks)
263                 panic("IP: failed to allocate rt_hash_locks\n");
264
265         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
266                 spin_lock_init(&rt_hash_locks[i]);
267 }
268 #else
269 # define rt_hash_lock_addr(slot) NULL
270
271 static inline void rt_hash_lock_init(void)
272 {
273 }
274 #endif
275
276 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
277 static unsigned int             rt_hash_mask __read_mostly;
278 static unsigned int             rt_hash_log  __read_mostly;
279
280 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
281 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
282
283 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
284                                    int genid)
285 {
286         return jhash_3words((__force u32)daddr, (__force u32)saddr,
287                             idx, genid)
288                 & rt_hash_mask;
289 }
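/*
 * Illustrative note (added for clarity): the cache key mixes destination,
 * source, an interface index (the input interface for received packets, the
 * output interface for locally generated ones, as used by the callers further
 * down in this file) and the per-namespace rt_genid.  Because the genid is
 * both a hash input and stored in each entry, bumping it in
 * rt_cache_invalidate() effectively orphans every existing entry: new lookups
 * generally hash elsewhere, and rt_is_expired() rejects any stale entry that
 * is still found.
 */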
290
291 static inline int rt_genid(struct net *net)
292 {
293         return atomic_read(&net->ipv4.rt_genid);
294 }
295
296 #ifdef CONFIG_PROC_FS
297 struct rt_cache_iter_state {
298         struct seq_net_private p;
299         int bucket;
300         int genid;
301 };
302
303 static struct rtable *rt_cache_get_first(struct seq_file *seq)
304 {
305         struct rt_cache_iter_state *st = seq->private;
306         struct rtable *r = NULL;
307
308         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
309                 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
310                         continue;
311                 rcu_read_lock_bh();
312                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
313                 while (r) {
314                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
315                             r->rt_genid == st->genid)
316                                 return r;
317                         r = rcu_dereference_bh(r->dst.rt_next);
318                 }
319                 rcu_read_unlock_bh();
320         }
321         return r;
322 }
323
324 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
325                                           struct rtable *r)
326 {
327         struct rt_cache_iter_state *st = seq->private;
328
329         r = rcu_dereference_bh(r->dst.rt_next);
330         while (!r) {
331                 rcu_read_unlock_bh();
332                 do {
333                         if (--st->bucket < 0)
334                                 return NULL;
335                 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
336                 rcu_read_lock_bh();
337                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
338         }
339         return r;
340 }
341
342 static struct rtable *rt_cache_get_next(struct seq_file *seq,
343                                         struct rtable *r)
344 {
345         struct rt_cache_iter_state *st = seq->private;
346         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
347                 if (dev_net(r->dst.dev) != seq_file_net(seq))
348                         continue;
349                 if (r->rt_genid == st->genid)
350                         break;
351         }
352         return r;
353 }
354
355 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
356 {
357         struct rtable *r = rt_cache_get_first(seq);
358
359         if (r)
360                 while (pos && (r = rt_cache_get_next(seq, r)))
361                         --pos;
362         return pos ? NULL : r;
363 }
364
365 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
366 {
367         struct rt_cache_iter_state *st = seq->private;
368         if (*pos)
369                 return rt_cache_get_idx(seq, *pos - 1);
370         st->genid = rt_genid(seq_file_net(seq));
371         return SEQ_START_TOKEN;
372 }
373
374 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
375 {
376         struct rtable *r;
377
378         if (v == SEQ_START_TOKEN)
379                 r = rt_cache_get_first(seq);
380         else
381                 r = rt_cache_get_next(seq, v);
382         ++*pos;
383         return r;
384 }
385
386 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
387 {
388         if (v && v != SEQ_START_TOKEN)
389                 rcu_read_unlock_bh();
390 }
391
392 static int rt_cache_seq_show(struct seq_file *seq, void *v)
393 {
394         if (v == SEQ_START_TOKEN)
395                 seq_printf(seq, "%-127s\n",
396                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
397                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
398                            "HHUptod\tSpecDst");
399         else {
400                 struct rtable *r = v;
401                 int len;
402
403                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
404                            "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
405                            r->dst.dev ? r->dst.dev->name : "*",
406                            (__force u32)r->rt_dst,
407                            (__force u32)r->rt_gateway,
408                            r->rt_flags, atomic_read(&r->dst.__refcnt),
409                            r->dst.__use, 0, (__force u32)r->rt_src,
410                            dst_metric_advmss(&r->dst) + 40,
411                            dst_metric(&r->dst, RTAX_WINDOW), 0,
412                            r->rt_key_tos,
413                            -1, 0, 0, &len);
414
415                 seq_printf(seq, "%*s\n", 127 - len, "");
416         }
417         return 0;
418 }
419
420 static const struct seq_operations rt_cache_seq_ops = {
421         .start  = rt_cache_seq_start,
422         .next   = rt_cache_seq_next,
423         .stop   = rt_cache_seq_stop,
424         .show   = rt_cache_seq_show,
425 };
426
427 static int rt_cache_seq_open(struct inode *inode, struct file *file)
428 {
429         return seq_open_net(inode, file, &rt_cache_seq_ops,
430                         sizeof(struct rt_cache_iter_state));
431 }
432
433 static const struct file_operations rt_cache_seq_fops = {
434         .owner   = THIS_MODULE,
435         .open    = rt_cache_seq_open,
436         .read    = seq_read,
437         .llseek  = seq_lseek,
438         .release = seq_release_net,
439 };
440
441
442 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
443 {
444         int cpu;
445
446         if (*pos == 0)
447                 return SEQ_START_TOKEN;
448
449         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
450                 if (!cpu_possible(cpu))
451                         continue;
452                 *pos = cpu+1;
453                 return &per_cpu(rt_cache_stat, cpu);
454         }
455         return NULL;
456 }
457
458 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
459 {
460         int cpu;
461
462         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
463                 if (!cpu_possible(cpu))
464                         continue;
465                 *pos = cpu+1;
466                 return &per_cpu(rt_cache_stat, cpu);
467         }
468         return NULL;
469
470 }
471
472 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
473 {
474
475 }
476
477 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
478 {
479         struct rt_cache_stat *st = v;
480
481         if (v == SEQ_START_TOKEN) {
482                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
483                 return 0;
484         }
485
486         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
487                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
488                    dst_entries_get_slow(&ipv4_dst_ops),
489                    st->in_hit,
490                    st->in_slow_tot,
491                    st->in_slow_mc,
492                    st->in_no_route,
493                    st->in_brd,
494                    st->in_martian_dst,
495                    st->in_martian_src,
496
497                    st->out_hit,
498                    st->out_slow_tot,
499                    st->out_slow_mc,
500
501                    st->gc_total,
502                    st->gc_ignored,
503                    st->gc_goal_miss,
504                    st->gc_dst_overflow,
505                    st->in_hlist_search,
506                    st->out_hlist_search
507                 );
508         return 0;
509 }
510
511 static const struct seq_operations rt_cpu_seq_ops = {
512         .start  = rt_cpu_seq_start,
513         .next   = rt_cpu_seq_next,
514         .stop   = rt_cpu_seq_stop,
515         .show   = rt_cpu_seq_show,
516 };
517
518
519 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
520 {
521         return seq_open(file, &rt_cpu_seq_ops);
522 }
523
524 static const struct file_operations rt_cpu_seq_fops = {
525         .owner   = THIS_MODULE,
526         .open    = rt_cpu_seq_open,
527         .read    = seq_read,
528         .llseek  = seq_lseek,
529         .release = seq_release,
530 };
531
532 #ifdef CONFIG_IP_ROUTE_CLASSID
533 static int rt_acct_proc_show(struct seq_file *m, void *v)
534 {
535         struct ip_rt_acct *dst, *src;
536         unsigned int i, j;
537
538         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
539         if (!dst)
540                 return -ENOMEM;
541
542         for_each_possible_cpu(i) {
543                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
544                 for (j = 0; j < 256; j++) {
545                         dst[j].o_bytes   += src[j].o_bytes;
546                         dst[j].o_packets += src[j].o_packets;
547                         dst[j].i_bytes   += src[j].i_bytes;
548                         dst[j].i_packets += src[j].i_packets;
549                 }
550         }
551
552         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
553         kfree(dst);
554         return 0;
555 }
556
557 static int rt_acct_proc_open(struct inode *inode, struct file *file)
558 {
559         return single_open(file, rt_acct_proc_show, NULL);
560 }
561
562 static const struct file_operations rt_acct_proc_fops = {
563         .owner          = THIS_MODULE,
564         .open           = rt_acct_proc_open,
565         .read           = seq_read,
566         .llseek         = seq_lseek,
567         .release        = single_release,
568 };
569 #endif
570
571 static int __net_init ip_rt_do_proc_init(struct net *net)
572 {
573         struct proc_dir_entry *pde;
574
575         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
576                         &rt_cache_seq_fops);
577         if (!pde)
578                 goto err1;
579
580         pde = proc_create("rt_cache", S_IRUGO,
581                           net->proc_net_stat, &rt_cpu_seq_fops);
582         if (!pde)
583                 goto err2;
584
585 #ifdef CONFIG_IP_ROUTE_CLASSID
586         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
587         if (!pde)
588                 goto err3;
589 #endif
590         return 0;
591
592 #ifdef CONFIG_IP_ROUTE_CLASSID
593 err3:
594         remove_proc_entry("rt_cache", net->proc_net_stat);
595 #endif
596 err2:
597         remove_proc_entry("rt_cache", net->proc_net);
598 err1:
599         return -ENOMEM;
600 }
601
602 static void __net_exit ip_rt_do_proc_exit(struct net *net)
603 {
604         remove_proc_entry("rt_cache", net->proc_net_stat);
605         remove_proc_entry("rt_cache", net->proc_net);
606 #ifdef CONFIG_IP_ROUTE_CLASSID
607         remove_proc_entry("rt_acct", net->proc_net);
608 #endif
609 }
610
611 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
612         .init = ip_rt_do_proc_init,
613         .exit = ip_rt_do_proc_exit,
614 };
615
616 static int __init ip_rt_proc_init(void)
617 {
618         return register_pernet_subsys(&ip_rt_proc_ops);
619 }
620
621 #else
622 static inline int ip_rt_proc_init(void)
623 {
624         return 0;
625 }
626 #endif /* CONFIG_PROC_FS */
627
628 static inline void rt_free(struct rtable *rt)
629 {
630         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
631 }
632
633 static inline void rt_drop(struct rtable *rt)
634 {
635         ip_rt_put(rt);
636         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
637 }
638
639 static inline int rt_fast_clean(struct rtable *rth)
640 {
641         /* Kill broadcast/multicast entries very aggressively, if they
642            collide in the hash table with more useful entries */
643         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
644                 rt_is_input_route(rth) && rth->dst.rt_next;
645 }
646
647 static inline int rt_valuable(struct rtable *rth)
648 {
649         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
650                 rth->dst.expires;
651 }
652
653 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
654 {
655         unsigned long age;
656         int ret = 0;
657
658         if (atomic_read(&rth->dst.__refcnt))
659                 goto out;
660
661         age = jiffies - rth->dst.lastuse;
662         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
663             (age <= tmo2 && rt_valuable(rth)))
664                 goto out;
665         ret = 1;
666 out:    return ret;
667 }
668
669 /* Bits of score are:
670  * 31: very valuable
671  * 30: not quite useless
672  * 29..0: usage counter
673  */
674 static inline u32 rt_score(struct rtable *rt)
675 {
676         u32 score = jiffies - rt->dst.lastuse;
677
678         score = ~score & ~(3<<30);
679
680         if (rt_valuable(rt))
681                 score |= (1<<31);
682
683         if (rt_is_output_route(rt) ||
684             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
685                 score |= (1<<30);
686
687         return score;
688 }
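/*
 * Worked example (illustrative, added for clarity): for an unreferenced
 * entry last used one second ago, the low 30 bits are roughly the bitwise
 * complement of HZ, so more recently used entries score higher; bit 30 is
 * set for output routes and for input routes that are neither broadcast,
 * multicast nor local; bit 31 is set only for "valuable" entries
 * (redirected, notify or expiring).  rt_intern_hash() below uses the entry
 * with the lowest score as the eviction candidate when a chain grows too
 * long.
 */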
689
690 static inline bool rt_caching(const struct net *net)
691 {
692         return net->ipv4.current_rt_cache_rebuild_count <=
693                 net->ipv4.sysctl_rt_cache_rebuild_count;
694 }
695
696 static inline bool compare_hash_inputs(const struct rtable *rt1,
697                                        const struct rtable *rt2)
698 {
699         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
700                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
701                 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
702 }
703
704 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
705 {
706         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
707                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
708                 (rt1->rt_mark ^ rt2->rt_mark) |
709                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
710                 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
711                 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
712 }
713
714 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
715 {
716         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
717 }
718
719 static inline int rt_is_expired(struct rtable *rth)
720 {
721         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
722 }
723
724 /*
725  * Perform a full scan of the hash table and free all entries.
726  * Can be called from a softirq or from process context.
727  * In the latter case, we want to reschedule if necessary.
728  */
729 static void rt_do_flush(struct net *net, int process_context)
730 {
731         unsigned int i;
732         struct rtable *rth, *next;
733
734         for (i = 0; i <= rt_hash_mask; i++) {
735                 struct rtable __rcu **pprev;
736                 struct rtable *list;
737
738                 if (process_context && need_resched())
739                         cond_resched();
740                 rth = rcu_access_pointer(rt_hash_table[i].chain);
741                 if (!rth)
742                         continue;
743
744                 spin_lock_bh(rt_hash_lock_addr(i));
745
746                 list = NULL;
747                 pprev = &rt_hash_table[i].chain;
748                 rth = rcu_dereference_protected(*pprev,
749                         lockdep_is_held(rt_hash_lock_addr(i)));
750
751                 while (rth) {
752                         next = rcu_dereference_protected(rth->dst.rt_next,
753                                 lockdep_is_held(rt_hash_lock_addr(i)));
754
755                         if (!net ||
756                             net_eq(dev_net(rth->dst.dev), net)) {
757                                 rcu_assign_pointer(*pprev, next);
758                                 rcu_assign_pointer(rth->dst.rt_next, list);
759                                 list = rth;
760                         } else {
761                                 pprev = &rth->dst.rt_next;
762                         }
763                         rth = next;
764                 }
765
766                 spin_unlock_bh(rt_hash_lock_addr(i));
767
768                 for (; list; list = next) {
769                         next = rcu_dereference_protected(list->dst.rt_next, 1);
770                         rt_free(list);
771                 }
772         }
773 }
774
775 /*
776  * While freeing expired entries, we compute average chain length
777  * and standard deviation, using fixed-point arithmetic.
778  * This is to have an estimate of rt_chain_length_max:
779  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
780  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
781  */
782
783 #define FRACT_BITS 3
784 #define ONE (1UL << FRACT_BITS)
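/*
 * Worked example (illustrative, added for clarity): has_noalias() below
 * returns ONE (8) per distinct entry, so chain lengths are accumulated in
 * 1/8 units.  If rt_check_expire() measures an average of 10 distinct
 * entries per bucket (avg = 80) with a standard deviation of 2 entries
 * (sd = 16), then (avg + 4*sd) >> FRACT_BITS = 144 >> 3 = 18, and
 * rt_chain_length_max becomes max(ip_rt_gc_elasticity, 18) = 18 with the
 * default elasticity of 8.
 */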
785
786 /*
787  * Given a hash chain and an item in this hash chain,
788  * find whether a previous entry has the same hash_inputs
789  * (but differs on tos, mark or oif).
790  * Returns 0 if an alias is found.
791  * Returns ONE if rth has no alias before itself.
792  */
793 static int has_noalias(const struct rtable *head, const struct rtable *rth)
794 {
795         const struct rtable *aux = head;
796
797         while (aux != rth) {
798                 if (compare_hash_inputs(aux, rth))
799                         return 0;
800                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
801         }
802         return ONE;
803 }
804
805 static void rt_check_expire(void)
806 {
807         static unsigned int rover;
808         unsigned int i = rover, goal;
809         struct rtable *rth;
810         struct rtable __rcu **rthp;
811         unsigned long samples = 0;
812         unsigned long sum = 0, sum2 = 0;
813         unsigned long delta;
814         u64 mult;
815
816         delta = jiffies - expires_ljiffies;
817         expires_ljiffies = jiffies;
818         mult = ((u64)delta) << rt_hash_log;
819         if (ip_rt_gc_timeout > 1)
820                 do_div(mult, ip_rt_gc_timeout);
821         goal = (unsigned int)mult;
822         if (goal > rt_hash_mask)
823                 goal = rt_hash_mask + 1;
824         for (; goal > 0; goal--) {
825                 unsigned long tmo = ip_rt_gc_timeout;
826                 unsigned long length;
827
828                 i = (i + 1) & rt_hash_mask;
829                 rthp = &rt_hash_table[i].chain;
830
831                 if (need_resched())
832                         cond_resched();
833
834                 samples++;
835
836                 if (rcu_dereference_raw(*rthp) == NULL)
837                         continue;
838                 length = 0;
839                 spin_lock_bh(rt_hash_lock_addr(i));
840                 while ((rth = rcu_dereference_protected(*rthp,
841                                         lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
842                         prefetch(rth->dst.rt_next);
843                         if (rt_is_expired(rth) ||
844                             rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
845                                 *rthp = rth->dst.rt_next;
846                                 rt_free(rth);
847                                 continue;
848                         }
849
850                         /* We only count entries on a chain with equal
851                          * hash inputs once so that entries for
852                          * different QOS levels, and other non-hash
853                          * input attributes don't unfairly skew the
854                          * length computation
855                          */
856                         tmo >>= 1;
857                         rthp = &rth->dst.rt_next;
858                         length += has_noalias(rt_hash_table[i].chain, rth);
859                 }
860                 spin_unlock_bh(rt_hash_lock_addr(i));
861                 sum += length;
862                 sum2 += length*length;
863         }
864         if (samples) {
865                 unsigned long avg = sum / samples;
866                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
867                 rt_chain_length_max = max_t(unsigned long,
868                                         ip_rt_gc_elasticity,
869                                         (avg + 4*sd) >> FRACT_BITS);
870         }
871         rover = i;
872 }
873
874 /*
875  * rt_worker_func() is run in process context.
876  * we call rt_check_expire() to scan part of the hash table
877  */
878 static void rt_worker_func(struct work_struct *work)
879 {
880         rt_check_expire();
881         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
882 }
883
884 /*
885  * Perturbation of rt_genid by a small quantity [1..256]
886  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
887  * many times (2^24) without reusing a recent rt_genid.
888  * Jenkins hash is strong enough that little changes of rt_genid are OK.
889  */
890 static void rt_cache_invalidate(struct net *net)
891 {
892         unsigned char shuffle;
893
894         get_random_bytes(&shuffle, sizeof(shuffle));
895         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
896 }
897
898 /*
899  * delay < 0  : invalidate cache (fast : entries will be deleted later)
900  * delay >= 0 : invalidate & flush cache (can be long)
901  */
902 void rt_cache_flush(struct net *net, int delay)
903 {
904         rt_cache_invalidate(net);
905         if (delay >= 0)
906                 rt_do_flush(net, !in_softirq());
907 }
908
909 /* Flush previously invalidated entries from the cache */
910 void rt_cache_flush_batch(struct net *net)
911 {
912         rt_do_flush(net, !in_softirq());
913 }
914
915 static void rt_emergency_hash_rebuild(struct net *net)
916 {
917         net_warn_ratelimited("Route hash chain too long!\n");
918         rt_cache_invalidate(net);
919 }
920
921 /*
922    Short description of GC goals.
923
924    We want to build an algorithm which keeps the routing cache
925    at an equilibrium point, where the number of aged-off entries
926    is kept approximately equal to the number of newly generated ones.
927
928    Current expiration strength is variable "expire".
929    We try to adjust it dynamically, so that when the network
930    is idle "expire" is large enough to keep enough warm entries,
931    and when load increases it shrinks to limit the cache size.
932  */
933
934 static int rt_garbage_collect(struct dst_ops *ops)
935 {
936         static unsigned long expire = RT_GC_TIMEOUT;
937         static unsigned long last_gc;
938         static int rover;
939         static int equilibrium;
940         struct rtable *rth;
941         struct rtable __rcu **rthp;
942         unsigned long now = jiffies;
943         int goal;
944         int entries = dst_entries_get_fast(&ipv4_dst_ops);
945
946         /*
947          * Garbage collection is pretty expensive,
948          * do not run it too frequently.
949          */
950
951         RT_CACHE_STAT_INC(gc_total);
952
953         if (now - last_gc < ip_rt_gc_min_interval &&
954             entries < ip_rt_max_size) {
955                 RT_CACHE_STAT_INC(gc_ignored);
956                 goto out;
957         }
958
959         entries = dst_entries_get_slow(&ipv4_dst_ops);
960         /* Calculate the number of entries which we want to expire now. */
961         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
962         if (goal <= 0) {
963                 if (equilibrium < ipv4_dst_ops.gc_thresh)
964                         equilibrium = ipv4_dst_ops.gc_thresh;
965                 goal = entries - equilibrium;
966                 if (goal > 0) {
967                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
968                         goal = entries - equilibrium;
969                 }
970         } else {
971                 /* We are in a dangerous area. Try to reduce the cache really
972                  * aggressively.
973                  */
974                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
975                 equilibrium = entries - goal;
976         }
977
978         if (now - last_gc >= ip_rt_gc_min_interval)
979                 last_gc = now;
980
981         if (goal <= 0) {
982                 equilibrium += goal;
983                 goto work_done;
984         }
985
986         do {
987                 int i, k;
988
989                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
990                         unsigned long tmo = expire;
991
992                         k = (k + 1) & rt_hash_mask;
993                         rthp = &rt_hash_table[k].chain;
994                         spin_lock_bh(rt_hash_lock_addr(k));
995                         while ((rth = rcu_dereference_protected(*rthp,
996                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
997                                 if (!rt_is_expired(rth) &&
998                                         !rt_may_expire(rth, tmo, expire)) {
999                                         tmo >>= 1;
1000                                         rthp = &rth->dst.rt_next;
1001                                         continue;
1002                                 }
1003                                 *rthp = rth->dst.rt_next;
1004                                 rt_free(rth);
1005                                 goal--;
1006                         }
1007                         spin_unlock_bh(rt_hash_lock_addr(k));
1008                         if (goal <= 0)
1009                                 break;
1010                 }
1011                 rover = k;
1012
1013                 if (goal <= 0)
1014                         goto work_done;
1015
1016                 /* Goal is not achieved. We stop the process if:
1017 
1018                    - expire has been reduced to zero; otherwise, expire is halved.
1019                    - the table is not full.
1020                    - we are called from interrupt context.
1021                    - the jiffies check is just a fallback/debug loop breaker.
1022                      We will not spin here for a long time in any case.
1023                  */
1024
1025                 RT_CACHE_STAT_INC(gc_goal_miss);
1026
1027                 if (expire == 0)
1028                         break;
1029
1030                 expire >>= 1;
1031
1032                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1033                         goto out;
1034         } while (!in_softirq() && time_before_eq(jiffies, now));
1035
1036         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1037                 goto out;
1038         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1039                 goto out;
1040         net_warn_ratelimited("dst cache overflow\n");
1041         RT_CACHE_STAT_INC(gc_dst_overflow);
1042         return 1;
1043
1044 work_done:
1045         expire += ip_rt_gc_min_interval;
1046         if (expire > ip_rt_gc_timeout ||
1047             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1048             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1049                 expire = ip_rt_gc_timeout;
1050 out:    return 0;
1051 }
1052
1053 /*
1054  * Returns the number of entries in a hash chain that have different hash_inputs
1055  */
1056 static int slow_chain_length(const struct rtable *head)
1057 {
1058         int length = 0;
1059         const struct rtable *rth = head;
1060
1061         while (rth) {
1062                 length += has_noalias(head, rth);
1063                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1064         }
1065         return length >> FRACT_BITS;
1066 }
1067
1068 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
1069                                            struct sk_buff *skb,
1070                                            const void *daddr)
1071 {
1072         struct net_device *dev = dst->dev;
1073         const __be32 *pkey = daddr;
1074         const struct rtable *rt;
1075         struct neighbour *n;
1076
1077         rt = (const struct rtable *) dst;
1078         if (rt->rt_gateway)
1079                 pkey = (const __be32 *) &rt->rt_gateway;
1080         else if (skb)
1081                 pkey = &ip_hdr(skb)->daddr;
1082
1083         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
1084         if (n)
1085                 return n;
1086         return neigh_create(&arp_tbl, pkey, dev);
1087 }
1088
1089 static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
1090                                      struct sk_buff *skb, int ifindex)
1091 {
1092         struct rtable   *rth, *cand;
1093         struct rtable __rcu **rthp, **candp;
1094         unsigned long   now;
1095         u32             min_score;
1096         int             chain_length;
1097
1098 restart:
1099         chain_length = 0;
1100         min_score = ~(u32)0;
1101         cand = NULL;
1102         candp = NULL;
1103         now = jiffies;
1104
1105         if (!rt_caching(dev_net(rt->dst.dev)) || (rt->dst.flags & DST_NOCACHE)) {
1106                 /*
1107                  * If we're not caching, just tell the caller we
1108                  * were successful and don't touch the route.  The
1109                  * caller holds the sole reference to the cache entry, and
1110                  * it will be released when the caller is done with it.
1111                  * If we drop it here, the callers have no way to resolve routes
1112                  * when we're not caching.  Instead, just point *rp at rt, so
1113                  * the caller gets a single use out of the route
1114                  * Note that we do rt_free on this new route entry, so that
1115                  * once its refcount hits zero, we are still able to reap it
1116                  * (Thanks Alexey)
1117                  * Note: To avoid expensive rcu stuff for this uncached dst,
1118                  * we set DST_NOCACHE so that dst_release() can free dst without
1119                  * waiting for a grace period.
1120                  */
1121
1122                 rt->dst.flags |= DST_NOCACHE;
1123                 goto skip_hashing;
1124         }
1125
1126         rthp = &rt_hash_table[hash].chain;
1127
1128         spin_lock_bh(rt_hash_lock_addr(hash));
1129         while ((rth = rcu_dereference_protected(*rthp,
1130                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1131                 if (rt_is_expired(rth)) {
1132                         *rthp = rth->dst.rt_next;
1133                         rt_free(rth);
1134                         continue;
1135                 }
1136                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1137                         /* Put it first */
1138                         *rthp = rth->dst.rt_next;
1139                         /*
1140                          * Since lookup is lockfree, the deletion
1141                          * must be visible to another weakly ordered CPU before
1142                          * the insertion at the start of the hash chain.
1143                          */
1144                         rcu_assign_pointer(rth->dst.rt_next,
1145                                            rt_hash_table[hash].chain);
1146                         /*
1147                          * Since lookup is lockfree, the update writes
1148                          * must be ordered for consistency on SMP.
1149                          */
1150                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1151
1152                         dst_use(&rth->dst, now);
1153                         spin_unlock_bh(rt_hash_lock_addr(hash));
1154
1155                         rt_drop(rt);
1156                         if (skb)
1157                                 skb_dst_set(skb, &rth->dst);
1158                         return rth;
1159                 }
1160
1161                 if (!atomic_read(&rth->dst.__refcnt)) {
1162                         u32 score = rt_score(rth);
1163
1164                         if (score <= min_score) {
1165                                 cand = rth;
1166                                 candp = rthp;
1167                                 min_score = score;
1168                         }
1169                 }
1170
1171                 chain_length++;
1172
1173                 rthp = &rth->dst.rt_next;
1174         }
1175
1176         if (cand) {
1177                 /* ip_rt_gc_elasticity used to be the average chain
1178                  * length; when it is exceeded, gc becomes really aggressive.
1179                  *
1180                  * The second limit is less certain. At the moment it allows
1181                  * only 2 entries per bucket. We will see.
1182                  */
1183                 if (chain_length > ip_rt_gc_elasticity) {
1184                         *candp = cand->dst.rt_next;
1185                         rt_free(cand);
1186                 }
1187         } else {
1188                 if (chain_length > rt_chain_length_max &&
1189                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1190                         struct net *net = dev_net(rt->dst.dev);
1191                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1192                         if (!rt_caching(net)) {
1193                                 pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
1194                                         rt->dst.dev->name, num);
1195                         }
1196                         rt_emergency_hash_rebuild(net);
1197                         spin_unlock_bh(rt_hash_lock_addr(hash));
1198
1199                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1200                                         ifindex, rt_genid(net));
1201                         goto restart;
1202                 }
1203         }
1204
1205         rt->dst.rt_next = rt_hash_table[hash].chain;
1206
1207         /*
1208          * Since lookup is lockfree, we must make sure
1209          * previous writes to rt are committed to memory
1210          * before making rt visible to other CPUs.
1211          */
1212         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1213
1214         spin_unlock_bh(rt_hash_lock_addr(hash));
1215
1216 skip_hashing:
1217         if (skb)
1218                 skb_dst_set(skb, &rt->dst);
1219         return rt;
1220 }
1221
1222 /*
1223  * Peer allocation may fail only in serious out-of-memory conditions.  However
1224  * we can still generate some output.
1225  * Random ID selection looks a bit dangerous because we have no chance to
1226  * select an ID that is unique within a reasonable period of time.
1227  * But a broken packet identifier may be better than no packet at all.
1228  */
1229 static void ip_select_fb_ident(struct iphdr *iph)
1230 {
1231         static DEFINE_SPINLOCK(ip_fb_id_lock);
1232         static u32 ip_fallback_id;
1233         u32 salt;
1234
1235         spin_lock_bh(&ip_fb_id_lock);
1236         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1237         iph->id = htons(salt & 0xFFFF);
1238         ip_fallback_id = salt;
1239         spin_unlock_bh(&ip_fb_id_lock);
1240 }
1241
1242 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1243 {
1244         struct net *net = dev_net(dst->dev);
1245         struct inet_peer *peer;
1246
1247         peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
1248         if (peer) {
1249                 iph->id = htons(inet_getid(peer, more));
1250                 inet_putpeer(peer);
1251                 return;
1252         }
1253
1254         ip_select_fb_ident(iph);
1255 }
1256 EXPORT_SYMBOL(__ip_select_ident);
1257
1258 static void rt_del(unsigned int hash, struct rtable *rt)
1259 {
1260         struct rtable __rcu **rthp;
1261         struct rtable *aux;
1262
1263         rthp = &rt_hash_table[hash].chain;
1264         spin_lock_bh(rt_hash_lock_addr(hash));
1265         ip_rt_put(rt);
1266         while ((aux = rcu_dereference_protected(*rthp,
1267                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1268                 if (aux == rt || rt_is_expired(aux)) {
1269                         *rthp = aux->dst.rt_next;
1270                         rt_free(aux);
1271                         continue;
1272                 }
1273                 rthp = &aux->dst.rt_next;
1274         }
1275         spin_unlock_bh(rt_hash_lock_addr(hash));
1276 }
1277
1278 static void __build_flow_key(struct flowi4 *fl4, struct sock *sk,
1279                              const struct iphdr *iph,
1280                              int oif, u8 tos,
1281                              u8 prot, u32 mark, int flow_flags)
1282 {
1283         if (sk) {
1284                 const struct inet_sock *inet = inet_sk(sk);
1285
1286                 oif = sk->sk_bound_dev_if;
1287                 mark = sk->sk_mark;
1288                 tos = RT_CONN_FLAGS(sk);
1289                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
1290         }
1291         flowi4_init_output(fl4, oif, mark, tos,
1292                            RT_SCOPE_UNIVERSE, prot,
1293                            flow_flags,
1294                            iph->daddr, iph->saddr, 0, 0);
1295 }
1296
1297 static void build_skb_flow_key(struct flowi4 *fl4, struct sk_buff *skb, struct sock *sk)
1298 {
1299         const struct iphdr *iph = ip_hdr(skb);
1300         int oif = skb->dev->ifindex;
1301         u8 tos = RT_TOS(iph->tos);
1302         u8 prot = iph->protocol;
1303         u32 mark = skb->mark;
1304
1305         __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
1306 }
1307
1308 static void build_sk_flow_key(struct flowi4 *fl4, struct sock *sk)
1309 {
1310         const struct inet_sock *inet = inet_sk(sk);
1311         struct ip_options_rcu *inet_opt;
1312         __be32 daddr = inet->inet_daddr;
1313
1314         rcu_read_lock();
1315         inet_opt = rcu_dereference(inet->inet_opt);
1316         if (inet_opt && inet_opt->opt.srr)
1317                 daddr = inet_opt->opt.faddr;
1318         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
1319                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
1320                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
1321                            inet_sk_flowi_flags(sk),
1322                            daddr, inet->inet_saddr, 0, 0);
1323         rcu_read_unlock();
1324 }
1325
1326 static void ip_rt_build_flow_key(struct flowi4 *fl4, struct sock *sk,
1327                                  struct sk_buff *skb)
1328 {
1329         if (skb)
1330                 build_skb_flow_key(fl4, skb, sk);
1331         else
1332                 build_sk_flow_key(fl4, sk);
1333 }
1334
1335 static DEFINE_SPINLOCK(fnhe_lock);
1336
1337 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash, __be32 daddr)
1338 {
1339         struct fib_nh_exception *fnhe, *oldest;
1340
1341         oldest = rcu_dereference(hash->chain);
1342         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
1343              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1344                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
1345                         oldest = fnhe;
1346         }
1347         return oldest;
1348 }
1349
1350 static struct fib_nh_exception *find_or_create_fnhe(struct fib_nh *nh, __be32 daddr)
1351 {
1352         struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1353         struct fib_nh_exception *fnhe;
1354         int depth;
1355         u32 hval;
1356
1357         if (!hash) {
1358                 hash = nh->nh_exceptions = kzalloc(FNHE_HASH_SIZE * sizeof(*hash),
1359                                                    GFP_ATOMIC);
1360                 if (!hash)
1361                         return NULL;
1362         }
1363
1364         hval = (__force u32) daddr;
1365         hval ^= (hval >> 11) ^ (hval >> 22);
1366         hash += hval;
1367
1368         depth = 0;
1369         for (fnhe = rcu_dereference(hash->chain); fnhe;
1370              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1371                 if (fnhe->fnhe_daddr == daddr)
1372                         goto out;
1373                 depth++;
1374         }
1375
1376         if (depth > FNHE_RECLAIM_DEPTH) {
1377                 fnhe = fnhe_oldest(hash + hval, daddr);
1378                 goto out_daddr;
1379         }
1380         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
1381         if (!fnhe)
1382                 return NULL;
1383
1384         fnhe->fnhe_next = hash->chain;
1385         rcu_assign_pointer(hash->chain, fnhe);
1386
1387 out_daddr:
1388         fnhe->fnhe_daddr = daddr;
1389 out:
1390         fnhe->fnhe_stamp = jiffies;
1391         return fnhe;
1392 }
1393
1394 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4)
1395 {
1396         __be32 new_gw = icmp_hdr(skb)->un.gateway;
1397         __be32 old_gw = ip_hdr(skb)->saddr;
1398         struct net_device *dev = skb->dev;
1399         struct in_device *in_dev;
1400         struct fib_result res;
1401         struct neighbour *n;
1402         struct net *net;
1403
1404         switch (icmp_hdr(skb)->code & 7) {
1405         case ICMP_REDIR_NET:
1406         case ICMP_REDIR_NETTOS:
1407         case ICMP_REDIR_HOST:
1408         case ICMP_REDIR_HOSTTOS:
1409                 break;
1410
1411         default:
1412                 return;
1413         }
1414
1415         if (rt->rt_gateway != old_gw)
1416                 return;
1417
1418         in_dev = __in_dev_get_rcu(dev);
1419         if (!in_dev)
1420                 return;
1421
1422         net = dev_net(dev);
1423         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1424             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1425             ipv4_is_zeronet(new_gw))
1426                 goto reject_redirect;
1427
1428         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1429                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1430                         goto reject_redirect;
1431                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1432                         goto reject_redirect;
1433         } else {
1434                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1435                         goto reject_redirect;
1436         }
1437
1438         n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
1439         if (n) {
1440                 if (!(n->nud_state & NUD_VALID)) {
1441                         neigh_event_send(n, NULL);
1442                 } else {
1443                         if (fib_lookup(net, fl4, &res) == 0) {
1444                                 struct fib_nh *nh = &FIB_RES_NH(res);
1445                                 struct fib_nh_exception *fnhe;
1446
1447                                 spin_lock_bh(&fnhe_lock);
1448                                 fnhe = find_or_create_fnhe(nh, fl4->daddr);
1449                                 if (fnhe)
1450                                         fnhe->fnhe_gw = new_gw;
1451                                 spin_unlock_bh(&fnhe_lock);
1452                         }
1453                         rt->rt_gateway = new_gw;
1454                         rt->rt_flags |= RTCF_REDIRECTED;
1455                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1456                 }
1457                 neigh_release(n);
1458         }
1459         return;
1460
1461 reject_redirect:
1462 #ifdef CONFIG_IP_ROUTE_VERBOSE
1463         if (IN_DEV_LOG_MARTIANS(in_dev)) {
1464                 const struct iphdr *iph = (const struct iphdr *) skb->data;
1465                 __be32 daddr = iph->daddr;
1466                 __be32 saddr = iph->saddr;
1467
1468                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
1469                                      "  Advised path = %pI4 -> %pI4\n",
1470                                      &old_gw, dev->name, &new_gw,
1471                                      &saddr, &daddr);
1472         }
1473 #endif
1474         ;
1475 }
1476
1477 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
1478 {
1479         struct rtable *rt;
1480         struct flowi4 fl4;
1481
1482         rt = (struct rtable *) dst;
1483
1484         ip_rt_build_flow_key(&fl4, sk, skb);
1485         __ip_do_redirect(rt, skb, &fl4);
1486 }
1487
1488 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1489 {
1490         struct rtable *rt = (struct rtable *)dst;
1491         struct dst_entry *ret = dst;
1492
1493         if (rt) {
1494                 if (dst->obsolete > 0) {
1495                         ip_rt_put(rt);
1496                         ret = NULL;
1497                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1498                            rt->dst.expires) {
1499                         unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1500                                                 rt->rt_oif,
1501                                                 rt_genid(dev_net(dst->dev)));
1502                         rt_del(hash, rt);
1503                         ret = NULL;
1504                 }
1505         }
1506         return ret;
1507 }
1508
1509 /*
1510  * Algorithm:
1511  *      1. The first ip_rt_redirect_number redirects are sent
1512  *         with exponential backoff, then we stop sending them at all,
1513  *         assuming that the host ignores our redirects.
1514  *      2. If we did not see packets requiring redirects
1515  *         during ip_rt_redirect_silence, we assume that the host
1516  *         has forgotten the redirected route and start sending redirects again.
1517  *
1518  * This algorithm is much cheaper and more intelligent than dumb load limiting
1519  * in icmp.c.
1520  *
1521  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1522  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1523  */
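/*
 * Concretely (reading the token check in ip_rt_send_redirect() below):
 * the very first redirect (rate_tokens == 0) is sent immediately; after
 * that, with rate_tokens == k, another redirect goes out only once
 * jiffies is past rate_last + (ip_rt_redirect_load << k), so the minimum
 * gap doubles with every redirect already sent.  Once rate_tokens reaches
 * ip_rt_redirect_number nothing more is sent, and a quiet period of
 * ip_rt_redirect_silence resets rate_tokens to zero so the sequence can
 * start over.
 */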
1524
1525 void ip_rt_send_redirect(struct sk_buff *skb)
1526 {
1527         struct rtable *rt = skb_rtable(skb);
1528         struct in_device *in_dev;
1529         struct inet_peer *peer;
1530         struct net *net;
1531         int log_martians;
1532
1533         rcu_read_lock();
1534         in_dev = __in_dev_get_rcu(rt->dst.dev);
1535         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1536                 rcu_read_unlock();
1537                 return;
1538         }
1539         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1540         rcu_read_unlock();
1541
1542         net = dev_net(rt->dst.dev);
1543         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
1544         if (!peer) {
1545                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1546                 return;
1547         }
1548
1549         /* No redirected packets during ip_rt_redirect_silence;
1550          * reset the algorithm.
1551          */
1552         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1553                 peer->rate_tokens = 0;
1554
1555         /* Too many ignored redirects; do not send anything.
1556          * Set dst.rate_last to the last seen redirected packet.
1557          */
1558         if (peer->rate_tokens >= ip_rt_redirect_number) {
1559                 peer->rate_last = jiffies;
1560                 goto out_put_peer;
1561         }
1562
1563         /* Check for load limit; set rate_last to the latest sent
1564          * redirect.
1565          */
1566         if (peer->rate_tokens == 0 ||
1567             time_after(jiffies,
1568                        (peer->rate_last +
1569                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1570                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1571                 peer->rate_last = jiffies;
1572                 ++peer->rate_tokens;
1573 #ifdef CONFIG_IP_ROUTE_VERBOSE
1574                 if (log_martians &&
1575                     peer->rate_tokens == ip_rt_redirect_number)
1576                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
1577                                              &ip_hdr(skb)->saddr, rt->rt_iif,
1578                                              &rt->rt_dst, &rt->rt_gateway);
1579 #endif
1580         }
1581 out_put_peer:
1582         inet_putpeer(peer);
1583 }
1584
1585 static int ip_error(struct sk_buff *skb)
1586 {
1587         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
1588         struct rtable *rt = skb_rtable(skb);
1589         struct inet_peer *peer;
1590         unsigned long now;
1591         struct net *net;
1592         bool send;
1593         int code;
1594
1595         net = dev_net(rt->dst.dev);
1596         if (!IN_DEV_FORWARD(in_dev)) {
1597                 switch (rt->dst.error) {
1598                 case EHOSTUNREACH:
1599                         IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
1600                         break;
1601
1602                 case ENETUNREACH:
1603                         IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
1604                         break;
1605                 }
1606                 goto out;
1607         }
1608
1609         switch (rt->dst.error) {
1610         case EINVAL:
1611         default:
1612                 goto out;
1613         case EHOSTUNREACH:
1614                 code = ICMP_HOST_UNREACH;
1615                 break;
1616         case ENETUNREACH:
1617                 code = ICMP_NET_UNREACH;
1618                 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
1619                 break;
1620         case EACCES:
1621                 code = ICMP_PKT_FILTERED;
1622                 break;
1623         }
1624
1625         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
1626
1627         send = true;
1628         if (peer) {
1629                 now = jiffies;
1630                 peer->rate_tokens += now - peer->rate_last;
1631                 if (peer->rate_tokens > ip_rt_error_burst)
1632                         peer->rate_tokens = ip_rt_error_burst;
1633                 peer->rate_last = now;
1634                 if (peer->rate_tokens >= ip_rt_error_cost)
1635                         peer->rate_tokens -= ip_rt_error_cost;
1636                 else
1637                         send = false;
1638                 inet_putpeer(peer);
1639         }
1640         if (send)
1641                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1642
1643 out:    kfree_skb(skb);
1644         return 0;
1645 }
1646
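/* Record a learned path MTU: clamp it to ip_rt_min_pmtu, store it in the
 * nexthop exception for fl4->daddr (so later routes to this destination
 * pick it up via rt_bind_exception()), and mirror it on this route with
 * an ip_rt_mtu_expires lifetime.
 */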
1647 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1648 {
1649         struct fib_result res;
1650
1651         if (mtu < ip_rt_min_pmtu)
1652                 mtu = ip_rt_min_pmtu;
1653
1654         if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
1655                 struct fib_nh *nh = &FIB_RES_NH(res);
1656                 struct fib_nh_exception *fnhe;
1657
1658                 spin_lock_bh(&fnhe_lock);
1659                 fnhe = find_or_create_fnhe(nh, fl4->daddr);
1660                 if (fnhe) {
1661                         fnhe->fnhe_pmtu = mtu;
1662                         fnhe->fnhe_expires = jiffies + ip_rt_mtu_expires;
1663                 }
1664                 spin_unlock_bh(&fnhe_lock);
1665         }
1666         rt->rt_pmtu = mtu;
1667         dst_set_expires(&rt->dst, ip_rt_mtu_expires);
1668 }
1669
1670 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1671                               struct sk_buff *skb, u32 mtu)
1672 {
1673         struct rtable *rt = (struct rtable *) dst;
1674         struct flowi4 fl4;
1675
1676         ip_rt_build_flow_key(&fl4, sk, skb);
1677         __ip_rt_update_pmtu(rt, &fl4, mtu);
1678 }
1679
1680 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1681                       int oif, u32 mark, u8 protocol, int flow_flags)
1682 {
1683         const struct iphdr *iph = (const struct iphdr *) skb->data;
1684         struct flowi4 fl4;
1685         struct rtable *rt;
1686
1687         __build_flow_key(&fl4, NULL, iph, oif,
1688                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1689         rt = __ip_route_output_key(net, &fl4);
1690         if (!IS_ERR(rt)) {
1691                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1692                 ip_rt_put(rt);
1693         }
1694 }
1695 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
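/* A minimal usage sketch (illustrative only; the tunnel-error caller and
 * the new_mtu variable are assumptions, not taken from this file): a
 * protocol error handler whose skb->data points at the quoted inner IP
 * header of an ICMP "fragmentation needed" message could propagate the
 * advertised MTU like this:
 *
 *      ipv4_update_pmtu(skb, dev_net(skb->dev), new_mtu,
 *                       0, 0, IPPROTO_IPIP, 0);
 *
 * i.e. rebuild the flow key from the quoted header, look the route up
 * again and let __ip_rt_update_pmtu() record the new MTU.
 */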
1696
1697 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1698 {
1699         const struct iphdr *iph = (const struct iphdr *) skb->data;
1700         struct flowi4 fl4;
1701         struct rtable *rt;
1702
1703         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1704         rt = __ip_route_output_key(sock_net(sk), &fl4);
1705         if (!IS_ERR(rt)) {
1706                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1707                 ip_rt_put(rt);
1708         }
1709 }
1710 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1711
1712 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1713                    int oif, u32 mark, u8 protocol, int flow_flags)
1714 {
1715         const struct iphdr *iph = (const struct iphdr *) skb->data;
1716         struct flowi4 fl4;
1717         struct rtable *rt;
1718
1719         __build_flow_key(&fl4, NULL, iph, oif,
1720                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1721         rt = __ip_route_output_key(net, &fl4);
1722         if (!IS_ERR(rt)) {
1723                 __ip_do_redirect(rt, skb, &fl4);
1724                 ip_rt_put(rt);
1725         }
1726 }
1727 EXPORT_SYMBOL_GPL(ipv4_redirect);
1728
1729 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1730 {
1731         const struct iphdr *iph = (const struct iphdr *) skb->data;
1732         struct flowi4 fl4;
1733         struct rtable *rt;
1734
1735         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1736         rt = __ip_route_output_key(sock_net(sk), &fl4);
1737         if (!IS_ERR(rt)) {
1738                 __ip_do_redirect(rt, skb, &fl4);
1739                 ip_rt_put(rt);
1740         }
1741 }
1742 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1743
1744 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1745 {
1746         struct rtable *rt = (struct rtable *) dst;
1747
1748         if (rt_is_expired(rt))
1749                 return NULL;
1750         return dst;
1751 }
1752
1753 static void ipv4_dst_destroy(struct dst_entry *dst)
1754 {
1755         struct rtable *rt = (struct rtable *) dst;
1756
1757         if (rt->fi) {
1758                 fib_info_put(rt->fi);
1759                 rt->fi = NULL;
1760         }
1761 }
1762
1763
1764 static void ipv4_link_failure(struct sk_buff *skb)
1765 {
1766         struct rtable *rt;
1767
1768         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1769
1770         rt = skb_rtable(skb);
1771         if (rt)
1772                 dst_set_expires(&rt->dst, 0);
1773 }
1774
1775 static int ip_rt_bug(struct sk_buff *skb)
1776 {
1777         pr_debug("%s: %pI4 -> %pI4, %s\n",
1778                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1779                  skb->dev ? skb->dev->name : "?");
1780         kfree_skb(skb);
1781         WARN_ON(1);
1782         return 0;
1783 }
1784
1785 /*
1786    We do not cache the source address of the outgoing interface,
1787    because it is used only by the IP RR, TS and SRR options,
1788    so it is out of the fast path.
1789
1790    BTW remember: "addr" is allowed to be unaligned
1791    in IP options!
1792  */
1793
1794 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1795 {
1796         __be32 src;
1797
1798         if (rt_is_output_route(rt))
1799                 src = ip_hdr(skb)->saddr;
1800         else {
1801                 struct fib_result res;
1802                 struct flowi4 fl4;
1803                 struct iphdr *iph;
1804
1805                 iph = ip_hdr(skb);
1806
1807                 memset(&fl4, 0, sizeof(fl4));
1808                 fl4.daddr = iph->daddr;
1809                 fl4.saddr = iph->saddr;
1810                 fl4.flowi4_tos = RT_TOS(iph->tos);
1811                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1812                 fl4.flowi4_iif = skb->dev->ifindex;
1813                 fl4.flowi4_mark = skb->mark;
1814
1815                 rcu_read_lock();
1816                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1817                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1818                 else
1819                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1820                                         RT_SCOPE_UNIVERSE);
1821                 rcu_read_unlock();
1822         }
1823         memcpy(addr, &src, 4);
1824 }
1825
1826 #ifdef CONFIG_IP_ROUTE_CLASSID
1827 static void set_class_tag(struct rtable *rt, u32 tag)
1828 {
1829         if (!(rt->dst.tclassid & 0xFFFF))
1830                 rt->dst.tclassid |= tag & 0xFFFF;
1831         if (!(rt->dst.tclassid & 0xFFFF0000))
1832                 rt->dst.tclassid |= tag & 0xFFFF0000;
1833 }
1834 #endif
1835
1836 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1837 {
1838         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1839
1840         if (advmss == 0) {
1841                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1842                                ip_rt_min_advmss);
1843                 if (advmss > 65535 - 40)
1844                         advmss = 65535 - 40;
1845         }
1846         return advmss;
1847 }
1848
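/* Effective MTU for this dst: prefer the learned rt_pmtu while it has not
 * expired, then the RTAX_MTU metric; for input routes (or when neither is
 * set) fall back to the device MTU, clamped to 576 for gatewayed routes
 * whose MTU metric is locked, and capped at IP_MAX_MTU.
 */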
1849 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1850 {
1851         const struct rtable *rt = (const struct rtable *) dst;
1852         unsigned int mtu = rt->rt_pmtu;
1853
1854         if (mtu && time_after_eq(jiffies, rt->dst.expires))
1855                 mtu = 0;
1856
1857         if (!mtu)
1858                 mtu = dst_metric_raw(dst, RTAX_MTU);
1859
1860         if (mtu && rt_is_output_route(rt))
1861                 return mtu;
1862
1863         mtu = dst->dev->mtu;
1864
1865         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1866
1867                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1868                         mtu = 576;
1869         }
1870
1871         if (mtu > IP_MAX_MTU)
1872                 mtu = IP_MAX_MTU;
1873
1874         return mtu;
1875 }
1876
1877 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1878                             struct fib_info *fi)
1879 {
1880         if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1881                 rt->fi = fi;
1882                 atomic_inc(&fi->fib_clntref);
1883         }
1884         dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1885 }
1886
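/* Look up the nexthop exception entry for daddr (same hash as used by
 * find_or_create_fnhe()) and, if one exists, copy its stored PMTU (while
 * still valid) and gateway override onto the new route, refreshing the
 * exception's stamp.
 */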
1887 static void rt_bind_exception(struct rtable *rt, struct fib_nh *nh, __be32 daddr)
1888 {
1889         struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1890         struct fib_nh_exception *fnhe;
1891         u32 hval;
1892
1893         hval = (__force u32) daddr;
1894         hval ^= (hval >> 11) ^ (hval >> 22);
1895
1896         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1897              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1898                 if (fnhe->fnhe_daddr == daddr) {
1899                         if (fnhe->fnhe_pmtu) {
1900                                 unsigned long expires = fnhe->fnhe_expires;
1901                                 unsigned long diff = expires - jiffies;
1902
1903                                 if (time_before(jiffies, expires)) {
1904                                         rt->rt_pmtu = fnhe->fnhe_pmtu;
1905                                         dst_set_expires(&rt->dst, diff);
1906                                 }
1907                         }
1908                         if (fnhe->fnhe_gw)
1909                                 rt->rt_gateway = fnhe->fnhe_gw;
1910                         fnhe->fnhe_stamp = jiffies;
1911                         break;
1912                 }
1913         }
1914 }
1915
1916 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1917                            const struct fib_result *res,
1918                            struct fib_info *fi, u16 type, u32 itag)
1919 {
1920         if (fi) {
1921                 struct fib_nh *nh = &FIB_RES_NH(*res);
1922
1923                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
1924                         rt->rt_gateway = nh->nh_gw;
1925                 if (unlikely(nh->nh_exceptions))
1926                         rt_bind_exception(rt, nh, fl4->daddr);
1927                 rt_init_metrics(rt, fl4, fi);
1928 #ifdef CONFIG_IP_ROUTE_CLASSID
1929                 rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1930 #endif
1931         }
1932
1933 #ifdef CONFIG_IP_ROUTE_CLASSID
1934 #ifdef CONFIG_IP_MULTIPLE_TABLES
1935         set_class_tag(rt, res->tclassid);
1936 #endif
1937         set_class_tag(rt, itag);
1938 #endif
1939 }
1940
1941 static struct rtable *rt_dst_alloc(struct net_device *dev,
1942                                    bool nopolicy, bool noxfrm)
1943 {
1944         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1945                          DST_HOST |
1946                          (nopolicy ? DST_NOPOLICY : 0) |
1947                          (noxfrm ? DST_NOXFRM : 0));
1948 }
1949
1950 /* called in rcu_read_lock() section */
1951 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1952                                 u8 tos, struct net_device *dev, int our)
1953 {
1954         unsigned int hash;
1955         struct rtable *rth;
1956         struct in_device *in_dev = __in_dev_get_rcu(dev);
1957         u32 itag = 0;
1958         int err;
1959
1960         /* Primary sanity checks. */
1961
1962         if (in_dev == NULL)
1963                 return -EINVAL;
1964
1965         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1966             skb->protocol != htons(ETH_P_IP))
1967                 goto e_inval;
1968
1969         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1970                 if (ipv4_is_loopback(saddr))
1971                         goto e_inval;
1972
1973         if (ipv4_is_zeronet(saddr)) {
1974                 if (!ipv4_is_local_multicast(daddr))
1975                         goto e_inval;
1976         } else {
1977                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1978                                           in_dev, &itag);
1979                 if (err < 0)
1980                         goto e_err;
1981         }
1982         rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1983                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1984         if (!rth)
1985                 goto e_nobufs;
1986
1987 #ifdef CONFIG_IP_ROUTE_CLASSID
1988         rth->dst.tclassid = itag;
1989 #endif
1990         rth->dst.output = ip_rt_bug;
1991
1992         rth->rt_key_dst = daddr;
1993         rth->rt_key_src = saddr;
1994         rth->rt_genid   = rt_genid(dev_net(dev));
1995         rth->rt_flags   = RTCF_MULTICAST;
1996         rth->rt_type    = RTN_MULTICAST;
1997         rth->rt_key_tos = tos;
1998         rth->rt_dst     = daddr;
1999         rth->rt_src     = saddr;
2000         rth->rt_route_iif = dev->ifindex;
2001         rth->rt_iif     = dev->ifindex;
2002         rth->rt_oif     = 0;
2003         rth->rt_mark    = skb->mark;
2004         rth->rt_pmtu    = 0;
2005         rth->rt_gateway = daddr;
2006         rth->fi = NULL;
2007         if (our) {
2008                 rth->dst.input = ip_local_deliver;
2009                 rth->rt_flags |= RTCF_LOCAL;
2010         }
2011
2012 #ifdef CONFIG_IP_MROUTE
2013         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2014                 rth->dst.input = ip_mr_input;
2015 #endif
2016         RT_CACHE_STAT_INC(in_slow_mc);
2017
2018         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2019         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2020         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2021
2022 e_nobufs:
2023         return -ENOBUFS;
2024 e_inval:
2025         return -EINVAL;
2026 e_err:
2027         return err;
2028 }
2029
2030
2031 static void ip_handle_martian_source(struct net_device *dev,
2032                                      struct in_device *in_dev,
2033                                      struct sk_buff *skb,
2034                                      __be32 daddr,
2035                                      __be32 saddr)
2036 {
2037         RT_CACHE_STAT_INC(in_martian_src);
2038 #ifdef CONFIG_IP_ROUTE_VERBOSE
2039         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2040                 /*
2041                  *      RFC1812 recommendation: if the source is martian,
2042                  *      the only hint is the MAC header.
2043                  */
2044                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
2045                         &daddr, &saddr, dev->name);
2046                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2047                         print_hex_dump(KERN_WARNING, "ll header: ",
2048                                        DUMP_PREFIX_OFFSET, 16, 1,
2049                                        skb_mac_header(skb),
2050                                        dev->hard_header_len, true);
2051                 }
2052         }
2053 #endif
2054 }
2055
2056 /* called in rcu_read_lock() section */
2057 static int __mkroute_input(struct sk_buff *skb,
2058                            const struct fib_result *res,
2059                            struct in_device *in_dev,
2060                            __be32 daddr, __be32 saddr, u32 tos,
2061                            struct rtable **result)
2062 {
2063         struct rtable *rth;
2064         int err;
2065         struct in_device *out_dev;
2066         unsigned int flags = 0;
2067         u32 itag;
2068
2069         /* get a working reference to the output device */
2070         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2071         if (out_dev == NULL) {
2072                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
2073                 return -EINVAL;
2074         }
2075
2076
2077         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2078                                   in_dev->dev, in_dev, &itag);
2079         if (err < 0) {
2080                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2081                                          saddr);
2082
2083                 goto cleanup;
2084         }
2085
2086         if (err)
2087                 flags |= RTCF_DIRECTSRC;
2088
2089         if (out_dev == in_dev && err &&
2090             (IN_DEV_SHARED_MEDIA(out_dev) ||
2091              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2092                 flags |= RTCF_DOREDIRECT;
2093
2094         if (skb->protocol != htons(ETH_P_IP)) {
2095                 /* Not IP (i.e. ARP). Do not create a route if it is
2096                  * invalid for proxy arp. DNAT routes are always valid.
2097                  *
2098                  * The proxy arp feature has been extended to allow ARP
2099                  * replies back to the same interface, to support
2100                  * Private VLAN switch technologies. See arp.c.
2101                  */
2102                 if (out_dev == in_dev &&
2103                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2104                         err = -EINVAL;
2105                         goto cleanup;
2106                 }
2107         }
2108
2109         rth = rt_dst_alloc(out_dev->dev,
2110                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2111                            IN_DEV_CONF_GET(out_dev, NOXFRM));
2112         if (!rth) {
2113                 err = -ENOBUFS;
2114                 goto cleanup;
2115         }
2116
2117         rth->rt_key_dst = daddr;
2118         rth->rt_key_src = saddr;
2119         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2120         rth->rt_flags = flags;
2121         rth->rt_type = res->type;
2122         rth->rt_key_tos = tos;
2123         rth->rt_dst     = daddr;
2124         rth->rt_src     = saddr;
2125         rth->rt_route_iif = in_dev->dev->ifindex;
2126         rth->rt_iif     = in_dev->dev->ifindex;
2127         rth->rt_oif     = 0;
2128         rth->rt_mark    = skb->mark;
2129         rth->rt_pmtu    = 0;
2130         rth->rt_gateway = daddr;
2131         rth->fi = NULL;
2132
2133         rth->dst.input = ip_forward;
2134         rth->dst.output = ip_output;
2135
2136         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2137
2138         *result = rth;
2139         err = 0;
2140  cleanup:
2141         return err;
2142 }
2143
2144 static int ip_mkroute_input(struct sk_buff *skb,
2145                             struct fib_result *res,
2146                             const struct flowi4 *fl4,
2147                             struct in_device *in_dev,
2148                             __be32 daddr, __be32 saddr, u32 tos)
2149 {
2150         struct rtable *rth = NULL;
2151         int err;
2152         unsigned int hash;
2153
2154 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2155         if (res->fi && res->fi->fib_nhs > 1)
2156                 fib_select_multipath(res);
2157 #endif
2158
2159         /* create a routing cache entry */
2160         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2161         if (err)
2162                 return err;
2163
2164         /* put it into the cache */
2165         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2166                        rt_genid(dev_net(rth->dst.dev)));
2167         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2168         if (IS_ERR(rth))
2169                 return PTR_ERR(rth);
2170         return 0;
2171 }
2172
2173 /*
2174  *      NOTE. We drop all packets that have a local source
2175  *      address, because every properly looped-back packet
2176  *      must already have the correct destination attached by the output routine.
2177  *
2178  *      This approach solves two big problems:
2179  *      1. Non-simplex devices are handled properly.
2180  *      2. IP spoofing attempts are filtered with a 100% guarantee.
2181  *      Called with rcu_read_lock().
2182  */
2183
2184 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2185                                u8 tos, struct net_device *dev)
2186 {
2187         struct fib_result res;
2188         struct in_device *in_dev = __in_dev_get_rcu(dev);
2189         struct flowi4   fl4;
2190         unsigned int    flags = 0;
2191         u32             itag = 0;
2192         struct rtable   *rth;
2193         unsigned int    hash;
2194         int             err = -EINVAL;
2195         struct net    *net = dev_net(dev);
2196
2197         /* IP on this device is disabled. */
2198
2199         if (!in_dev)
2200                 goto out;
2201
2202         /* Check for the most weird martians, which cannot be detected
2203            by fib_lookup.
2204          */
2205
2206         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2207                 goto martian_source;
2208
2209         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2210                 goto brd_input;
2211
2212         /* Accept zero addresses only for limited broadcast;
2213          * I do not even know whether to fix this or not. Waiting for complaints :-)
2214          */
2215         if (ipv4_is_zeronet(saddr))
2216                 goto martian_source;
2217
2218         if (ipv4_is_zeronet(daddr))
2219                 goto martian_destination;
2220
2221         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
2222                 if (ipv4_is_loopback(daddr))
2223                         goto martian_destination;
2224
2225                 if (ipv4_is_loopback(saddr))
2226                         goto martian_source;
2227         }
2228
2229         /*
2230          *      Now we are ready to route the packet.
2231          */
2232         fl4.flowi4_oif = 0;
2233         fl4.flowi4_iif = dev->ifindex;
2234         fl4.flowi4_mark = skb->mark;
2235         fl4.flowi4_tos = tos;
2236         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2237         fl4.daddr = daddr;
2238         fl4.saddr = saddr;
2239         err = fib_lookup(net, &fl4, &res);
2240         if (err != 0)
2241                 goto no_route;
2242
2243         RT_CACHE_STAT_INC(in_slow_tot);
2244
2245         if (res.type == RTN_BROADCAST)
2246                 goto brd_input;
2247
2248         if (res.type == RTN_LOCAL) {
2249                 err = fib_validate_source(skb, saddr, daddr, tos,
2250                                           net->loopback_dev->ifindex,
2251                                           dev, in_dev, &itag);
2252                 if (err < 0)
2253                         goto martian_source_keep_err;
2254                 if (err)
2255                         flags |= RTCF_DIRECTSRC;
2256                 goto local_input;
2257         }
2258
2259         if (!IN_DEV_FORWARD(in_dev))
2260                 goto no_route;
2261         if (res.type != RTN_UNICAST)
2262                 goto martian_destination;
2263
2264         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2265 out:    return err;
2266
2267 brd_input:
2268         if (skb->protocol != htons(ETH_P_IP))
2269                 goto e_inval;
2270
2271         if (!ipv4_is_zeronet(saddr)) {
2272                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2273                                           in_dev, &itag);
2274                 if (err < 0)
2275                         goto martian_source_keep_err;
2276                 if (err)
2277                         flags |= RTCF_DIRECTSRC;
2278         }
2279         flags |= RTCF_BROADCAST;
2280         res.type = RTN_BROADCAST;
2281         RT_CACHE_STAT_INC(in_brd);
2282
2283 local_input:
2284         rth = rt_dst_alloc(net->loopback_dev,
2285                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2286         if (!rth)
2287                 goto e_nobufs;
2288
2289         rth->dst.input = ip_local_deliver;
2290         rth->dst.output = ip_rt_bug;
2291 #ifdef CONFIG_IP_ROUTE_CLASSID
2292         rth->dst.tclassid = itag;
2293 #endif
2294
2295         rth->rt_key_dst = daddr;
2296         rth->rt_key_src = saddr;
2297         rth->rt_genid = rt_genid(net);
2298         rth->rt_flags   = flags|RTCF_LOCAL;
2299         rth->rt_type    = res.type;
2300         rth->rt_key_tos = tos;
2301         rth->rt_dst     = daddr;
2302         rth->rt_src     = saddr;
2303         rth->rt_route_iif = dev->ifindex;
2304         rth->rt_iif     = dev->ifindex;
2305         rth->rt_oif     = 0;
2306         rth->rt_mark    = skb->mark;
2307         rth->rt_pmtu    = 0;
2308         rth->rt_gateway = daddr;
2309         rth->fi = NULL;
2310         if (res.type == RTN_UNREACHABLE) {
2311                 rth->dst.input = ip_error;
2312                 rth->dst.error = -err;
2313                 rth->rt_flags   &= ~RTCF_LOCAL;
2314         }
2315         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2316         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2317         err = 0;
2318         if (IS_ERR(rth))
2319                 err = PTR_ERR(rth);
2320         goto out;
2321
2322 no_route:
2323         RT_CACHE_STAT_INC(in_no_route);
2324         res.type = RTN_UNREACHABLE;
2325         if (err == -ESRCH)
2326                 err = -ENETUNREACH;
2327         goto local_input;
2328
2329         /*
2330          *      Do not cache martian addresses: they should be logged (RFC1812)
2331          */
2332 martian_destination:
2333         RT_CACHE_STAT_INC(in_martian_dst);
2334 #ifdef CONFIG_IP_ROUTE_VERBOSE
2335         if (IN_DEV_LOG_MARTIANS(in_dev))
2336                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2337                                      &daddr, &saddr, dev->name);
2338 #endif
2339
2340 e_inval:
2341         err = -EINVAL;
2342         goto out;
2343
2344 e_nobufs:
2345         err = -ENOBUFS;
2346         goto out;
2347
2348 martian_source:
2349         err = -EINVAL;
2350 martian_source_keep_err:
2351         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2352         goto out;
2353 }
2354
2355 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2356                            u8 tos, struct net_device *dev, bool noref)
2357 {
2358         struct rtable   *rth;
2359         unsigned int    hash;
2360         int iif = dev->ifindex;
2361         struct net *net;
2362         int res;
2363
2364         net = dev_net(dev);
2365
2366         rcu_read_lock();
2367
2368         if (!rt_caching(net))
2369                 goto skip_cache;
2370
2371         tos &= IPTOS_RT_MASK;
2372         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2373
2374         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2375              rth = rcu_dereference(rth->dst.rt_next)) {
2376                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2377                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2378                      (rth->rt_route_iif ^ iif) |
2379                      (rth->rt_key_tos ^ tos)) == 0 &&
2380                     rth->rt_mark == skb->mark &&
2381                     net_eq(dev_net(rth->dst.dev), net) &&
2382                     !rt_is_expired(rth)) {
2383                         if (noref) {
2384                                 dst_use_noref(&rth->dst, jiffies);
2385                                 skb_dst_set_noref(skb, &rth->dst);
2386                         } else {
2387                                 dst_use(&rth->dst, jiffies);
2388                                 skb_dst_set(skb, &rth->dst);
2389                         }
2390                         RT_CACHE_STAT_INC(in_hit);
2391                         rcu_read_unlock();
2392                         return 0;
2393                 }
2394                 RT_CACHE_STAT_INC(in_hlist_search);
2395         }
2396
2397 skip_cache:
2398         /* Multicast recognition logic is moved from the route cache to here.
2399            The problem was that too many Ethernet cards have broken/missing
2400            hardware multicast filters :-( As a result, a host on a multicast
2401            network acquires a lot of useless route cache entries, such as
2402            SDR messages from all over the world. Now we try to get rid of them.
2403            Really, provided the software IP multicast filter is organized
2404            reasonably (at least, hashed), it does not result in a slowdown
2405            compared with route cache reject entries.
2406            Note that multicast routers are not affected, because
2407            a route cache entry is created eventually.
2408          */
2409         if (ipv4_is_multicast(daddr)) {
2410                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2411
2412                 if (in_dev) {
2413                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2414                                                   ip_hdr(skb)->protocol);
2415                         if (our
2416 #ifdef CONFIG_IP_MROUTE
2417                                 ||
2418                             (!ipv4_is_local_multicast(daddr) &&
2419                              IN_DEV_MFORWARD(in_dev))
2420 #endif
2421                            ) {
2422                                 int res = ip_route_input_mc(skb, daddr, saddr,
2423                                                             tos, dev, our);
2424                                 rcu_read_unlock();
2425                                 return res;
2426                         }
2427                 }
2428                 rcu_read_unlock();
2429                 return -EINVAL;
2430         }
2431         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2432         rcu_read_unlock();
2433         return res;
2434 }
2435 EXPORT_SYMBOL(ip_route_input_common);
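/* A minimal usage sketch (illustrative; the surrounding receive-path code
 * is an assumption, not part of this file): a caller on the input path
 * typically routes an skb using the addresses from its IP header, e.g.
 *
 *      const struct iphdr *iph = ip_hdr(skb);
 *      int err = ip_route_input_common(skb, iph->daddr, iph->saddr,
 *                                      iph->tos, skb->dev, true);
 *
 * and drops the packet on a non-zero return.  With noref == true the skb
 * only borrows the cached dst (dst_use_noref/skb_dst_set_noref) instead
 * of taking a reference.
 */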
2436
2437 /* called with rcu_read_lock() */
2438 static struct rtable *__mkroute_output(const struct fib_result *res,
2439                                        const struct flowi4 *fl4,
2440                                        __be32 orig_daddr, __be32 orig_saddr,
2441                                        int orig_oif, __u8 orig_rtos,
2442                                        struct net_device *dev_out,
2443                                        unsigned int flags)
2444 {
2445         struct fib_info *fi = res->fi;
2446         struct in_device *in_dev;
2447         u16 type = res->type;
2448         struct rtable *rth;
2449
2450         in_dev = __in_dev_get_rcu(dev_out);
2451         if (!in_dev)
2452                 return ERR_PTR(-EINVAL);
2453
2454         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2455                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2456                         return ERR_PTR(-EINVAL);
2457
2458         if (ipv4_is_lbcast(fl4->daddr))
2459                 type = RTN_BROADCAST;
2460         else if (ipv4_is_multicast(fl4->daddr))
2461                 type = RTN_MULTICAST;
2462         else if (ipv4_is_zeronet(fl4->daddr))
2463                 return ERR_PTR(-EINVAL);
2464
2465         if (dev_out->flags & IFF_LOOPBACK)
2466                 flags |= RTCF_LOCAL;
2467
2468         if (type == RTN_BROADCAST) {
2469                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2470                 fi = NULL;
2471         } else if (type == RTN_MULTICAST) {
2472                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2473                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2474                                      fl4->flowi4_proto))
2475                         flags &= ~RTCF_LOCAL;
2476                 /* If a multicast route does not exist, use the
2477                  * default one, but do not gateway in this case.
2478                  * Yes, it is a hack.
2479                  */
2480                 if (fi && res->prefixlen < 4)
2481                         fi = NULL;
2482         }
2483
2484         rth = rt_dst_alloc(dev_out,
2485                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2486                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2487         if (!rth)
2488                 return ERR_PTR(-ENOBUFS);
2489
2490         rth->dst.output = ip_output;
2491
2492         rth->rt_key_dst = orig_daddr;
2493         rth->rt_key_src = orig_saddr;
2494         rth->rt_genid = rt_genid(dev_net(dev_out));
2495         rth->rt_flags   = flags;
2496         rth->rt_type    = type;
2497         rth->rt_key_tos = orig_rtos;
2498         rth->rt_dst     = fl4->daddr;
2499         rth->rt_src     = fl4->saddr;
2500         rth->rt_route_iif = 0;
2501         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2502         rth->rt_oif     = orig_oif;
2503         rth->rt_mark    = fl4->flowi4_mark;
2504         rth->rt_pmtu    = 0;
2505         rth->rt_gateway = fl4->daddr;
2506         rth->fi = NULL;
2507
2508         RT_CACHE_STAT_INC(out_slow_tot);
2509
2510         if (flags & RTCF_LOCAL)
2511                 rth->dst.input = ip_local_deliver;
2512         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2513                 if (flags & RTCF_LOCAL &&
2514                     !(dev_out->flags & IFF_LOOPBACK)) {
2515                         rth->dst.output = ip_mc_output;
2516                         RT_CACHE_STAT_INC(out_slow_mc);
2517                 }
2518 #ifdef CONFIG_IP_MROUTE
2519                 if (type == RTN_MULTICAST) {
2520                         if (IN_DEV_MFORWARD(in_dev) &&
2521                             !ipv4_is_local_multicast(fl4->daddr)) {
2522                                 rth->dst.input = ip_mr_input;
2523                                 rth->dst.output = ip_mc_output;
2524                         }
2525                 }
2526 #endif
2527         }
2528
2529         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2530
2531         if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE)
2532                 rth->dst.flags |= DST_NOCACHE;
2533
2534         return rth;
2535 }
2536
2537 /*
2538  * Major route resolver routine.
2539  * called with rcu_read_lock();
2540  */
2541
2542 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2543 {
2544         struct net_device *dev_out = NULL;
2545         __u8 tos = RT_FL_TOS(fl4);
2546         unsigned int flags = 0;
2547         struct fib_result res;
2548         struct rtable *rth;
2549         __be32 orig_daddr;
2550         __be32 orig_saddr;
2551         int orig_oif;
2552
2553         res.tclassid    = 0;
2554         res.fi          = NULL;
2555         res.table       = NULL;
2556
2557         orig_daddr = fl4->daddr;
2558         orig_saddr = fl4->saddr;
2559         orig_oif = fl4->flowi4_oif;
2560
2561         fl4->flowi4_iif = net->loopback_dev->ifindex;
2562         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2563         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2564                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2565
2566         rcu_read_lock();
2567         if (fl4->saddr) {
2568                 rth = ERR_PTR(-EINVAL);
2569                 if (ipv4_is_multicast(fl4->saddr) ||
2570                     ipv4_is_lbcast(fl4->saddr) ||
2571                     ipv4_is_zeronet(fl4->saddr))
2572                         goto out;
2573
2574                 /* I removed the check for oif == dev_out->oif here.
2575                    It was wrong for two reasons:
2576                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2577                       is assigned to multiple interfaces.
2578                    2. Moreover, we are allowed to send packets with the saddr
2579                       of another iface. --ANK
2580                  */
2581
2582                 if (fl4->flowi4_oif == 0 &&
2583                     (ipv4_is_multicast(fl4->daddr) ||
2584                      ipv4_is_lbcast(fl4->daddr))) {
2585                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2586                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2587                         if (dev_out == NULL)
2588                                 goto out;
2589
2590                         /* Special hack: the user can direct multicasts
2591                            and limited broadcast via the necessary interface
2592                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2593                            This hack is not just for fun, it allows
2594                            vic, vat and friends to work.
2595                            They bind a socket to loopback, set ttl to zero
2596                            and expect that it will work.
2597                            From the viewpoint of the routing cache they are broken,
2598                            because we are not allowed to build a multicast path
2599                            with a loopback source addr (look, the routing cache
2600                            cannot know that ttl is zero, so the packet
2601                            will not leave this host and the route is valid).
2602                            Luckily, this hack is a good workaround.
2603                          */
2604
2605                         fl4->flowi4_oif = dev_out->ifindex;
2606                         goto make_route;
2607                 }
2608
2609                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2610                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2611                         if (!__ip_dev_find(net, fl4->saddr, false))
2612                                 goto out;
2613                 }
2614         }
2615
2616
2617         if (fl4->flowi4_oif) {
2618                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2619                 rth = ERR_PTR(-ENODEV);
2620                 if (dev_out == NULL)
2621                         goto out;
2622
2623                 /* RACE: Check return value of inet_select_addr instead. */
2624                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2625                         rth = ERR_PTR(-ENETUNREACH);
2626                         goto out;
2627                 }
2628                 if (ipv4_is_local_multicast(fl4->daddr) ||
2629                     ipv4_is_lbcast(fl4->daddr)) {
2630                         if (!fl4->saddr)
2631                                 fl4->saddr = inet_select_addr(dev_out, 0,
2632                                                               RT_SCOPE_LINK);
2633                         goto make_route;
2634                 }
2635                 if (fl4->saddr) {
2636                         if (ipv4_is_multicast(fl4->daddr))
2637                                 fl4->saddr = inet_select_addr(dev_out, 0,
2638                                                               fl4->flowi4_scope);
2639                         else if (!fl4->daddr)
2640                                 fl4->saddr = inet_select_addr(dev_out, 0,
2641                                                               RT_SCOPE_HOST);
2642                 }
2643         }
2644
2645         if (!fl4->daddr) {
2646                 fl4->daddr = fl4->saddr;
2647                 if (!fl4->daddr)
2648                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2649                 dev_out = net->loopback_dev;
2650                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2651                 res.type = RTN_LOCAL;
2652                 flags |= RTCF_LOCAL;
2653                 goto make_route;
2654         }
2655
2656         if (fib_lookup(net, fl4, &res)) {
2657                 res.fi = NULL;
2658                 res.table = NULL;
2659                 if (fl4->flowi4_oif) {
2660                         /* Apparently, the routing tables are wrong. Assume
2661                            that the destination is on-link.
2662
2663                            WHY? DW.
2664                            Because we are allowed to send to an iface
2665                            even if it has NO routes and NO assigned
2666                            addresses. When oif is specified, the routing
2667                            tables are looked up with only one purpose:
2668                            to catch whether the destination is gatewayed rather
2669                            than direct. Moreover, if MSG_DONTROUTE is set,
2670                            we send the packet ignoring both routing tables
2671                            and ifaddr state. --ANK
2672
2673
2674                            We could do this even if oif is unknown,
2675                            likely IPv6, but we do not.
2676                          */
2677
2678                         if (fl4->saddr == 0)
2679                                 fl4->saddr = inet_select_addr(dev_out, 0,
2680                                                               RT_SCOPE_LINK);
2681                         res.type = RTN_UNICAST;
2682                         goto make_route;
2683                 }
2684                 rth = ERR_PTR(-ENETUNREACH);
2685                 goto out;
2686         }
2687
2688         if (res.type == RTN_LOCAL) {
2689                 if (!fl4->saddr) {
2690                         if (res.fi->fib_prefsrc)
2691                                 fl4->saddr = res.fi->fib_prefsrc;
2692                         else
2693                                 fl4->saddr = fl4->daddr;
2694                 }
2695                 dev_out = net->loopback_dev;
2696                 fl4->flowi4_oif = dev_out->ifindex;
2697                 res.fi = NULL;
2698                 flags |= RTCF_LOCAL;
2699                 goto make_route;
2700         }
2701
2702 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2703         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2704                 fib_select_multipath(&res);
2705         else
2706 #endif
2707         if (!res.prefixlen &&
2708             res.table->tb_num_default > 1 &&
2709             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2710                 fib_select_default(&res);
2711
2712         if (!fl4->saddr)
2713                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2714
2715         dev_out = FIB_RES_DEV(res);
2716         fl4->flowi4_oif = dev_out->ifindex;
2717
2718
2719 make_route:
2720         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2721                                tos, dev_out, flags);
2722         if (!IS_ERR(rth)) {
2723                 unsigned int hash;
2724
2725                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2726                                rt_genid(dev_net(dev_out)));
2727                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2728         }
2729
2730 out:
2731         rcu_read_unlock();
2732         return rth;
2733 }
2734
2735 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2736 {
2737         struct rtable *rth;
2738         unsigned int hash;
2739
2740         if (!rt_caching(net))
2741                 goto slow_output;
2742
2743         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2744
2745         rcu_read_lock_bh();
2746         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2747                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2748                 if (rth->rt_key_dst == flp4->daddr &&
2749                     rth->rt_key_src == flp4->saddr &&
2750                     rt_is_output_route(rth) &&
2751                     rth->rt_oif == flp4->flowi4_oif &&
2752                     rth->rt_mark == flp4->flowi4_mark &&
2753                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2754                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2755                     net_eq(dev_net(rth->dst.dev), net) &&
2756                     !rt_is_expired(rth)) {
2757                         dst_use(&rth->dst, jiffies);
2758                         RT_CACHE_STAT_INC(out_hit);
2759                         rcu_read_unlock_bh();
2760                         if (!flp4->saddr)
2761                                 flp4->saddr = rth->rt_src;
2762                         if (!flp4->daddr)
2763                                 flp4->daddr = rth->rt_dst;
2764                         return rth;
2765                 }
2766                 RT_CACHE_STAT_INC(out_hlist_search);
2767         }
2768         rcu_read_unlock_bh();
2769
2770 slow_output:
2771         return ip_route_output_slow(net, flp4);
2772 }
2773 EXPORT_SYMBOL_GPL(__ip_route_output_key);
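/* A minimal usage sketch (illustrative; daddr and tos are assumed caller
 * variables): callers fill a flowi4 key, look the route up, and release
 * it when done:
 *
 *      struct flowi4 fl4 = {
 *              .daddr      = daddr,
 *              .flowi4_oif = 0,
 *              .flowi4_tos = RT_TOS(tos),
 *      };
 *      struct rtable *rt = __ip_route_output_key(net, &fl4);
 *
 *      if (!IS_ERR(rt)) {
 *              ... use rt->dst ...
 *              ip_rt_put(rt);
 *      }
 *
 * If fl4.saddr was left zero, the chosen source address is written back
 * into the key by the lookup.
 */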
2774
2775 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2776 {
2777         return NULL;
2778 }
2779
2780 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2781 {
2782         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2783
2784         return mtu ? : dst->dev->mtu;
2785 }
2786
2787 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2788                                           struct sk_buff *skb, u32 mtu)
2789 {
2790 }
2791
2792 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2793                                        struct sk_buff *skb)
2794 {
2795 }
2796
2797 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2798                                           unsigned long old)
2799 {
2800         return NULL;
2801 }
2802
2803 static struct dst_ops ipv4_dst_blackhole_ops = {
2804         .family                 =       AF_INET,
2805         .protocol               =       cpu_to_be16(ETH_P_IP),
2806         .destroy                =       ipv4_dst_destroy,
2807         .check                  =       ipv4_blackhole_dst_check,
2808         .mtu                    =       ipv4_blackhole_mtu,
2809         .default_advmss         =       ipv4_default_advmss,
2810         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2811         .redirect               =       ipv4_rt_blackhole_redirect,
2812         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2813         .neigh_lookup           =       ipv4_neigh_lookup,
2814 };
2815
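/* Build a "blackhole" copy of dst_orig: the new entry keeps the original
 * routing keys, flags and gateway, but its input and output handlers are
 * dst_discard, so anything routed through it is silently dropped.
 * dst_orig is released before returning.
 */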
2816 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2817 {
2818         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2819         struct rtable *ort = (struct rtable *) dst_orig;
2820
2821         if (rt) {
2822                 struct dst_entry *new = &rt->dst;
2823
2824                 new->__use = 1;
2825                 new->input = dst_discard;
2826                 new->output = dst_discard;
2827
2828                 new->dev = ort->dst.dev;
2829                 if (new->dev)
2830                         dev_hold(new->dev);
2831
2832                 rt->rt_key_dst = ort->rt_key_dst;
2833                 rt->rt_key_src = ort->rt_key_src;
2834                 rt->rt_key_tos = ort->rt_key_tos;
2835                 rt->rt_route_iif = ort->rt_route_iif;
2836                 rt->rt_iif = ort->rt_iif;
2837                 rt->rt_oif = ort->rt_oif;
2838                 rt->rt_mark = ort->rt_mark;
2839                 rt->rt_pmtu = ort->rt_pmtu;
2840
2841                 rt->rt_genid = rt_genid(net);
2842                 rt->rt_flags = ort->rt_flags;
2843                 rt->rt_type = ort->rt_type;
2844                 rt->rt_dst = ort->rt_dst;
2845                 rt->rt_src = ort->rt_src;
2846                 rt->rt_gateway = ort->rt_gateway;
2847                 rt->fi = ort->fi;
2848                 if (rt->fi)
2849                         atomic_inc(&rt->fi->fib_clntref);
2850
2851                 dst_free(new);
2852         }
2853
2854         dst_release(dst_orig);
2855
2856         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2857 }
2858
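/* Like __ip_route_output_key(), but when the flow carries a protocol the
 * result is additionally passed through xfrm_lookup() so that any matching
 * transformation (IPsec) policy is applied to the returned dst.
 */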
2859 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2860                                     struct sock *sk)
2861 {
2862         struct rtable *rt = __ip_route_output_key(net, flp4);
2863
2864         if (IS_ERR(rt))
2865                 return rt;
2866
2867         if (flp4->flowi4_proto)
2868                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2869                                                    flowi4_to_flowi(flp4),
2870                                                    sk, 0);
2871
2872         return rt;
2873 }
2874 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2875
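/* Fill a netlink routing message describing the cache entry attached to
 * skb: address keys, oif/iif, gateway, metrics, mark and cacheinfo
 * (expiry and error) are emitted as rtnetlink attributes.
 */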
2876 static int rt_fill_info(struct net *net,
2877                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2878                         int nowait, unsigned int flags)
2879 {
2880         struct rtable *rt = skb_rtable(skb);
2881         struct rtmsg *r;
2882         struct nlmsghdr *nlh;
2883         unsigned long expires = 0;
2884         u32 error;
2885
2886         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2887         if (nlh == NULL)
2888                 return -EMSGSIZE;
2889
2890         r = nlmsg_data(nlh);
2891         r->rtm_family    = AF_INET;
2892         r->rtm_dst_len  = 32;
2893         r->rtm_src_len  = 0;
2894         r->rtm_tos      = rt->rt_key_tos;
2895         r->rtm_table    = RT_TABLE_MAIN;
2896         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2897                 goto nla_put_failure;
2898         r->rtm_type     = rt->rt_type;
2899         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2900         r->rtm_protocol = RTPROT_UNSPEC;
2901         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2902         if (rt->rt_flags & RTCF_NOTIFY)
2903                 r->rtm_flags |= RTM_F_NOTIFY;
2904
2905         if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
2906                 goto nla_put_failure;
2907         if (rt->rt_key_src) {
2908                 r->rtm_src_len = 32;
2909                 if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
2910                         goto nla_put_failure;
2911         }
2912         if (rt->dst.dev &&
2913             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2914                 goto nla_put_failure;
2915 #ifdef CONFIG_IP_ROUTE_CLASSID
2916         if (rt->dst.tclassid &&
2917             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2918                 goto nla_put_failure;
2919 #endif
2920         if (!rt_is_input_route(rt) &&
2921             rt->rt_src != rt->rt_key_src) {
2922                 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
2923                         goto nla_put_failure;
2924         }
2925         if (rt->rt_dst != rt->rt_gateway &&
2926             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2927                 goto nla_put_failure;
2928
2929         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2930                 goto nla_put_failure;
2931
2932         if (rt->rt_mark &&
2933             nla_put_be32(skb, RTA_MARK, rt->rt_mark))
2934                 goto nla_put_failure;
2935
2936         error = rt->dst.error;
2937         expires = rt->dst.expires;
2938         if (expires) {
2939                 if (time_before(jiffies, expires))
2940                         expires -= jiffies;
2941                 else
2942                         expires = 0;
2943         }
2944
2945         if (rt_is_input_route(rt)) {
2946 #ifdef CONFIG_IP_MROUTE
2947                 __be32 dst = rt->rt_dst;
2948
2949                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2950                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2951                         int err = ipmr_get_route(net, skb,
2952                                                  rt->rt_src, rt->rt_dst,
2953                                                  r, nowait);
2954                         if (err <= 0) {
2955                                 if (!nowait) {
2956                                         if (err == 0)
2957                                                 return 0;
2958                                         goto nla_put_failure;
2959                                 } else {
2960                                         if (err == -EMSGSIZE)
2961                                                 goto nla_put_failure;
2962                                         error = err;
2963                                 }
2964                         }
2965                 } else
2966 #endif
2967                         if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2968                                 goto nla_put_failure;
2969         }
2970
2971         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2972                 goto nla_put_failure;
2973
2974         return nlmsg_end(skb, nlh);
2975
2976 nla_put_failure:
2977         nlmsg_cancel(skb, nlh);
2978         return -EMSGSIZE;
2979 }
2980
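/*
 * Illustrative userspace sketch: walking the attributes that rt_fill_info()
 * above places in an RTM_NEWROUTE reply.  This is a hypothetical standalone
 * fragment, not part of route.c; "nlh" is assumed to point at one complete
 * message read from a NETLINK_ROUTE socket.
 */
#include <stdio.h>
#include <arpa/inet.h>
#include <linux/rtnetlink.h>

static void example_print_route_reply(struct nlmsghdr *nlh)
{
        struct rtmsg *r = NLMSG_DATA(nlh);
        struct rtattr *rta = RTM_RTA(r);
        int len = RTM_PAYLOAD(nlh);
        char buf[INET_ADDRSTRLEN];

        for (; RTA_OK(rta, len); rta = RTA_NEXT(rta, len)) {
                switch (rta->rta_type) {
                case RTA_DST:
                case RTA_GATEWAY:
                case RTA_PREFSRC:
                        inet_ntop(AF_INET, RTA_DATA(rta), buf, sizeof(buf));
                        printf("attr %u: %s\n", (unsigned int)rta->rta_type, buf);
                        break;
                case RTA_OIF:
                        printf("oif: %u\n", *(unsigned int *)RTA_DATA(rta));
                        break;
                default:
                        break;
                }
        }
}
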
2981 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2982 {
2983         struct net *net = sock_net(in_skb->sk);
2984         struct rtmsg *rtm;
2985         struct nlattr *tb[RTA_MAX+1];
2986         struct rtable *rt = NULL;
2987         __be32 dst = 0;
2988         __be32 src = 0;
2989         u32 iif;
2990         int err;
2991         int mark;
2992         struct sk_buff *skb;
2993
2994         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2995         if (err < 0)
2996                 goto errout;
2997
2998         rtm = nlmsg_data(nlh);
2999
3000         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3001         if (skb == NULL) {
3002                 err = -ENOBUFS;
3003                 goto errout;
3004         }
3005
3006         /* Reserve room for dummy headers; this skb can pass
3007          * through a good chunk of the routing engine.
3008          */
3009         skb_reset_mac_header(skb);
3010         skb_reset_network_header(skb);
3011
3012         /* Bugfix: give ip_route_input() enough of an IP header that it does not choke. */
3013         ip_hdr(skb)->protocol = IPPROTO_ICMP;
3014         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3015
3016         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3017         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3018         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3019         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3020
3021         if (iif) {
3022                 struct net_device *dev;
3023
3024                 dev = __dev_get_by_index(net, iif);
3025                 if (dev == NULL) {
3026                         err = -ENODEV;
3027                         goto errout_free;
3028                 }
3029
3030                 skb->protocol   = htons(ETH_P_IP);
3031                 skb->dev        = dev;
3032                 skb->mark       = mark;
3033                 local_bh_disable();
3034                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3035                 local_bh_enable();
3036
3037                 rt = skb_rtable(skb);
3038                 if (err == 0 && rt->dst.error)
3039                         err = -rt->dst.error;
3040         } else {
3041                 struct flowi4 fl4 = {
3042                         .daddr = dst,
3043                         .saddr = src,
3044                         .flowi4_tos = rtm->rtm_tos,
3045                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3046                         .flowi4_mark = mark,
3047                 };
3048                 rt = ip_route_output_key(net, &fl4);
3049
3050                 err = 0;
3051                 if (IS_ERR(rt))
3052                         err = PTR_ERR(rt);
3053         }
3054
3055         if (err)
3056                 goto errout_free;
3057
3058         skb_dst_set(skb, &rt->dst);
3059         if (rtm->rtm_flags & RTM_F_NOTIFY)
3060                 rt->rt_flags |= RTCF_NOTIFY;
3061
3062         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3063                            RTM_NEWROUTE, 0, 0);
3064         if (err <= 0)
3065                 goto errout_free;
3066
3067         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3068 errout:
3069         return err;
3070
3071 errout_free:
3072         kfree_skb(skb);
3073         goto errout;
3074 }
3075
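/*
 * Illustrative userspace sketch: the kind of RTM_GETROUTE request that
 * inet_rtm_getroute() above answers (roughly what "ip route get ADDR"
 * sends).  A hypothetical standalone fragment, not part of route.c; the
 * reply would be parsed as in the sketch after rt_fill_info().
 */
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

static int example_route_get(const char *dst_str)
{
        struct {
                struct nlmsghdr nlh;
                struct rtmsg rtm;
                char attrs[64];
        } req;
        struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
        struct rtattr *rta;
        int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

        if (fd < 0)
                return -1;

        memset(&req, 0, sizeof(req));
        req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
        req.nlh.nlmsg_type = RTM_GETROUTE;
        req.nlh.nlmsg_flags = NLM_F_REQUEST;
        req.rtm.rtm_family = AF_INET;

        /* Append an RTA_DST attribute carrying the destination address. */
        rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
        rta->rta_type = RTA_DST;
        rta->rta_len = RTA_LENGTH(4);
        inet_pton(AF_INET, dst_str, RTA_DATA(rta));
        req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + RTA_LENGTH(4);

        if (sendto(fd, &req, req.nlh.nlmsg_len, 0,
                   (struct sockaddr *)&sa, sizeof(sa)) < 0) {
                close(fd);
                return -1;
        }
        /* recv() the RTM_NEWROUTE reply here and parse its attributes. */
        close(fd);
        return 0;
}
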
3076 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3077 {
3078         struct rtable *rt;
3079         int h, s_h;
3080         int idx, s_idx;
3081         struct net *net;
3082
3083         net = sock_net(skb->sk);
3084
3085         s_h = cb->args[0];
3086         if (s_h < 0)
3087                 s_h = 0;
3088         s_idx = idx = cb->args[1];
3089         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3090                 if (!rt_hash_table[h].chain)
3091                         continue;
3092                 rcu_read_lock_bh();
3093                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3094                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3095                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3096                                 continue;
3097                         if (rt_is_expired(rt))
3098                                 continue;
3099                         skb_dst_set_noref(skb, &rt->dst);
3100                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3101                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3102                                          1, NLM_F_MULTI) <= 0) {
3103                                 skb_dst_drop(skb);
3104                                 rcu_read_unlock_bh();
3105                                 goto done;
3106                         }
3107                         skb_dst_drop(skb);
3108                 }
3109                 rcu_read_unlock_bh();
3110         }
3111
3112 done:
3113         cb->args[0] = h;
3114         cb->args[1] = idx;
3115         return skb->len;
3116 }
3117
3118 void ip_rt_multicast_event(struct in_device *in_dev)
3119 {
3120         rt_cache_flush(dev_net(in_dev->dev), 0);
3121 }
3122
3123 #ifdef CONFIG_SYSCTL
3124 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3125                                         void __user *buffer,
3126                                         size_t *lenp, loff_t *ppos)
3127 {
3128         if (write) {
3129                 int flush_delay;
3130                 ctl_table ctl;
3131                 struct net *net;
3132
3133                 memcpy(&ctl, __ctl, sizeof(ctl));
3134                 ctl.data = &flush_delay;
3135                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3136
3137                 net = (struct net *)__ctl->extra1;
3138                 rt_cache_flush(net, flush_delay);
3139                 return 0;
3140         }
3141
3142         return -EINVAL;
3143 }
3144
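/*
 * Illustrative userspace sketch: triggering the handler above by writing a
 * delay value to the write-only flush file, equivalent to
 * "echo 0 > /proc/sys/net/ipv4/route/flush".  A hypothetical standalone
 * fragment, not part of route.c.
 */
#include <fcntl.h>
#include <unistd.h>

static int example_flush_route_cache(void)
{
        int fd = open("/proc/sys/net/ipv4/route/flush", O_WRONLY);

        if (fd < 0)
                return -1;
        /* The value written becomes flush_delay in ipv4_sysctl_rtcache_flush(). */
        if (write(fd, "0\n", 2) < 0) {
                close(fd);
                return -1;
        }
        return close(fd);
}
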
3145 static ctl_table ipv4_route_table[] = {
3146         {
3147                 .procname       = "gc_thresh",
3148                 .data           = &ipv4_dst_ops.gc_thresh,
3149                 .maxlen         = sizeof(int),
3150                 .mode           = 0644,
3151                 .proc_handler   = proc_dointvec,
3152         },
3153         {
3154                 .procname       = "max_size",
3155                 .data           = &ip_rt_max_size,
3156                 .maxlen         = sizeof(int),
3157                 .mode           = 0644,
3158                 .proc_handler   = proc_dointvec,
3159         },
3160         {
3161                 /* Deprecated. Use gc_min_interval_ms */
3162
3163                 .procname       = "gc_min_interval",
3164                 .data           = &ip_rt_gc_min_interval,
3165                 .maxlen         = sizeof(int),
3166                 .mode           = 0644,
3167                 .proc_handler   = proc_dointvec_jiffies,
3168         },
3169         {
3170                 .procname       = "gc_min_interval_ms",
3171                 .data           = &ip_rt_gc_min_interval,
3172                 .maxlen         = sizeof(int),
3173                 .mode           = 0644,
3174                 .proc_handler   = proc_dointvec_ms_jiffies,
3175         },
3176         {
3177                 .procname       = "gc_timeout",
3178                 .data           = &ip_rt_gc_timeout,
3179                 .maxlen         = sizeof(int),
3180                 .mode           = 0644,
3181                 .proc_handler   = proc_dointvec_jiffies,
3182         },
3183         {
3184                 .procname       = "gc_interval",
3185                 .data           = &ip_rt_gc_interval,
3186                 .maxlen         = sizeof(int),
3187                 .mode           = 0644,
3188                 .proc_handler   = proc_dointvec_jiffies,
3189         },
3190         {
3191                 .procname       = "redirect_load",
3192                 .data           = &ip_rt_redirect_load,
3193                 .maxlen         = sizeof(int),
3194                 .mode           = 0644,
3195                 .proc_handler   = proc_dointvec,
3196         },
3197         {
3198                 .procname       = "redirect_number",
3199                 .data           = &ip_rt_redirect_number,
3200                 .maxlen         = sizeof(int),
3201                 .mode           = 0644,
3202                 .proc_handler   = proc_dointvec,
3203         },
3204         {
3205                 .procname       = "redirect_silence",
3206                 .data           = &ip_rt_redirect_silence,
3207                 .maxlen         = sizeof(int),
3208                 .mode           = 0644,
3209                 .proc_handler   = proc_dointvec,
3210         },
3211         {
3212                 .procname       = "error_cost",
3213                 .data           = &ip_rt_error_cost,
3214                 .maxlen         = sizeof(int),
3215                 .mode           = 0644,
3216                 .proc_handler   = proc_dointvec,
3217         },
3218         {
3219                 .procname       = "error_burst",
3220                 .data           = &ip_rt_error_burst,
3221                 .maxlen         = sizeof(int),
3222                 .mode           = 0644,
3223                 .proc_handler   = proc_dointvec,
3224         },
3225         {
3226                 .procname       = "gc_elasticity",
3227                 .data           = &ip_rt_gc_elasticity,
3228                 .maxlen         = sizeof(int),
3229                 .mode           = 0644,
3230                 .proc_handler   = proc_dointvec,
3231         },
3232         {
3233                 .procname       = "mtu_expires",
3234                 .data           = &ip_rt_mtu_expires,
3235                 .maxlen         = sizeof(int),
3236                 .mode           = 0644,
3237                 .proc_handler   = proc_dointvec_jiffies,
3238         },
3239         {
3240                 .procname       = "min_pmtu",
3241                 .data           = &ip_rt_min_pmtu,
3242                 .maxlen         = sizeof(int),
3243                 .mode           = 0644,
3244                 .proc_handler   = proc_dointvec,
3245         },
3246         {
3247                 .procname       = "min_adv_mss",
3248                 .data           = &ip_rt_min_advmss,
3249                 .maxlen         = sizeof(int),
3250                 .mode           = 0644,
3251                 .proc_handler   = proc_dointvec,
3252         },
3253         { }
3254 };
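
/*
 * The tunables above appear under /proc/sys/net/ipv4/route/; for the
 * initial namespace the table is registered by ip_static_sysctl_init()
 * at the bottom of this file.
 */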
3255
3256 static struct ctl_table ipv4_route_flush_table[] = {
3257         {
3258                 .procname       = "flush",
3259                 .maxlen         = sizeof(int),
3260                 .mode           = 0200,
3261                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3262         },
3263         { },
3264 };
3265
3266 static __net_init int sysctl_route_net_init(struct net *net)
3267 {
3268         struct ctl_table *tbl;
3269
3270         tbl = ipv4_route_flush_table;
3271         if (!net_eq(net, &init_net)) {
3272                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3273                 if (tbl == NULL)
3274                         goto err_dup;
3275         }
3276         tbl[0].extra1 = net;
3277
3278         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3279         if (net->ipv4.route_hdr == NULL)
3280                 goto err_reg;
3281         return 0;
3282
3283 err_reg:
3284         if (tbl != ipv4_route_flush_table)
3285                 kfree(tbl);
3286 err_dup:
3287         return -ENOMEM;
3288 }
3289
3290 static __net_exit void sysctl_route_net_exit(struct net *net)
3291 {
3292         struct ctl_table *tbl;
3293
3294         tbl = net->ipv4.route_hdr->ctl_table_arg;
3295         unregister_net_sysctl_table(net->ipv4.route_hdr);
3296         BUG_ON(tbl == ipv4_route_flush_table);
3297         kfree(tbl);
3298 }
3299
3300 static __net_initdata struct pernet_operations sysctl_route_ops = {
3301         .init = sysctl_route_net_init,
3302         .exit = sysctl_route_net_exit,
3303 };
3304 #endif
3305
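/*
 * rt_genid is the per-namespace generation number that rt_is_expired()
 * compares cached routes against; seeding it with random bytes at
 * namespace creation gives every namespace its own cache generation.
 */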
3306 static __net_init int rt_genid_init(struct net *net)
3307 {
3308         get_random_bytes(&net->ipv4.rt_genid,
3309                          sizeof(net->ipv4.rt_genid));
3310         get_random_bytes(&net->ipv4.dev_addr_genid,
3311                          sizeof(net->ipv4.dev_addr_genid));
3312         return 0;
3313 }
3314
3315 static __net_initdata struct pernet_operations rt_genid_ops = {
3316         .init = rt_genid_init,
3317 };
3318
3319 static int __net_init ipv4_inetpeer_init(struct net *net)
3320 {
3321         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3322
3323         if (!bp)
3324                 return -ENOMEM;
3325         inet_peer_base_init(bp);
3326         net->ipv4.peers = bp;
3327         return 0;
3328 }
3329
3330 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3331 {
3332         struct inet_peer_base *bp = net->ipv4.peers;
3333
3334         net->ipv4.peers = NULL;
3335         inetpeer_invalidate_tree(bp);
3336         kfree(bp);
3337 }
3338
3339 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3340         .init   =       ipv4_inetpeer_init,
3341         .exit   =       ipv4_inetpeer_exit,
3342 };
3343
3344 #ifdef CONFIG_IP_ROUTE_CLASSID
3345 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3346 #endif /* CONFIG_IP_ROUTE_CLASSID */
3347
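/*
 * "rhash_entries=N" on the kernel command line overrides the automatic
 * sizing of the route cache hash table performed in ip_rt_init() below.
 */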
3348 static __initdata unsigned long rhash_entries;
3349 static int __init set_rhash_entries(char *str)
3350 {
3351         ssize_t ret;
3352
3353         if (!str)
3354                 return 0;
3355
3356         ret = kstrtoul(str, 0, &rhash_entries);
3357         if (ret)
3358                 return 0;
3359
3360         return 1;
3361 }
3362 __setup("rhash_entries=", set_rhash_entries);
3363
3364 int __init ip_rt_init(void)
3365 {
3366         int rc = 0;
3367
3368 #ifdef CONFIG_IP_ROUTE_CLASSID
3369         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3370         if (!ip_rt_acct)
3371                 panic("IP: failed to allocate ip_rt_acct\n");
3372 #endif
3373
3374         ipv4_dst_ops.kmem_cachep =
3375                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3376                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3377
3378         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3379
3380         if (dst_entries_init(&ipv4_dst_ops) < 0)
3381                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3382
3383         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3384                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3385
3386         rt_hash_table = (struct rt_hash_bucket *)
3387                 alloc_large_system_hash("IP route cache",
3388                                         sizeof(struct rt_hash_bucket),
3389                                         rhash_entries,
3390                                         (totalram_pages >= 128 * 1024) ?
3391                                         15 : 17,
3392                                         0,
3393                                         &rt_hash_log,
3394                                         &rt_hash_mask,
3395                                         0,
3396                                         rhash_entries ? 0 : 512 * 1024);
3397         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3398         rt_hash_lock_init();
3399
3400         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3401         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3402
3403         devinet_init();
3404         ip_fib_init();
3405
3406         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3407         expires_ljiffies = jiffies;
3408         schedule_delayed_work(&expires_work,
3409                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3410
3411         if (ip_rt_proc_init())
3412                 pr_err("Unable to create route proc files\n");
3413 #ifdef CONFIG_XFRM
3414         xfrm_init();
3415         xfrm4_init(ip_rt_max_size);
3416 #endif
3417         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3418
3419 #ifdef CONFIG_SYSCTL
3420         register_pernet_subsys(&sysctl_route_ops);
3421 #endif
3422         register_pernet_subsys(&rt_genid_ops);
3423         register_pernet_subsys(&ipv4_inetpeer_ops);
3424         return rc;
3425 }
3426
3427 #ifdef CONFIG_SYSCTL
3428 /*
3429  * We really need to sanitize the IPv4 init order; once that is done,
3430  * this workaround can go away.
3431  */
3432 void __init ip_static_sysctl_init(void)
3433 {
3434         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3435 }
3436 #endif