net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #define pr_fmt(fmt) "IPv4: " fmt
  66
  67 #include <linux/module.h>
  68 #include <asm/uaccess.h>
  69 #include <linux/bitops.h>
  70 #include <linux/types.h>
  71 #include <linux/kernel.h>
  72 #include <linux/mm.h>
  73 #include <linux/bootmem.h>
  74 #include <linux/string.h>
  75 #include <linux/socket.h>
  76 #include <linux/sockios.h>
  77 #include <linux/errno.h>
  78 #include <linux/in.h>
  79 #include <linux/inet.h>
  80 #include <linux/netdevice.h>
  81 #include <linux/proc_fs.h>
  82 #include <linux/init.h>
  83 #include <linux/workqueue.h>
  84 #include <linux/skbuff.h>
  85 #include <linux/inetdevice.h>
  86 #include <linux/igmp.h>
  87 #include <linux/pkt_sched.h>
  88 #include <linux/mroute.h>
  89 #include <linux/netfilter_ipv4.h>
  90 #include <linux/random.h>
  91 #include <linux/jhash.h>
  92 #include <linux/rcupdate.h>
  93 #include <linux/times.h>
  94 #include <linux/slab.h>
  95 #include <linux/prefetch.h>
  96 #include <net/dst.h>
  97 #include <net/net_namespace.h>
  98 #include <net/protocol.h>
  99 #include <net/ip.h>
 100 #include <net/route.h>
 101 #include <net/inetpeer.h>
 102 #include <net/sock.h>
 103 #include <net/ip_fib.h>
 104 #include <net/arp.h>
 105 #include <net/tcp.h>
 106 #include <net/icmp.h>
 107 #include <net/xfrm.h>
 108 #include <net/netevent.h>
 109 #include <net/rtnetlink.h>
 110 #ifdef CONFIG_SYSCTL
 111 #include <linux/sysctl.h>
 112 #include <linux/kmemleak.h>
 113 #endif
 114 #include <net/secure_seq.h>
 115
 116 #define RT_FL_TOS(oldflp4) \
 117         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 118
 119 #define IP_MAX_MTU      0xFFF0
 120
 121 #define RT_GC_TIMEOUT (300*HZ)
 122
 123 static int ip_rt_max_size;
 124 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 125 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
 126 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
 127 static int ip_rt_redirect_number __read_mostly  = 9;
 128 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 129 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 130 static int ip_rt_error_cost __read_mostly       = HZ;
 131 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 132 static int ip_rt_gc_elasticity __read_mostly    = 8;
 133 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 134 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 135 static int ip_rt_min_advmss __read_mostly       = 256;
 136 static int rt_chain_length_max __read_mostly    = 20;
 137
 138 static struct delayed_work expires_work;
 139 static unsigned long expires_ljiffies;
 140
 141 /*
 142  *      Interface to generic destination cache.
 143  */
 144
 145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 146 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 147 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 148 static void              ipv4_dst_destroy(struct dst_entry *dst);
 149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 150 static void              ipv4_link_failure(struct sk_buff *skb);
 151 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 152                                            struct sk_buff *skb, u32 mtu);
 153 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 154                                         struct sk_buff *skb);
 155 static int rt_garbage_collect(struct dst_ops *ops);
 156
 157 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 158                             int how)
 159 {
 160 }
 161
 162 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 163 {
 164         WARN_ON(1);
 165         return NULL;
 166 }
 167
 168 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 169                                            struct sk_buff *skb,
 170                                            const void *daddr);
 171
 172 static struct dst_ops ipv4_dst_ops = {
 173         .family =               AF_INET,
 174         .protocol =             cpu_to_be16(ETH_P_IP),
 175         .gc =                   rt_garbage_collect,
 176         .check =                ipv4_dst_check,
 177         .default_advmss =       ipv4_default_advmss,
 178         .mtu =                  ipv4_mtu,
 179         .cow_metrics =          ipv4_cow_metrics,
 180         .destroy =              ipv4_dst_destroy,
 181         .ifdown =               ipv4_dst_ifdown,
 182         .negative_advice =      ipv4_negative_advice,
 183         .link_failure =         ipv4_link_failure,
 184         .update_pmtu =          ip_rt_update_pmtu,
 185         .redirect =             ip_do_redirect,
 186         .local_out =            __ip_local_out,
 187         .neigh_lookup =         ipv4_neigh_lookup,
 188 };
 189
 190 #define ECN_OR_COST(class)      TC_PRIO_##class
 191
 192 const __u8 ip_tos2prio[16] = {
 193         TC_PRIO_BESTEFFORT,
 194         ECN_OR_COST(BESTEFFORT),
 195         TC_PRIO_BESTEFFORT,
 196         ECN_OR_COST(BESTEFFORT),
 197         TC_PRIO_BULK,
 198         ECN_OR_COST(BULK),
 199         TC_PRIO_BULK,
 200         ECN_OR_COST(BULK),
 201         TC_PRIO_INTERACTIVE,
 202         ECN_OR_COST(INTERACTIVE),
 203         TC_PRIO_INTERACTIVE,
 204         ECN_OR_COST(INTERACTIVE),
 205         TC_PRIO_INTERACTIVE_BULK,
 206         ECN_OR_COST(INTERACTIVE_BULK),
 207         TC_PRIO_INTERACTIVE_BULK,
 208         ECN_OR_COST(INTERACTIVE_BULK)
 209 };
 210 EXPORT_SYMBOL(ip_tos2prio);
 211
 212 /*
 213  * Route cache.
 214  */
 215
 216 /* The locking scheme is rather straight forward:
 217  *
 218  * 1) Read-Copy Update protects the buckets of the central route hash.
 219  * 2) Only writers remove entries, and they hold the lock
 220  *    as they look at rtable reference counts.
 221  * 3) Only readers acquire references to rtable entries,
 222  *    they do so with atomic increments and with the
 223  *    lock held.
 224  */
 225
 226 struct rt_hash_bucket {
 227         struct rtable __rcu     *chain;
 228 };
 229
 230 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 231         defined(CONFIG_PROVE_LOCKING)
 232 /*
 233  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 234  * The size of this table is a power of two and depends on the number of CPUS.
 235  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 236  */
 237 #ifdef CONFIG_LOCKDEP
 238 # define RT_HASH_LOCK_SZ        256
 239 #else
 240 # if NR_CPUS >= 32
 241 #  define RT_HASH_LOCK_SZ       4096
 242 # elif NR_CPUS >= 16
 243 #  define RT_HASH_LOCK_SZ       2048
 244 # elif NR_CPUS >= 8
 245 #  define RT_HASH_LOCK_SZ       1024
 246 # elif NR_CPUS >= 4
 247 #  define RT_HASH_LOCK_SZ       512
 248 # else
 249 #  define RT_HASH_LOCK_SZ       256
 250 # endif
 251 #endif
 252
 253 static spinlock_t       *rt_hash_locks;
 254 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 255
 256 static __init void rt_hash_lock_init(void)
 257 {
 258         int i;
 259
 260         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 261                         GFP_KERNEL);
 262         if (!rt_hash_locks)
 263                 panic("IP: failed to allocate rt_hash_locks\n");
 264
 265         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 266                 spin_lock_init(&rt_hash_locks[i]);
 267 }
 268 #else
 269 # define rt_hash_lock_addr(slot) NULL
 270
 271 static inline void rt_hash_lock_init(void)
 272 {
 273 }
 274 #endif
 275
 276 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
 277 static unsigned int             rt_hash_mask __read_mostly;
 278 static unsigned int             rt_hash_log  __read_mostly;
 279
 280 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 281 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 282
 283 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 284                                    int genid)
 285 {
 286         return jhash_3words((__force u32)daddr, (__force u32)saddr,
 287                             idx, genid)
 288                 & rt_hash_mask;
 289 }
 290
 291 static inline int rt_genid(struct net *net)
 292 {
 293         return atomic_read(&net->ipv4.rt_genid);
 294 }
 295
 296 #ifdef CONFIG_PROC_FS
 297 struct rt_cache_iter_state {
 298         struct seq_net_private p;
 299         int bucket;
 300         int genid;
 301 };
 302
 303 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 304 {
 305         struct rt_cache_iter_state *st = seq->private;
 306         struct rtable *r = NULL;
 307
 308         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 309                 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
 310                         continue;
 311                 rcu_read_lock_bh();
 312                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 313                 while (r) {
 314                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
 315                             r->rt_genid == st->genid)
 316                                 return r;
 317                         r = rcu_dereference_bh(r->dst.rt_next);
 318                 }
 319                 rcu_read_unlock_bh();
 320         }
 321         return r;
 322 }
 323
 324 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 325                                           struct rtable *r)
 326 {
 327         struct rt_cache_iter_state *st = seq->private;
 328
 329         r = rcu_dereference_bh(r->dst.rt_next);
 330         while (!r) {
 331                 rcu_read_unlock_bh();
 332                 do {
 333                         if (--st->bucket < 0)
 334                                 return NULL;
 335                 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
 336                 rcu_read_lock_bh();
 337                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 338         }
 339         return r;
 340 }
 341
 342 static struct rtable *rt_cache_get_next(struct seq_file *seq,
 343                                         struct rtable *r)
 344 {
 345         struct rt_cache_iter_state *st = seq->private;
 346         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 347                 if (dev_net(r->dst.dev) != seq_file_net(seq))
 348                         continue;
 349                 if (r->rt_genid == st->genid)
 350                         break;
 351         }
 352         return r;
 353 }
 354
 355 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 356 {
 357         struct rtable *r = rt_cache_get_first(seq);
 358
 359         if (r)
 360                 while (pos && (r = rt_cache_get_next(seq, r)))
 361                         --pos;
 362         return pos ? NULL : r;
 363 }
 364
 365 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 366 {
 367         struct rt_cache_iter_state *st = seq->private;
 368         if (*pos)
 369                 return rt_cache_get_idx(seq, *pos - 1);
 370         st->genid = rt_genid(seq_file_net(seq));
 371         return SEQ_START_TOKEN;
 372 }
 373
 374 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 375 {
 376         struct rtable *r;
 377
 378         if (v == SEQ_START_TOKEN)
 379                 r = rt_cache_get_first(seq);
 380         else
 381                 r = rt_cache_get_next(seq, v);
 382         ++*pos;
 383         return r;
 384 }
 385
 386 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 387 {
 388         if (v && v != SEQ_START_TOKEN)
 389                 rcu_read_unlock_bh();
 390 }
 391
 392 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 393 {
 394         if (v == SEQ_START_TOKEN)
 395                 seq_printf(seq, "%-127s\n",
 396                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 397                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 398                            "HHUptod\tSpecDst");
 399         else {
 400                 struct rtable *r = v;
 401                 int len;
 402
 403                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
 404                            "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 405                            r->dst.dev ? r->dst.dev->name : "*",
 406                            (__force u32)r->rt_dst,
 407                            (__force u32)r->rt_gateway,
 408                            r->rt_flags, atomic_read(&r->dst.__refcnt),
 409                            r->dst.__use, 0, (__force u32)r->rt_src,
 410                            dst_metric_advmss(&r->dst) + 40,
 411                            dst_metric(&r->dst, RTAX_WINDOW), 0,
 412                            r->rt_key_tos,
 413                            -1, 0, 0, &len);
 414
 415                 seq_printf(seq, "%*s\n", 127 - len, "");
 416         }
 417         return 0;
 418 }
 419
 420 static const struct seq_operations rt_cache_seq_ops = {
 421         .start  = rt_cache_seq_start,
 422         .next   = rt_cache_seq_next,
 423         .stop   = rt_cache_seq_stop,
 424         .show   = rt_cache_seq_show,
 425 };
 426
 427 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 428 {
 429         return seq_open_net(inode, file, &rt_cache_seq_ops,
 430                         sizeof(struct rt_cache_iter_state));
 431 }
 432
 433 static const struct file_operations rt_cache_seq_fops = {
 434         .owner   = THIS_MODULE,
 435         .open    = rt_cache_seq_open,
 436         .read    = seq_read,
 437         .llseek  = seq_lseek,
 438         .release = seq_release_net,
 439 };
 440
 441
 442 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 443 {
 444         int cpu;
 445
 446         if (*pos == 0)
 447                 return SEQ_START_TOKEN;
 448
 449         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 450                 if (!cpu_possible(cpu))
 451                         continue;
 452                 *pos = cpu+1;
 453                 return &per_cpu(rt_cache_stat, cpu);
 454         }
 455         return NULL;
 456 }
 457
 458 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 459 {
 460         int cpu;
 461
 462         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 463                 if (!cpu_possible(cpu))
 464                         continue;
 465                 *pos = cpu+1;
 466                 return &per_cpu(rt_cache_stat, cpu);
 467         }
 468         return NULL;
 469
 470 }
 471
 472 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 473 {
 474
 475 }
 476
 477 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 478 {
 479         struct rt_cache_stat *st = v;
 480
 481         if (v == SEQ_START_TOKEN) {
 482                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 483                 return 0;
 484         }
 485
 486         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 487                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 488                    dst_entries_get_slow(&ipv4_dst_ops),
 489                    st->in_hit,
 490                    st->in_slow_tot,
 491                    st->in_slow_mc,
 492                    st->in_no_route,
 493                    st->in_brd,
 494                    st->in_martian_dst,
 495                    st->in_martian_src,
 496
 497                    st->out_hit,
 498                    st->out_slow_tot,
 499                    st->out_slow_mc,
 500
 501                    st->gc_total,
 502                    st->gc_ignored,
 503                    st->gc_goal_miss,
 504                    st->gc_dst_overflow,
 505                    st->in_hlist_search,
 506                    st->out_hlist_search
 507                 );
 508         return 0;
 509 }
 510
 511 static const struct seq_operations rt_cpu_seq_ops = {
 512         .start  = rt_cpu_seq_start,
 513         .next   = rt_cpu_seq_next,
 514         .stop   = rt_cpu_seq_stop,
 515         .show   = rt_cpu_seq_show,
 516 };
 517
 518
 519 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 520 {
 521         return seq_open(file, &rt_cpu_seq_ops);
 522 }
 523
 524 static const struct file_operations rt_cpu_seq_fops = {
 525         .owner   = THIS_MODULE,
 526         .open    = rt_cpu_seq_open,
 527         .read    = seq_read,
 528         .llseek  = seq_lseek,
 529         .release = seq_release,
 530 };
 531
 532 #ifdef CONFIG_IP_ROUTE_CLASSID
 533 static int rt_acct_proc_show(struct seq_file *m, void *v)
 534 {
 535         struct ip_rt_acct *dst, *src;
 536         unsigned int i, j;
 537
 538         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 539         if (!dst)
 540                 return -ENOMEM;
 541
 542         for_each_possible_cpu(i) {
 543                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 544                 for (j = 0; j < 256; j++) {
 545                         dst[j].o_bytes   += src[j].o_bytes;
 546                         dst[j].o_packets += src[j].o_packets;
 547                         dst[j].i_bytes   += src[j].i_bytes;
 548                         dst[j].i_packets += src[j].i_packets;
 549                 }
 550         }
 551
 552         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 553         kfree(dst);
 554         return 0;
 555 }
 556
 557 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 558 {
 559         return single_open(file, rt_acct_proc_show, NULL);
 560 }
 561
 562 static const struct file_operations rt_acct_proc_fops = {
 563         .owner          = THIS_MODULE,
 564         .open           = rt_acct_proc_open,
 565         .read           = seq_read,
 566         .llseek         = seq_lseek,
 567         .release        = single_release,
 568 };
 569 #endif
 570
 571 static int __net_init ip_rt_do_proc_init(struct net *net)
 572 {
 573         struct proc_dir_entry *pde;
 574
 575         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 576                         &rt_cache_seq_fops);
 577         if (!pde)
 578                 goto err1;
 579
 580         pde = proc_create("rt_cache", S_IRUGO,
 581                           net->proc_net_stat, &rt_cpu_seq_fops);
 582         if (!pde)
 583                 goto err2;
 584
 585 #ifdef CONFIG_IP_ROUTE_CLASSID
 586         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 587         if (!pde)
 588                 goto err3;
 589 #endif
 590         return 0;
 591
 592 #ifdef CONFIG_IP_ROUTE_CLASSID
 593 err3:
 594         remove_proc_entry("rt_cache", net->proc_net_stat);
 595 #endif
 596 err2:
 597         remove_proc_entry("rt_cache", net->proc_net);
 598 err1:
 599         return -ENOMEM;
 600 }
 601
 602 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 603 {
 604         remove_proc_entry("rt_cache", net->proc_net_stat);
 605         remove_proc_entry("rt_cache", net->proc_net);
 606 #ifdef CONFIG_IP_ROUTE_CLASSID
 607         remove_proc_entry("rt_acct", net->proc_net);
 608 #endif
 609 }
 610
 611 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 612         .init = ip_rt_do_proc_init,
 613         .exit = ip_rt_do_proc_exit,
 614 };
 615
 616 static int __init ip_rt_proc_init(void)
 617 {
 618         return register_pernet_subsys(&ip_rt_proc_ops);
 619 }
 620
 621 #else
 622 static inline int ip_rt_proc_init(void)
 623 {
 624         return 0;
 625 }
 626 #endif /* CONFIG_PROC_FS */
 627
 628 static inline void rt_free(struct rtable *rt)
 629 {
 630         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 631 }
 632
 633 static inline void rt_drop(struct rtable *rt)
 634 {
 635         ip_rt_put(rt);
 636         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 637 }
 638
 639 static inline int rt_fast_clean(struct rtable *rth)
 640 {
 641         /* Kill broadcast/multicast entries very aggresively, if they
 642            collide in hash table with more useful entries */
 643         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 644                 rt_is_input_route(rth) && rth->dst.rt_next;
 645 }
 646
 647 static inline int rt_valuable(struct rtable *rth)
 648 {
 649         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 650                 rth->dst.expires;
 651 }
 652
 653 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 654 {
 655         unsigned long age;
 656         int ret = 0;
 657
 658         if (atomic_read(&rth->dst.__refcnt))
 659                 goto out;
 660
 661         age = jiffies - rth->dst.lastuse;
 662         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 663             (age <= tmo2 && rt_valuable(rth)))
 664                 goto out;
 665         ret = 1;
 666 out:    return ret;
 667 }
 668
 669 /* Bits of score are:
 670  * 31: very valuable
 671  * 30: not quite useless
 672  * 29..0: usage counter
 673  */
 674 static inline u32 rt_score(struct rtable *rt)
 675 {
 676         u32 score = jiffies - rt->dst.lastuse;
 677
 678         score = ~score & ~(3<<30);
 679
 680         if (rt_valuable(rt))
 681                 score |= (1<<31);
 682
 683         if (rt_is_output_route(rt) ||
 684             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 685                 score |= (1<<30);
 686
 687         return score;
 688 }
 689
 690 static inline bool rt_caching(const struct net *net)
 691 {
 692         return net->ipv4.current_rt_cache_rebuild_count <=
 693                 net->ipv4.sysctl_rt_cache_rebuild_count;
 694 }
 695
 696 static inline bool compare_hash_inputs(const struct rtable *rt1,
 697                                        const struct rtable *rt2)
 698 {
 699         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 700                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 701                 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
 702 }
 703
 704 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
 705 {
 706         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 707                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 708                 (rt1->rt_mark ^ rt2->rt_mark) |
 709                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
 710                 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
 711                 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
 712 }
 713
 714 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 715 {
 716         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
 717 }
 718
 719 static inline int rt_is_expired(struct rtable *rth)
 720 {
 721         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 722 }
 723
 724 /*
 725  * Perform a full scan of hash table and free all entries.
 726  * Can be called by a softirq or a process.
 727  * In the later case, we want to be reschedule if necessary
 728  */
 729 static void rt_do_flush(struct net *net, int process_context)
 730 {
 731         unsigned int i;
 732         struct rtable *rth, *next;
 733
 734         for (i = 0; i <= rt_hash_mask; i++) {
 735                 struct rtable __rcu **pprev;
 736                 struct rtable *list;
 737
 738                 if (process_context && need_resched())
 739                         cond_resched();
 740                 rth = rcu_access_pointer(rt_hash_table[i].chain);
 741                 if (!rth)
 742                         continue;
 743
 744                 spin_lock_bh(rt_hash_lock_addr(i));
 745
 746                 list = NULL;
 747                 pprev = &rt_hash_table[i].chain;
 748                 rth = rcu_dereference_protected(*pprev,
 749                         lockdep_is_held(rt_hash_lock_addr(i)));
 750
 751                 while (rth) {
 752                         next = rcu_dereference_protected(rth->dst.rt_next,
 753                                 lockdep_is_held(rt_hash_lock_addr(i)));
 754
 755                         if (!net ||
 756                             net_eq(dev_net(rth->dst.dev), net)) {
 757                                 rcu_assign_pointer(*pprev, next);
 758                                 rcu_assign_pointer(rth->dst.rt_next, list);
 759                                 list = rth;
 760                         } else {
 761                                 pprev = &rth->dst.rt_next;
 762                         }
 763                         rth = next;
 764                 }
 765
 766                 spin_unlock_bh(rt_hash_lock_addr(i));
 767
 768                 for (; list; list = next) {
 769                         next = rcu_dereference_protected(list->dst.rt_next, 1);
 770                         rt_free(list);
 771                 }
 772         }
 773 }
 774
 775 /*
 776  * While freeing expired entries, we compute average chain length
 777  * and standard deviation, using fixed-point arithmetic.
 778  * This to have an estimation of rt_chain_length_max
 779  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 780  * We use 3 bits for frational part, and 29 (or 61) for magnitude.
 781  */
 782
 783 #define FRACT_BITS 3
 784 #define ONE (1UL << FRACT_BITS)
 785
 786 /*
 787  * Given a hash chain and an item in this hash chain,
 788  * find if a previous entry has the same hash_inputs
 789  * (but differs on tos, mark or oif)
 790  * Returns 0 if an alias is found.
 791  * Returns ONE if rth has no alias before itself.
 792  */
 793 static int has_noalias(const struct rtable *head, const struct rtable *rth)
 794 {
 795         const struct rtable *aux = head;
 796
 797         while (aux != rth) {
 798                 if (compare_hash_inputs(aux, rth))
 799                         return 0;
 800                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
 801         }
 802         return ONE;
 803 }
 804
 805 static void rt_check_expire(void)
 806 {
 807         static unsigned int rover;
 808         unsigned int i = rover, goal;
 809         struct rtable *rth;
 810         struct rtable __rcu **rthp;
 811         unsigned long samples = 0;
 812         unsigned long sum = 0, sum2 = 0;
 813         unsigned long delta;
 814         u64 mult;
 815
 816         delta = jiffies - expires_ljiffies;
 817         expires_ljiffies = jiffies;
 818         mult = ((u64)delta) << rt_hash_log;
 819         if (ip_rt_gc_timeout > 1)
 820                 do_div(mult, ip_rt_gc_timeout);
 821         goal = (unsigned int)mult;
 822         if (goal > rt_hash_mask)
 823                 goal = rt_hash_mask + 1;
 824         for (; goal > 0; goal--) {
 825                 unsigned long tmo = ip_rt_gc_timeout;
 826                 unsigned long length;
 827
 828                 i = (i + 1) & rt_hash_mask;
 829                 rthp = &rt_hash_table[i].chain;
 830
 831                 if (need_resched())
 832                         cond_resched();
 833
 834                 samples++;
 835
 836                 if (rcu_dereference_raw(*rthp) == NULL)
 837                         continue;
 838                 length = 0;
 839                 spin_lock_bh(rt_hash_lock_addr(i));
 840                 while ((rth = rcu_dereference_protected(*rthp,
 841                                         lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
 842                         prefetch(rth->dst.rt_next);
 843                         if (rt_is_expired(rth) ||
 844                             rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
 845                                 *rthp = rth->dst.rt_next;
 846                                 rt_free(rth);
 847                                 continue;
 848                         }
 849
 850                         /* We only count entries on a chain with equal
 851                          * hash inputs once so that entries for
 852                          * different QOS levels, and other non-hash
 853                          * input attributes don't unfairly skew the
 854                          * length computation
 855                          */
 856                         tmo >>= 1;
 857                         rthp = &rth->dst.rt_next;
 858                         length += has_noalias(rt_hash_table[i].chain, rth);
 859                 }
 860                 spin_unlock_bh(rt_hash_lock_addr(i));
 861                 sum += length;
 862                 sum2 += length*length;
 863         }
 864         if (samples) {
 865                 unsigned long avg = sum / samples;
 866                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
 867                 rt_chain_length_max = max_t(unsigned long,
 868                                         ip_rt_gc_elasticity,
 869                                         (avg + 4*sd) >> FRACT_BITS);
 870         }
 871         rover = i;
 872 }
 873
 874 /*
 875  * rt_worker_func() is run in process context.
 876  * we call rt_check_expire() to scan part of the hash table
 877  */
 878 static void rt_worker_func(struct work_struct *work)
 879 {
 880         rt_check_expire();
 881         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 882 }
 883
 884 /*
 885  * Perturbation of rt_genid by a small quantity [1..256]
 886  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
 887  * many times (2^24) without giving recent rt_genid.
 888  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
 889  */
 890 static void rt_cache_invalidate(struct net *net)
 891 {
 892         unsigned char shuffle;
 893
 894         get_random_bytes(&shuffle, sizeof(shuffle));
 895         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 896 }
 897
 898 /*
 899  * delay < 0  : invalidate cache (fast : entries will be deleted later)
 900  * delay >= 0 : invalidate & flush cache (can be long)
 901  */
 902 void rt_cache_flush(struct net *net, int delay)
 903 {
 904         rt_cache_invalidate(net);
 905         if (delay >= 0)
 906                 rt_do_flush(net, !in_softirq());
 907 }
 908
 909 /* Flush previous cache invalidated entries from the cache */
 910 void rt_cache_flush_batch(struct net *net)
 911 {
 912         rt_do_flush(net, !in_softirq());
 913 }
 914
 915 static void rt_emergency_hash_rebuild(struct net *net)
 916 {
 917         net_warn_ratelimited("Route hash chain too long!\n");
 918         rt_cache_invalidate(net);
 919 }
 920
 921 /*
 922    Short description of GC goals.
 923
 924    We want to build algorithm, which will keep routing cache
 925    at some equilibrium point, when number of aged off entries
 926    is kept approximately equal to newly generated ones.
 927
 928    Current expiration strength is variable "expire".
 929    We try to adjust it dynamically, so that if networking
 930    is idle expires is large enough to keep enough of warm entries,
 931    and when load increases it reduces to limit cache size.
 932  */
 933
 934 static int rt_garbage_collect(struct dst_ops *ops)
 935 {
 936         static unsigned long expire = RT_GC_TIMEOUT;
 937         static unsigned long last_gc;
 938         static int rover;
 939         static int equilibrium;
 940         struct rtable *rth;
 941         struct rtable __rcu **rthp;
 942         unsigned long now = jiffies;
 943         int goal;
 944         int entries = dst_entries_get_fast(&ipv4_dst_ops);
 945
 946         /*
 947          * Garbage collection is pretty expensive,
 948          * do not make it too frequently.
 949          */
 950
 951         RT_CACHE_STAT_INC(gc_total);
 952
 953         if (now - last_gc < ip_rt_gc_min_interval &&
 954             entries < ip_rt_max_size) {
 955                 RT_CACHE_STAT_INC(gc_ignored);
 956                 goto out;
 957         }
 958
 959         entries = dst_entries_get_slow(&ipv4_dst_ops);
 960         /* Calculate number of entries, which we want to expire now. */
 961         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
 962         if (goal <= 0) {
 963                 if (equilibrium < ipv4_dst_ops.gc_thresh)
 964                         equilibrium = ipv4_dst_ops.gc_thresh;
 965                 goal = entries - equilibrium;
 966                 if (goal > 0) {
 967                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 968                         goal = entries - equilibrium;
 969                 }
 970         } else {
 971                 /* We are in dangerous area. Try to reduce cache really
 972                  * aggressively.
 973                  */
 974                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 975                 equilibrium = entries - goal;
 976         }
 977
 978         if (now - last_gc >= ip_rt_gc_min_interval)
 979                 last_gc = now;
 980
 981         if (goal <= 0) {
 982                 equilibrium += goal;
 983                 goto work_done;
 984         }
 985
 986         do {
 987                 int i, k;
 988
 989                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
 990                         unsigned long tmo = expire;
 991
 992                         k = (k + 1) & rt_hash_mask;
 993                         rthp = &rt_hash_table[k].chain;
 994                         spin_lock_bh(rt_hash_lock_addr(k));
 995                         while ((rth = rcu_dereference_protected(*rthp,
 996                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
 997                                 if (!rt_is_expired(rth) &&
 998                                         !rt_may_expire(rth, tmo, expire)) {
 999                                         tmo >>= 1;
1000                                         rthp = &rth->dst.rt_next;
1001                                         continue;
1002                                 }
1003                                 *rthp = rth->dst.rt_next;
1004                                 rt_free(rth);
1005                                 goal--;
1006                         }
1007                         spin_unlock_bh(rt_hash_lock_addr(k));
1008                         if (goal <= 0)
1009                                 break;
1010                 }
1011                 rover = k;
1012
1013                 if (goal <= 0)
1014                         goto work_done;
1015
1016                 /* Goal is not achieved. We stop process if:
1017
1018                    - if expire reduced to zero. Otherwise, expire is halfed.
1019                    - if table is not full.
1020                    - if we are called from interrupt.
1021                    - jiffies check is just fallback/debug loop breaker.
1022                      We will not spin here for long time in any case.
1023                  */
1024
1025                 RT_CACHE_STAT_INC(gc_goal_miss);
1026
1027                 if (expire == 0)
1028                         break;
1029
1030                 expire >>= 1;
1031
1032                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1033                         goto out;
1034         } while (!in_softirq() && time_before_eq(jiffies, now));
1035
1036         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1037                 goto out;
1038         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1039                 goto out;
1040         net_warn_ratelimited("dst cache overflow\n");
1041         RT_CACHE_STAT_INC(gc_dst_overflow);
1042         return 1;
1043
1044 work_done:
1045         expire += ip_rt_gc_min_interval;
1046         if (expire > ip_rt_gc_timeout ||
1047             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1048             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1049                 expire = ip_rt_gc_timeout;
1050 out:    return 0;
1051 }
1052
1053 /*
1054  * Returns number of entries in a hash chain that have different hash_inputs
1055  */
1056 static int slow_chain_length(const struct rtable *head)
1057 {
1058         int length = 0;
1059         const struct rtable *rth = head;
1060
1061         while (rth) {
1062                 length += has_noalias(head, rth);
1063                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1064         }
1065         return length >> FRACT_BITS;
1066 }
1067
1068 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
1069                                            struct sk_buff *skb,
1070                                            const void *daddr)
1071 {
1072         struct net_device *dev = dst->dev;
1073         const __be32 *pkey = daddr;
1074         const struct rtable *rt;
1075         struct neighbour *n;
1076
1077         rt = (const struct rtable *) dst;
1078         if (rt->rt_gateway)
1079                 pkey = (const __be32 *) &rt->rt_gateway;
1080         else if (skb)
1081                 pkey = &ip_hdr(skb)->daddr;
1082
1083         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
1084         if (n)
1085                 return n;
1086         return neigh_create(&arp_tbl, pkey, dev);
1087 }
1088
1089 static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
1090                                      struct sk_buff *skb, int ifindex)
1091 {
1092         struct rtable   *rth, *cand;
1093         struct rtable __rcu **rthp, **candp;
1094         unsigned long   now;
1095         u32             min_score;
1096         int             chain_length;
1097
1098 restart:
1099         chain_length = 0;
1100         min_score = ~(u32)0;
1101         cand = NULL;
1102         candp = NULL;
1103         now = jiffies;
1104
1105         if (!rt_caching(dev_net(rt->dst.dev)) || (rt->dst.flags & DST_NOCACHE)) {
1106                 /*
1107                  * If we're not caching, just tell the caller we
1108                  * were successful and don't touch the route.  The
1109                  * caller hold the sole reference to the cache entry, and
1110                  * it will be released when the caller is done with it.
1111                  * If we drop it here, the callers have no way to resolve routes
1112                  * when we're not caching.  Instead, just point *rp at rt, so
1113                  * the caller gets a single use out of the route
1114                  * Note that we do rt_free on this new route entry, so that
1115                  * once its refcount hits zero, we are still able to reap it
1116                  * (Thanks Alexey)
1117                  * Note: To avoid expensive rcu stuff for this uncached dst,
1118                  * we set DST_NOCACHE so that dst_release() can free dst without
1119                  * waiting a grace period.
1120                  */
1121
1122                 rt->dst.flags |= DST_NOCACHE;
1123                 goto skip_hashing;
1124         }
1125
1126         rthp = &rt_hash_table[hash].chain;
1127
1128         spin_lock_bh(rt_hash_lock_addr(hash));
1129         while ((rth = rcu_dereference_protected(*rthp,
1130                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1131                 if (rt_is_expired(rth)) {
1132                         *rthp = rth->dst.rt_next;
1133                         rt_free(rth);
1134                         continue;
1135                 }
1136                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1137                         /* Put it first */
1138                         *rthp = rth->dst.rt_next;
1139                         /*
1140                          * Since lookup is lockfree, the deletion
1141                          * must be visible to another weakly ordered CPU before
1142                          * the insertion at the start of the hash chain.
1143                          */
1144                         rcu_assign_pointer(rth->dst.rt_next,
1145                                            rt_hash_table[hash].chain);
1146                         /*
1147                          * Since lookup is lockfree, the update writes
1148                          * must be ordered for consistency on SMP.
1149                          */
1150                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1151
1152                         dst_use(&rth->dst, now);
1153                         spin_unlock_bh(rt_hash_lock_addr(hash));
1154
1155                         rt_drop(rt);
1156                         if (skb)
1157                                 skb_dst_set(skb, &rth->dst);
1158                         return rth;
1159                 }
1160
1161                 if (!atomic_read(&rth->dst.__refcnt)) {
1162                         u32 score = rt_score(rth);
1163
1164                         if (score <= min_score) {
1165                                 cand = rth;
1166                                 candp = rthp;
1167                                 min_score = score;
1168                         }
1169                 }
1170
1171                 chain_length++;
1172
1173                 rthp = &rth->dst.rt_next;
1174         }
1175
1176         if (cand) {
1177                 /* ip_rt_gc_elasticity used to be average length of chain
1178                  * length, when exceeded gc becomes really aggressive.
1179                  *
1180                  * The second limit is less certain. At the moment it allows
1181                  * only 2 entries per bucket. We will see.
1182                  */
1183                 if (chain_length > ip_rt_gc_elasticity) {
1184                         *candp = cand->dst.rt_next;
1185                         rt_free(cand);
1186                 }
1187         } else {
1188                 if (chain_length > rt_chain_length_max &&
1189                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1190                         struct net *net = dev_net(rt->dst.dev);
1191                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1192                         if (!rt_caching(net)) {
1193                                 pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
1194                                         rt->dst.dev->name, num);
1195                         }
1196                         rt_emergency_hash_rebuild(net);
1197                         spin_unlock_bh(rt_hash_lock_addr(hash));
1198
1199                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1200                                         ifindex, rt_genid(net));
1201                         goto restart;
1202                 }
1203         }
1204
1205         rt->dst.rt_next = rt_hash_table[hash].chain;
1206
1207         /*
1208          * Since lookup is lockfree, we must make sure
1209          * previous writes to rt are committed to memory
1210          * before making rt visible to other CPUS.
1211          */
1212         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1213
1214         spin_unlock_bh(rt_hash_lock_addr(hash));
1215
1216 skip_hashing:
1217         if (skb)
1218                 skb_dst_set(skb, &rt->dst);
1219         return rt;
1220 }
1221
1222 /*
1223  * Peer allocation may fail only in serious out-of-memory conditions.  However
1224  * we still can generate some output.
1225  * Random ID selection looks a bit dangerous because we have no chances to
1226  * select ID being unique in a reasonable period of time.
1227  * But broken packet identifier may be better than no packet at all.
1228  */
1229 static void ip_select_fb_ident(struct iphdr *iph)
1230 {
1231         static DEFINE_SPINLOCK(ip_fb_id_lock);
1232         static u32 ip_fallback_id;
1233         u32 salt;
1234
1235         spin_lock_bh(&ip_fb_id_lock);
1236         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1237         iph->id = htons(salt & 0xFFFF);
1238         ip_fallback_id = salt;
1239         spin_unlock_bh(&ip_fb_id_lock);
1240 }
1241
1242 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1243 {
1244         struct net *net = dev_net(dst->dev);
1245         struct inet_peer *peer;
1246
1247         peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
1248         if (peer) {
1249                 iph->id = htons(inet_getid(peer, more));
1250                 inet_putpeer(peer);
1251                 return;
1252         }
1253
1254         ip_select_fb_ident(iph);
1255 }
1256 EXPORT_SYMBOL(__ip_select_ident);
1257
1258 static void rt_del(unsigned int hash, struct rtable *rt)
1259 {
1260         struct rtable __rcu **rthp;
1261         struct rtable *aux;
1262
1263         rthp = &rt_hash_table[hash].chain;
1264         spin_lock_bh(rt_hash_lock_addr(hash));
1265         ip_rt_put(rt);
1266         while ((aux = rcu_dereference_protected(*rthp,
1267                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1268                 if (aux == rt || rt_is_expired(aux)) {
1269                         *rthp = aux->dst.rt_next;
1270                         rt_free(aux);
1271                         continue;
1272                 }
1273                 rthp = &aux->dst.rt_next;
1274         }
1275         spin_unlock_bh(rt_hash_lock_addr(hash));
1276 }
1277
1278 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
1279                              const struct iphdr *iph,
1280                              int oif, u8 tos,
1281                              u8 prot, u32 mark, int flow_flags)
1282 {
1283         if (sk) {
1284                 const struct inet_sock *inet = inet_sk(sk);
1285
1286                 oif = sk->sk_bound_dev_if;
1287                 mark = sk->sk_mark;
1288                 tos = RT_CONN_FLAGS(sk);
1289                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
1290         }
1291         flowi4_init_output(fl4, oif, mark, tos,
1292                            RT_SCOPE_UNIVERSE, prot,
1293                            flow_flags,
1294                            iph->daddr, iph->saddr, 0, 0);
1295 }
1296
1297 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
1298                                const struct sock *sk)
1299 {
1300         const struct iphdr *iph = ip_hdr(skb);
1301         int oif = skb->dev->ifindex;
1302         u8 tos = RT_TOS(iph->tos);
1303         u8 prot = iph->protocol;
1304         u32 mark = skb->mark;
1305
1306         __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
1307 }
1308
1309 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
1310 {
1311         const struct inet_sock *inet = inet_sk(sk);
1312         const struct ip_options_rcu *inet_opt;
1313         __be32 daddr = inet->inet_daddr;
1314
1315         rcu_read_lock();
1316         inet_opt = rcu_dereference(inet->inet_opt);
1317         if (inet_opt && inet_opt->opt.srr)
1318                 daddr = inet_opt->opt.faddr;
1319         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
1320                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
1321                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
1322                            inet_sk_flowi_flags(sk),
1323                            daddr, inet->inet_saddr, 0, 0);
1324         rcu_read_unlock();
1325 }
1326
1327 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
1328                                  const struct sk_buff *skb)
1329 {
1330         if (skb)
1331                 build_skb_flow_key(fl4, skb, sk);
1332         else
1333                 build_sk_flow_key(fl4, sk);
1334 }
1335
1336 static DEFINE_SEQLOCK(fnhe_seqlock);
1337
1338 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
1339 {
1340         struct fib_nh_exception *fnhe, *oldest;
1341
1342         oldest = rcu_dereference(hash->chain);
1343         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
1344              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1345                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
1346                         oldest = fnhe;
1347         }
1348         return oldest;
1349 }
1350
1351 static inline u32 fnhe_hashfun(__be32 daddr)
1352 {
1353         u32 hval;
1354
1355         hval = (__force u32) daddr;
1356         hval ^= (hval >> 11) ^ (hval >> 22);
1357
1358         return hval & (FNHE_HASH_SIZE - 1);
1359 }
1360
1361 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
1362                                   u32 pmtu, unsigned long expires)
1363 {
1364         struct fnhe_hash_bucket *hash;
1365         struct fib_nh_exception *fnhe;
1366         int depth;
1367         u32 hval = fnhe_hashfun(daddr);
1368
1369         write_seqlock_bh(&fnhe_seqlock);
1370
1371         hash = nh->nh_exceptions;
1372         if (!hash) {
1373                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
1374                 if (!hash)
1375                         goto out_unlock;
1376                 nh->nh_exceptions = hash;
1377         }
1378
1379         hash += hval;
1380
1381         depth = 0;
1382         for (fnhe = rcu_dereference(hash->chain); fnhe;
1383              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1384                 if (fnhe->fnhe_daddr == daddr)
1385                         break;
1386                 depth++;
1387         }
1388
1389         if (fnhe) {
1390                 if (gw)
1391                         fnhe->fnhe_gw = gw;
1392                 if (pmtu) {
1393                         fnhe->fnhe_pmtu = pmtu;
1394                         fnhe->fnhe_expires = expires;
1395                 }
1396         } else {
1397                 if (depth > FNHE_RECLAIM_DEPTH)
1398                         fnhe = fnhe_oldest(hash);
1399                 else {
1400                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
1401                         if (!fnhe)
1402                                 goto out_unlock;
1403
1404                         fnhe->fnhe_next = hash->chain;
1405                         rcu_assign_pointer(hash->chain, fnhe);
1406                 }
1407                 fnhe->fnhe_daddr = daddr;
1408                 fnhe->fnhe_gw = gw;
1409                 fnhe->fnhe_pmtu = pmtu;
1410                 fnhe->fnhe_expires = expires;
1411         }
1412
1413         fnhe->fnhe_stamp = jiffies;
1414
1415 out_unlock:
1416         write_sequnlock_bh(&fnhe_seqlock);
1417         return;
1418 }
1419
1420 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4)
1421 {
1422         __be32 new_gw = icmp_hdr(skb)->un.gateway;
1423         __be32 old_gw = ip_hdr(skb)->saddr;
1424         struct net_device *dev = skb->dev;
1425         struct in_device *in_dev;
1426         struct fib_result res;
1427         struct neighbour *n;
1428         struct net *net;
1429
1430         switch (icmp_hdr(skb)->code & 7) {
1431         case ICMP_REDIR_NET:
1432         case ICMP_REDIR_NETTOS:
1433         case ICMP_REDIR_HOST:
1434         case ICMP_REDIR_HOSTTOS:
1435                 break;
1436
1437         default:
1438                 return;
1439         }
1440
1441         if (rt->rt_gateway != old_gw)
1442                 return;
1443
1444         in_dev = __in_dev_get_rcu(dev);
1445         if (!in_dev)
1446                 return;
1447
1448         net = dev_net(dev);
1449         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1450             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1451             ipv4_is_zeronet(new_gw))
1452                 goto reject_redirect;
1453
1454         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1455                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1456                         goto reject_redirect;
1457                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1458                         goto reject_redirect;
1459         } else {
1460                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1461                         goto reject_redirect;
1462         }
1463
1464         n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
1465         if (n) {
1466                 if (!(n->nud_state & NUD_VALID)) {
1467                         neigh_event_send(n, NULL);
1468                 } else {
1469                         if (fib_lookup(net, fl4, &res) == 0) {
1470                                 struct fib_nh *nh = &FIB_RES_NH(res);
1471
1472                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
1473                                                       0, 0);
1474                         }
1475                         rt->rt_gateway = new_gw;
1476                         rt->rt_flags |= RTCF_REDIRECTED;
1477                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1478                 }
1479                 neigh_release(n);
1480         }
1481         return;
1482
1483 reject_redirect:
1484 #ifdef CONFIG_IP_ROUTE_VERBOSE
1485         if (IN_DEV_LOG_MARTIANS(in_dev)) {
1486                 const struct iphdr *iph = (const struct iphdr *) skb->data;
1487                 __be32 daddr = iph->daddr;
1488                 __be32 saddr = iph->saddr;
1489
1490                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
1491                                      "  Advised path = %pI4 -> %pI4\n",
1492                                      &old_gw, dev->name, &new_gw,
1493                                      &saddr, &daddr);
1494         }
1495 #endif
1496         ;
1497 }
1498
1499 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
1500 {
1501         struct rtable *rt;
1502         struct flowi4 fl4;
1503
1504         rt = (struct rtable *) dst;
1505
1506         ip_rt_build_flow_key(&fl4, sk, skb);
1507         __ip_do_redirect(rt, skb, &fl4);
1508 }
1509
1510 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1511 {
1512         struct rtable *rt = (struct rtable *)dst;
1513         struct dst_entry *ret = dst;
1514
1515         if (rt) {
1516                 if (dst->obsolete > 0) {
1517                         ip_rt_put(rt);
1518                         ret = NULL;
1519                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1520                            rt->dst.expires) {
1521                         unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1522                                                 rt->rt_oif,
1523                                                 rt_genid(dev_net(dst->dev)));
1524                         rt_del(hash, rt);
1525                         ret = NULL;
1526                 }
1527         }
1528         return ret;
1529 }
1530
1531 /*
1532  * Algorithm:
1533  *      1. The first ip_rt_redirect_number redirects are sent
1534  *         with exponential backoff, then we stop sending them at all,
1535  *         assuming that the host ignores our redirects.
1536  *      2. If we did not see packets requiring redirects
1537  *         during ip_rt_redirect_silence, we assume that the host
1538  *         forgot redirected route and start to send redirects again.
1539  *
1540  * This algorithm is much cheaper and more intelligent than dumb load limiting
1541  * in icmp.c.
1542  *
1543  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1544  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1545  */
1546
1547 void ip_rt_send_redirect(struct sk_buff *skb)
1548 {
1549         struct rtable *rt = skb_rtable(skb);
1550         struct in_device *in_dev;
1551         struct inet_peer *peer;
1552         struct net *net;
1553         int log_martians;
1554
1555         rcu_read_lock();
1556         in_dev = __in_dev_get_rcu(rt->dst.dev);
1557         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1558                 rcu_read_unlock();
1559                 return;
1560         }
1561         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1562         rcu_read_unlock();
1563
1564         net = dev_net(rt->dst.dev);
1565         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
1566         if (!peer) {
1567                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1568                 return;
1569         }
1570
1571         /* No redirected packets during ip_rt_redirect_silence;
1572          * reset the algorithm.
1573          */
1574         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1575                 peer->rate_tokens = 0;
1576
1577         /* Too many ignored redirects; do not send anything
1578          * set dst.rate_last to the last seen redirected packet.
1579          */
1580         if (peer->rate_tokens >= ip_rt_redirect_number) {
1581                 peer->rate_last = jiffies;
1582                 goto out_put_peer;
1583         }
1584
1585         /* Check for load limit; set rate_last to the latest sent
1586          * redirect.
1587          */
1588         if (peer->rate_tokens == 0 ||
1589             time_after(jiffies,
1590                        (peer->rate_last +
1591                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1592                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1593                 peer->rate_last = jiffies;
1594                 ++peer->rate_tokens;
1595 #ifdef CONFIG_IP_ROUTE_VERBOSE
1596                 if (log_martians &&
1597                     peer->rate_tokens == ip_rt_redirect_number)
1598                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
1599                                              &ip_hdr(skb)->saddr, rt->rt_iif,
1600                                              &rt->rt_dst, &rt->rt_gateway);
1601 #endif
1602         }
1603 out_put_peer:
1604         inet_putpeer(peer);
1605 }
1606
1607 static int ip_error(struct sk_buff *skb)
1608 {
1609         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
1610         struct rtable *rt = skb_rtable(skb);
1611         struct inet_peer *peer;
1612         unsigned long now;
1613         struct net *net;
1614         bool send;
1615         int code;
1616
1617         net = dev_net(rt->dst.dev);
1618         if (!IN_DEV_FORWARD(in_dev)) {
1619                 switch (rt->dst.error) {
1620                 case EHOSTUNREACH:
1621                         IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
1622                         break;
1623
1624                 case ENETUNREACH:
1625                         IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
1626                         break;
1627                 }
1628                 goto out;
1629         }
1630
1631         switch (rt->dst.error) {
1632         case EINVAL:
1633         default:
1634                 goto out;
1635         case EHOSTUNREACH:
1636                 code = ICMP_HOST_UNREACH;
1637                 break;
1638         case ENETUNREACH:
1639                 code = ICMP_NET_UNREACH;
1640                 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
1641                 break;
1642         case EACCES:
1643                 code = ICMP_PKT_FILTERED;
1644                 break;
1645         }
1646
1647         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
1648
1649         send = true;
1650         if (peer) {
1651                 now = jiffies;
1652                 peer->rate_tokens += now - peer->rate_last;
1653                 if (peer->rate_tokens > ip_rt_error_burst)
1654                         peer->rate_tokens = ip_rt_error_burst;
1655                 peer->rate_last = now;
1656                 if (peer->rate_tokens >= ip_rt_error_cost)
1657                         peer->rate_tokens -= ip_rt_error_cost;
1658                 else
1659                         send = false;
1660                 inet_putpeer(peer);
1661         }
1662         if (send)
1663                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1664
1665 out:    kfree_skb(skb);
1666         return 0;
1667 }
1668
1669 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1670 {
1671         struct fib_result res;
1672
1673         if (mtu < ip_rt_min_pmtu)
1674                 mtu = ip_rt_min_pmtu;
1675
1676         if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
1677                 struct fib_nh *nh = &FIB_RES_NH(res);
1678
1679                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
1680                                       jiffies + ip_rt_mtu_expires);
1681         }
1682         rt->rt_pmtu = mtu;
1683         dst_set_expires(&rt->dst, ip_rt_mtu_expires);
1684 }
1685
1686 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1687                               struct sk_buff *skb, u32 mtu)
1688 {
1689         struct rtable *rt = (struct rtable *) dst;
1690         struct flowi4 fl4;
1691
1692         ip_rt_build_flow_key(&fl4, sk, skb);
1693         __ip_rt_update_pmtu(rt, &fl4, mtu);
1694 }
1695
1696 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1697                       int oif, u32 mark, u8 protocol, int flow_flags)
1698 {
1699         const struct iphdr *iph = (const struct iphdr *) skb->data;
1700         struct flowi4 fl4;
1701         struct rtable *rt;
1702
1703         __build_flow_key(&fl4, NULL, iph, oif,
1704                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1705         rt = __ip_route_output_key(net, &fl4);
1706         if (!IS_ERR(rt)) {
1707                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1708                 ip_rt_put(rt);
1709         }
1710 }
1711 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1712
1713 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1714 {
1715         const struct iphdr *iph = (const struct iphdr *) skb->data;
1716         struct flowi4 fl4;
1717         struct rtable *rt;
1718
1719         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1720         rt = __ip_route_output_key(sock_net(sk), &fl4);
1721         if (!IS_ERR(rt)) {
1722                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1723                 ip_rt_put(rt);
1724         }
1725 }
1726 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1727
1728 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1729                    int oif, u32 mark, u8 protocol, int flow_flags)
1730 {
1731         const struct iphdr *iph = (const struct iphdr *) skb->data;
1732         struct flowi4 fl4;
1733         struct rtable *rt;
1734
1735         __build_flow_key(&fl4, NULL, iph, oif,
1736                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1737         rt = __ip_route_output_key(net, &fl4);
1738         if (!IS_ERR(rt)) {
1739                 __ip_do_redirect(rt, skb, &fl4);
1740                 ip_rt_put(rt);
1741         }
1742 }
1743 EXPORT_SYMBOL_GPL(ipv4_redirect);
1744
1745 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1746 {
1747         const struct iphdr *iph = (const struct iphdr *) skb->data;
1748         struct flowi4 fl4;
1749         struct rtable *rt;
1750
1751         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1752         rt = __ip_route_output_key(sock_net(sk), &fl4);
1753         if (!IS_ERR(rt)) {
1754                 __ip_do_redirect(rt, skb, &fl4);
1755                 ip_rt_put(rt);
1756         }
1757 }
1758 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1759
1760 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1761 {
1762         struct rtable *rt = (struct rtable *) dst;
1763
1764         if (rt_is_expired(rt))
1765                 return NULL;
1766         return dst;
1767 }
1768
1769 static void ipv4_dst_destroy(struct dst_entry *dst)
1770 {
1771         struct rtable *rt = (struct rtable *) dst;
1772
1773         if (rt->fi) {
1774                 fib_info_put(rt->fi);
1775                 rt->fi = NULL;
1776         }
1777 }
1778
1779
1780 static void ipv4_link_failure(struct sk_buff *skb)
1781 {
1782         struct rtable *rt;
1783
1784         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1785
1786         rt = skb_rtable(skb);
1787         if (rt)
1788                 dst_set_expires(&rt->dst, 0);
1789 }
1790
1791 static int ip_rt_bug(struct sk_buff *skb)
1792 {
1793         pr_debug("%s: %pI4 -> %pI4, %s\n",
1794                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1795                  skb->dev ? skb->dev->name : "?");
1796         kfree_skb(skb);
1797         WARN_ON(1);
1798         return 0;
1799 }
1800
1801 /*
1802    We do not cache source address of outgoing interface,
1803    because it is used only by IP RR, TS and SRR options,
1804    so that it out of fast path.
1805
1806    BTW remember: "addr" is allowed to be not aligned
1807    in IP options!
1808  */
1809
1810 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1811 {
1812         __be32 src;
1813
1814         if (rt_is_output_route(rt))
1815                 src = ip_hdr(skb)->saddr;
1816         else {
1817                 struct fib_result res;
1818                 struct flowi4 fl4;
1819                 struct iphdr *iph;
1820
1821                 iph = ip_hdr(skb);
1822
1823                 memset(&fl4, 0, sizeof(fl4));
1824                 fl4.daddr = iph->daddr;
1825                 fl4.saddr = iph->saddr;
1826                 fl4.flowi4_tos = RT_TOS(iph->tos);
1827                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1828                 fl4.flowi4_iif = skb->dev->ifindex;
1829                 fl4.flowi4_mark = skb->mark;
1830
1831                 rcu_read_lock();
1832                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1833                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1834                 else
1835                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1836                                         RT_SCOPE_UNIVERSE);
1837                 rcu_read_unlock();
1838         }
1839         memcpy(addr, &src, 4);
1840 }
1841
1842 #ifdef CONFIG_IP_ROUTE_CLASSID
1843 static void set_class_tag(struct rtable *rt, u32 tag)
1844 {
1845         if (!(rt->dst.tclassid & 0xFFFF))
1846                 rt->dst.tclassid |= tag & 0xFFFF;
1847         if (!(rt->dst.tclassid & 0xFFFF0000))
1848                 rt->dst.tclassid |= tag & 0xFFFF0000;
1849 }
1850 #endif
1851
1852 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1853 {
1854         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1855
1856         if (advmss == 0) {
1857                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1858                                ip_rt_min_advmss);
1859                 if (advmss > 65535 - 40)
1860                         advmss = 65535 - 40;
1861         }
1862         return advmss;
1863 }
1864
1865 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1866 {
1867         const struct rtable *rt = (const struct rtable *) dst;
1868         unsigned int mtu = rt->rt_pmtu;
1869
1870         if (mtu && time_after_eq(jiffies, rt->dst.expires))
1871                 mtu = 0;
1872
1873         if (!mtu)
1874                 mtu = dst_metric_raw(dst, RTAX_MTU);
1875
1876         if (mtu && rt_is_output_route(rt))
1877                 return mtu;
1878
1879         mtu = dst->dev->mtu;
1880
1881         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1882
1883                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1884                         mtu = 576;
1885         }
1886
1887         if (mtu > IP_MAX_MTU)
1888                 mtu = IP_MAX_MTU;
1889
1890         return mtu;
1891 }
1892
1893 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1894                             struct fib_info *fi)
1895 {
1896         if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1897                 rt->fi = fi;
1898                 atomic_inc(&fi->fib_clntref);
1899         }
1900         dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1901 }
1902
1903 static void rt_bind_exception(struct rtable *rt, struct fib_nh *nh, __be32 daddr)
1904 {
1905         struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1906         struct fib_nh_exception *fnhe;
1907         u32 hval;
1908
1909         hval = fnhe_hashfun(daddr);
1910
1911 restart:
1912         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1913              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1914                 __be32 fnhe_daddr, gw;
1915                 unsigned long expires;
1916                 unsigned int seq;
1917                 u32 pmtu;
1918
1919                 seq = read_seqbegin(&fnhe_seqlock);
1920                 fnhe_daddr = fnhe->fnhe_daddr;
1921                 gw = fnhe->fnhe_gw;
1922                 pmtu = fnhe->fnhe_pmtu;
1923                 expires = fnhe->fnhe_expires;
1924                 if (read_seqretry(&fnhe_seqlock, seq))
1925                         goto restart;
1926                 if (daddr != fnhe_daddr)
1927                         continue;
1928                 if (pmtu) {
1929                         unsigned long diff = jiffies - expires;
1930
1931                         if (time_before(jiffies, expires)) {
1932                                 rt->rt_pmtu = pmtu;
1933                                 dst_set_expires(&rt->dst, diff);
1934                         }
1935                 }
1936                 if (gw)
1937                         rt->rt_gateway = gw;
1938                 fnhe->fnhe_stamp = jiffies;
1939                 break;
1940         }
1941 }
1942
1943 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1944                            const struct fib_result *res,
1945                            struct fib_info *fi, u16 type, u32 itag)
1946 {
1947         if (fi) {
1948                 struct fib_nh *nh = &FIB_RES_NH(*res);
1949
1950                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
1951                         rt->rt_gateway = nh->nh_gw;
1952                 if (unlikely(nh->nh_exceptions))
1953                         rt_bind_exception(rt, nh, fl4->daddr);
1954                 rt_init_metrics(rt, fl4, fi);
1955 #ifdef CONFIG_IP_ROUTE_CLASSID
1956                 rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1957 #endif
1958         }
1959
1960 #ifdef CONFIG_IP_ROUTE_CLASSID
1961 #ifdef CONFIG_IP_MULTIPLE_TABLES
1962         set_class_tag(rt, res->tclassid);
1963 #endif
1964         set_class_tag(rt, itag);
1965 #endif
1966 }
1967
1968 static struct rtable *rt_dst_alloc(struct net_device *dev,
1969                                    bool nopolicy, bool noxfrm)
1970 {
1971         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1972                          DST_HOST |
1973                          (nopolicy ? DST_NOPOLICY : 0) |
1974                          (noxfrm ? DST_NOXFRM : 0));
1975 }
1976
1977 /* called in rcu_read_lock() section */
1978 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1979                                 u8 tos, struct net_device *dev, int our)
1980 {
1981         unsigned int hash;
1982         struct rtable *rth;
1983         struct in_device *in_dev = __in_dev_get_rcu(dev);
1984         u32 itag = 0;
1985         int err;
1986
1987         /* Primary sanity checks. */
1988
1989         if (in_dev == NULL)
1990                 return -EINVAL;
1991
1992         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1993             skb->protocol != htons(ETH_P_IP))
1994                 goto e_inval;
1995
1996         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1997                 if (ipv4_is_loopback(saddr))
1998                         goto e_inval;
1999
2000         if (ipv4_is_zeronet(saddr)) {
2001                 if (!ipv4_is_local_multicast(daddr))
2002                         goto e_inval;
2003         } else {
2004                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2005                                           in_dev, &itag);
2006                 if (err < 0)
2007                         goto e_err;
2008         }
2009         rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
2010                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2011         if (!rth)
2012                 goto e_nobufs;
2013
2014 #ifdef CONFIG_IP_ROUTE_CLASSID
2015         rth->dst.tclassid = itag;
2016 #endif
2017         rth->dst.output = ip_rt_bug;
2018
2019         rth->rt_key_dst = daddr;
2020         rth->rt_key_src = saddr;
2021         rth->rt_genid   = rt_genid(dev_net(dev));
2022         rth->rt_flags   = RTCF_MULTICAST;
2023         rth->rt_type    = RTN_MULTICAST;
2024         rth->rt_key_tos = tos;
2025         rth->rt_dst     = daddr;
2026         rth->rt_src     = saddr;
2027         rth->rt_route_iif = dev->ifindex;
2028         rth->rt_iif     = dev->ifindex;
2029         rth->rt_oif     = 0;
2030         rth->rt_mark    = skb->mark;
2031         rth->rt_pmtu    = 0;
2032         rth->rt_gateway = daddr;
2033         rth->fi = NULL;
2034         if (our) {
2035                 rth->dst.input= ip_local_deliver;
2036                 rth->rt_flags |= RTCF_LOCAL;
2037         }
2038
2039 #ifdef CONFIG_IP_MROUTE
2040         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2041                 rth->dst.input = ip_mr_input;
2042 #endif
2043         RT_CACHE_STAT_INC(in_slow_mc);
2044
2045         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2046         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2047         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2048
2049 e_nobufs:
2050         return -ENOBUFS;
2051 e_inval:
2052         return -EINVAL;
2053 e_err:
2054         return err;
2055 }
2056
2057
2058 static void ip_handle_martian_source(struct net_device *dev,
2059                                      struct in_device *in_dev,
2060                                      struct sk_buff *skb,
2061                                      __be32 daddr,
2062                                      __be32 saddr)
2063 {
2064         RT_CACHE_STAT_INC(in_martian_src);
2065 #ifdef CONFIG_IP_ROUTE_VERBOSE
2066         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2067                 /*
2068                  *      RFC1812 recommendation, if source is martian,
2069                  *      the only hint is MAC header.
2070                  */
2071                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
2072                         &daddr, &saddr, dev->name);
2073                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2074                         print_hex_dump(KERN_WARNING, "ll header: ",
2075                                        DUMP_PREFIX_OFFSET, 16, 1,
2076                                        skb_mac_header(skb),
2077                                        dev->hard_header_len, true);
2078                 }
2079         }
2080 #endif
2081 }
2082
2083 /* called in rcu_read_lock() section */
2084 static int __mkroute_input(struct sk_buff *skb,
2085                            const struct fib_result *res,
2086                            struct in_device *in_dev,
2087                            __be32 daddr, __be32 saddr, u32 tos,
2088                            struct rtable **result)
2089 {
2090         struct rtable *rth;
2091         int err;
2092         struct in_device *out_dev;
2093         unsigned int flags = 0;
2094         u32 itag;
2095
2096         /* get a working reference to the output device */
2097         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2098         if (out_dev == NULL) {
2099                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
2100                 return -EINVAL;
2101         }
2102
2103
2104         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2105                                   in_dev->dev, in_dev, &itag);
2106         if (err < 0) {
2107                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2108                                          saddr);
2109
2110                 goto cleanup;
2111         }
2112
2113         if (err)
2114                 flags |= RTCF_DIRECTSRC;
2115
2116         if (out_dev == in_dev && err &&
2117             (IN_DEV_SHARED_MEDIA(out_dev) ||
2118              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2119                 flags |= RTCF_DOREDIRECT;
2120
2121         if (skb->protocol != htons(ETH_P_IP)) {
2122                 /* Not IP (i.e. ARP). Do not create route, if it is
2123                  * invalid for proxy arp. DNAT routes are always valid.
2124                  *
2125                  * Proxy arp feature have been extended to allow, ARP
2126                  * replies back to the same interface, to support
2127                  * Private VLAN switch technologies. See arp.c.
2128                  */
2129                 if (out_dev == in_dev &&
2130                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2131                         err = -EINVAL;
2132                         goto cleanup;
2133                 }
2134         }
2135
2136         rth = rt_dst_alloc(out_dev->dev,
2137                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2138                            IN_DEV_CONF_GET(out_dev, NOXFRM));
2139         if (!rth) {
2140                 err = -ENOBUFS;
2141                 goto cleanup;
2142         }
2143
2144         rth->rt_key_dst = daddr;
2145         rth->rt_key_src = saddr;
2146         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2147         rth->rt_flags = flags;
2148         rth->rt_type = res->type;
2149         rth->rt_key_tos = tos;
2150         rth->rt_dst     = daddr;
2151         rth->rt_src     = saddr;
2152         rth->rt_route_iif = in_dev->dev->ifindex;
2153         rth->rt_iif     = in_dev->dev->ifindex;
2154         rth->rt_oif     = 0;
2155         rth->rt_mark    = skb->mark;
2156         rth->rt_pmtu    = 0;
2157         rth->rt_gateway = daddr;
2158         rth->fi = NULL;
2159
2160         rth->dst.input = ip_forward;
2161         rth->dst.output = ip_output;
2162
2163         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2164
2165         *result = rth;
2166         err = 0;
2167  cleanup:
2168         return err;
2169 }
2170
2171 static int ip_mkroute_input(struct sk_buff *skb,
2172                             struct fib_result *res,
2173                             const struct flowi4 *fl4,
2174                             struct in_device *in_dev,
2175                             __be32 daddr, __be32 saddr, u32 tos)
2176 {
2177         struct rtable *rth = NULL;
2178         int err;
2179         unsigned int hash;
2180
2181 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2182         if (res->fi && res->fi->fib_nhs > 1)
2183                 fib_select_multipath(res);
2184 #endif
2185
2186         /* create a routing cache entry */
2187         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2188         if (err)
2189                 return err;
2190
2191         /* put it into the cache */
2192         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2193                        rt_genid(dev_net(rth->dst.dev)));
2194         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2195         if (IS_ERR(rth))
2196                 return PTR_ERR(rth);
2197         return 0;
2198 }
2199
2200 /*
2201  *      NOTE. We drop all the packets that has local source
2202  *      addresses, because every properly looped back packet
2203  *      must have correct destination already attached by output routine.
2204  *
2205  *      Such approach solves two big problems:
2206  *      1. Not simplex devices are handled properly.
2207  *      2. IP spoofing attempts are filtered with 100% of guarantee.
2208  *      called with rcu_read_lock()
2209  */
2210
2211 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2212                                u8 tos, struct net_device *dev)
2213 {
2214         struct fib_result res;
2215         struct in_device *in_dev = __in_dev_get_rcu(dev);
2216         struct flowi4   fl4;
2217         unsigned int    flags = 0;
2218         u32             itag = 0;
2219         struct rtable   *rth;
2220         unsigned int    hash;
2221         int             err = -EINVAL;
2222         struct net    *net = dev_net(dev);
2223
2224         /* IP on this device is disabled. */
2225
2226         if (!in_dev)
2227                 goto out;
2228
2229         /* Check for the most weird martians, which can be not detected
2230            by fib_lookup.
2231          */
2232
2233         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2234                 goto martian_source;
2235
2236         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2237                 goto brd_input;
2238
2239         /* Accept zero addresses only to limited broadcast;
2240          * I even do not know to fix it or not. Waiting for complains :-)
2241          */
2242         if (ipv4_is_zeronet(saddr))
2243                 goto martian_source;
2244
2245         if (ipv4_is_zeronet(daddr))
2246                 goto martian_destination;
2247
2248         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
2249                 if (ipv4_is_loopback(daddr))
2250                         goto martian_destination;
2251
2252                 if (ipv4_is_loopback(saddr))
2253                         goto martian_source;
2254         }
2255
2256         /*
2257          *      Now we are ready to route packet.
2258          */
2259         fl4.flowi4_oif = 0;
2260         fl4.flowi4_iif = dev->ifindex;
2261         fl4.flowi4_mark = skb->mark;
2262         fl4.flowi4_tos = tos;
2263         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2264         fl4.daddr = daddr;
2265         fl4.saddr = saddr;
2266         err = fib_lookup(net, &fl4, &res);
2267         if (err != 0)
2268                 goto no_route;
2269
2270         RT_CACHE_STAT_INC(in_slow_tot);
2271
2272         if (res.type == RTN_BROADCAST)
2273                 goto brd_input;
2274
2275         if (res.type == RTN_LOCAL) {
2276                 err = fib_validate_source(skb, saddr, daddr, tos,
2277                                           net->loopback_dev->ifindex,
2278                                           dev, in_dev, &itag);
2279                 if (err < 0)
2280                         goto martian_source_keep_err;
2281                 if (err)
2282                         flags |= RTCF_DIRECTSRC;
2283                 goto local_input;
2284         }
2285
2286         if (!IN_DEV_FORWARD(in_dev))
2287                 goto no_route;
2288         if (res.type != RTN_UNICAST)
2289                 goto martian_destination;
2290
2291         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2292 out:    return err;
2293
2294 brd_input:
2295         if (skb->protocol != htons(ETH_P_IP))
2296                 goto e_inval;
2297
2298         if (!ipv4_is_zeronet(saddr)) {
2299                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2300                                           in_dev, &itag);
2301                 if (err < 0)
2302                         goto martian_source_keep_err;
2303                 if (err)
2304                         flags |= RTCF_DIRECTSRC;
2305         }
2306         flags |= RTCF_BROADCAST;
2307         res.type = RTN_BROADCAST;
2308         RT_CACHE_STAT_INC(in_brd);
2309
2310 local_input:
2311         rth = rt_dst_alloc(net->loopback_dev,
2312                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2313         if (!rth)
2314                 goto e_nobufs;
2315
2316         rth->dst.input= ip_local_deliver;
2317         rth->dst.output= ip_rt_bug;
2318 #ifdef CONFIG_IP_ROUTE_CLASSID
2319         rth->dst.tclassid = itag;
2320 #endif
2321
2322         rth->rt_key_dst = daddr;
2323         rth->rt_key_src = saddr;
2324         rth->rt_genid = rt_genid(net);
2325         rth->rt_flags   = flags|RTCF_LOCAL;
2326         rth->rt_type    = res.type;
2327         rth->rt_key_tos = tos;
2328         rth->rt_dst     = daddr;
2329         rth->rt_src     = saddr;
2330         rth->rt_route_iif = dev->ifindex;
2331         rth->rt_iif     = dev->ifindex;
2332         rth->rt_oif     = 0;
2333         rth->rt_mark    = skb->mark;
2334         rth->rt_pmtu    = 0;
2335         rth->rt_gateway = daddr;
2336         rth->fi = NULL;
2337         if (res.type == RTN_UNREACHABLE) {
2338                 rth->dst.input= ip_error;
2339                 rth->dst.error= -err;
2340                 rth->rt_flags   &= ~RTCF_LOCAL;
2341         }
2342         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2343         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2344         err = 0;
2345         if (IS_ERR(rth))
2346                 err = PTR_ERR(rth);
2347         goto out;
2348
2349 no_route:
2350         RT_CACHE_STAT_INC(in_no_route);
2351         res.type = RTN_UNREACHABLE;
2352         if (err == -ESRCH)
2353                 err = -ENETUNREACH;
2354         goto local_input;
2355
2356         /*
2357          *      Do not cache martian addresses: they should be logged (RFC1812)
2358          */
2359 martian_destination:
2360         RT_CACHE_STAT_INC(in_martian_dst);
2361 #ifdef CONFIG_IP_ROUTE_VERBOSE
2362         if (IN_DEV_LOG_MARTIANS(in_dev))
2363                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2364                                      &daddr, &saddr, dev->name);
2365 #endif
2366
2367 e_inval:
2368         err = -EINVAL;
2369         goto out;
2370
2371 e_nobufs:
2372         err = -ENOBUFS;
2373         goto out;
2374
2375 martian_source:
2376         err = -EINVAL;
2377 martian_source_keep_err:
2378         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2379         goto out;
2380 }
2381
2382 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2383                            u8 tos, struct net_device *dev, bool noref)
2384 {
2385         struct rtable   *rth;
2386         unsigned int    hash;
2387         int iif = dev->ifindex;
2388         struct net *net;
2389         int res;
2390
2391         net = dev_net(dev);
2392
2393         rcu_read_lock();
2394
2395         if (!rt_caching(net))
2396                 goto skip_cache;
2397
2398         tos &= IPTOS_RT_MASK;
2399         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2400
2401         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2402              rth = rcu_dereference(rth->dst.rt_next)) {
2403                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2404                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2405                      (rth->rt_route_iif ^ iif) |
2406                      (rth->rt_key_tos ^ tos)) == 0 &&
2407                     rth->rt_mark == skb->mark &&
2408                     net_eq(dev_net(rth->dst.dev), net) &&
2409                     !rt_is_expired(rth)) {
2410                         if (noref) {
2411                                 dst_use_noref(&rth->dst, jiffies);
2412                                 skb_dst_set_noref(skb, &rth->dst);
2413                         } else {
2414                                 dst_use(&rth->dst, jiffies);
2415                                 skb_dst_set(skb, &rth->dst);
2416                         }
2417                         RT_CACHE_STAT_INC(in_hit);
2418                         rcu_read_unlock();
2419                         return 0;
2420                 }
2421                 RT_CACHE_STAT_INC(in_hlist_search);
2422         }
2423
2424 skip_cache:
2425         /* Multicast recognition logic is moved from route cache to here.
2426            The problem was that too many Ethernet cards have broken/missing
2427            hardware multicast filters :-( As result the host on multicasting
2428            network acquires a lot of useless route cache entries, sort of
2429            SDR messages from all the world. Now we try to get rid of them.
2430            Really, provided software IP multicast filter is organized
2431            reasonably (at least, hashed), it does not result in a slowdown
2432            comparing with route cache reject entries.
2433            Note, that multicast routers are not affected, because
2434            route cache entry is created eventually.
2435          */
2436         if (ipv4_is_multicast(daddr)) {
2437                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2438
2439                 if (in_dev) {
2440                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2441                                                   ip_hdr(skb)->protocol);
2442                         if (our
2443 #ifdef CONFIG_IP_MROUTE
2444                                 ||
2445                             (!ipv4_is_local_multicast(daddr) &&
2446                              IN_DEV_MFORWARD(in_dev))
2447 #endif
2448                            ) {
2449                                 int res = ip_route_input_mc(skb, daddr, saddr,
2450                                                             tos, dev, our);
2451                                 rcu_read_unlock();
2452                                 return res;
2453                         }
2454                 }
2455                 rcu_read_unlock();
2456                 return -EINVAL;
2457         }
2458         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2459         rcu_read_unlock();
2460         return res;
2461 }
2462 EXPORT_SYMBOL(ip_route_input_common);
2463
2464 /* called with rcu_read_lock() */
2465 static struct rtable *__mkroute_output(const struct fib_result *res,
2466                                        const struct flowi4 *fl4,
2467                                        __be32 orig_daddr, __be32 orig_saddr,
2468                                        int orig_oif, __u8 orig_rtos,
2469                                        struct net_device *dev_out,
2470                                        unsigned int flags)
2471 {
2472         struct fib_info *fi = res->fi;
2473         struct in_device *in_dev;
2474         u16 type = res->type;
2475         struct rtable *rth;
2476
2477         in_dev = __in_dev_get_rcu(dev_out);
2478         if (!in_dev)
2479                 return ERR_PTR(-EINVAL);
2480
2481         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2482                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2483                         return ERR_PTR(-EINVAL);
2484
2485         if (ipv4_is_lbcast(fl4->daddr))
2486                 type = RTN_BROADCAST;
2487         else if (ipv4_is_multicast(fl4->daddr))
2488                 type = RTN_MULTICAST;
2489         else if (ipv4_is_zeronet(fl4->daddr))
2490                 return ERR_PTR(-EINVAL);
2491
2492         if (dev_out->flags & IFF_LOOPBACK)
2493                 flags |= RTCF_LOCAL;
2494
2495         if (type == RTN_BROADCAST) {
2496                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2497                 fi = NULL;
2498         } else if (type == RTN_MULTICAST) {
2499                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2500                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2501                                      fl4->flowi4_proto))
2502                         flags &= ~RTCF_LOCAL;
2503                 /* If multicast route do not exist use
2504                  * default one, but do not gateway in this case.
2505                  * Yes, it is hack.
2506                  */
2507                 if (fi && res->prefixlen < 4)
2508                         fi = NULL;
2509         }
2510
2511         rth = rt_dst_alloc(dev_out,
2512                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2513                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2514         if (!rth)
2515                 return ERR_PTR(-ENOBUFS);
2516
2517         rth->dst.output = ip_output;
2518
2519         rth->rt_key_dst = orig_daddr;
2520         rth->rt_key_src = orig_saddr;
2521         rth->rt_genid = rt_genid(dev_net(dev_out));
2522         rth->rt_flags   = flags;
2523         rth->rt_type    = type;
2524         rth->rt_key_tos = orig_rtos;
2525         rth->rt_dst     = fl4->daddr;
2526         rth->rt_src     = fl4->saddr;
2527         rth->rt_route_iif = 0;
2528         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2529         rth->rt_oif     = orig_oif;
2530         rth->rt_mark    = fl4->flowi4_mark;
2531         rth->rt_pmtu    = 0;
2532         rth->rt_gateway = fl4->daddr;
2533         rth->fi = NULL;
2534
2535         RT_CACHE_STAT_INC(out_slow_tot);
2536
2537         if (flags & RTCF_LOCAL)
2538                 rth->dst.input = ip_local_deliver;
2539         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2540                 if (flags & RTCF_LOCAL &&
2541                     !(dev_out->flags & IFF_LOOPBACK)) {
2542                         rth->dst.output = ip_mc_output;
2543                         RT_CACHE_STAT_INC(out_slow_mc);
2544                 }
2545 #ifdef CONFIG_IP_MROUTE
2546                 if (type == RTN_MULTICAST) {
2547                         if (IN_DEV_MFORWARD(in_dev) &&
2548                             !ipv4_is_local_multicast(fl4->daddr)) {
2549                                 rth->dst.input = ip_mr_input;
2550                                 rth->dst.output = ip_mc_output;
2551                         }
2552                 }
2553 #endif
2554         }
2555
2556         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2557
2558         if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE)
2559                 rth->dst.flags |= DST_NOCACHE;
2560
2561         return rth;
2562 }
2563
2564 /*
2565  * Major route resolver routine.
2566  * called with rcu_read_lock();
2567  */
2568
2569 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2570 {
2571         struct net_device *dev_out = NULL;
2572         __u8 tos = RT_FL_TOS(fl4);
2573         unsigned int flags = 0;
2574         struct fib_result res;
2575         struct rtable *rth;
2576         __be32 orig_daddr;
2577         __be32 orig_saddr;
2578         int orig_oif;
2579
2580         res.tclassid    = 0;
2581         res.fi          = NULL;
2582         res.table       = NULL;
2583
2584         orig_daddr = fl4->daddr;
2585         orig_saddr = fl4->saddr;
2586         orig_oif = fl4->flowi4_oif;
2587
2588         fl4->flowi4_iif = net->loopback_dev->ifindex;
2589         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2590         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2591                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2592
2593         rcu_read_lock();
2594         if (fl4->saddr) {
2595                 rth = ERR_PTR(-EINVAL);
2596                 if (ipv4_is_multicast(fl4->saddr) ||
2597                     ipv4_is_lbcast(fl4->saddr) ||
2598                     ipv4_is_zeronet(fl4->saddr))
2599                         goto out;
2600
2601                 /* I removed check for oif == dev_out->oif here.
2602                    It was wrong for two reasons:
2603                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2604                       is assigned to multiple interfaces.
2605                    2. Moreover, we are allowed to send packets with saddr
2606                       of another iface. --ANK
2607                  */
2608
2609                 if (fl4->flowi4_oif == 0 &&
2610                     (ipv4_is_multicast(fl4->daddr) ||
2611                      ipv4_is_lbcast(fl4->daddr))) {
2612                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2613                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2614                         if (dev_out == NULL)
2615                                 goto out;
2616
2617                         /* Special hack: user can direct multicasts
2618                            and limited broadcast via necessary interface
2619                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2620                            This hack is not just for fun, it allows
2621                            vic,vat and friends to work.
2622                            They bind socket to loopback, set ttl to zero
2623                            and expect that it will work.
2624                            From the viewpoint of routing cache they are broken,
2625                            because we are not allowed to build multicast path
2626                            with loopback source addr (look, routing cache
2627                            cannot know, that ttl is zero, so that packet
2628                            will not leave this host and route is valid).
2629                            Luckily, this hack is good workaround.
2630                          */
2631
2632                         fl4->flowi4_oif = dev_out->ifindex;
2633                         goto make_route;
2634                 }
2635
2636                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2637                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2638                         if (!__ip_dev_find(net, fl4->saddr, false))
2639                                 goto out;
2640                 }
2641         }
2642
2643
2644         if (fl4->flowi4_oif) {
2645                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2646                 rth = ERR_PTR(-ENODEV);
2647                 if (dev_out == NULL)
2648                         goto out;
2649
2650                 /* RACE: Check return value of inet_select_addr instead. */
2651                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2652                         rth = ERR_PTR(-ENETUNREACH);
2653                         goto out;
2654                 }
2655                 if (ipv4_is_local_multicast(fl4->daddr) ||
2656                     ipv4_is_lbcast(fl4->daddr)) {
2657                         if (!fl4->saddr)
2658                                 fl4->saddr = inet_select_addr(dev_out, 0,
2659                                                               RT_SCOPE_LINK);
2660                         goto make_route;
2661                 }
2662                 if (fl4->saddr) {
2663                         if (ipv4_is_multicast(fl4->daddr))
2664                                 fl4->saddr = inet_select_addr(dev_out, 0,
2665                                                               fl4->flowi4_scope);
2666                         else if (!fl4->daddr)
2667                                 fl4->saddr = inet_select_addr(dev_out, 0,
2668                                                               RT_SCOPE_HOST);
2669                 }
2670         }
2671
2672         if (!fl4->daddr) {
2673                 fl4->daddr = fl4->saddr;
2674                 if (!fl4->daddr)
2675                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2676                 dev_out = net->loopback_dev;
2677                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2678                 res.type = RTN_LOCAL;
2679                 flags |= RTCF_LOCAL;
2680                 goto make_route;
2681         }
2682
2683         if (fib_lookup(net, fl4, &res)) {
2684                 res.fi = NULL;
2685                 res.table = NULL;
2686                 if (fl4->flowi4_oif) {
2687                         /* Apparently, routing tables are wrong. Assume,
2688                            that the destination is on link.
2689
2690                            WHY? DW.
2691                            Because we are allowed to send to iface
2692                            even if it has NO routes and NO assigned
2693                            addresses. When oif is specified, routing
2694                            tables are looked up with only one purpose:
2695                            to catch if destination is gatewayed, rather than
2696                            direct. Moreover, if MSG_DONTROUTE is set,
2697                            we send packet, ignoring both routing tables
2698                            and ifaddr state. --ANK
2699
2700
2701                            We could make it even if oif is unknown,
2702                            likely IPv6, but we do not.
2703                          */
2704
2705                         if (fl4->saddr == 0)
2706                                 fl4->saddr = inet_select_addr(dev_out, 0,
2707                                                               RT_SCOPE_LINK);
2708                         res.type = RTN_UNICAST;
2709                         goto make_route;
2710                 }
2711                 rth = ERR_PTR(-ENETUNREACH);
2712                 goto out;
2713         }
2714
2715         if (res.type == RTN_LOCAL) {
2716                 if (!fl4->saddr) {
2717                         if (res.fi->fib_prefsrc)
2718                                 fl4->saddr = res.fi->fib_prefsrc;
2719                         else
2720                                 fl4->saddr = fl4->daddr;
2721                 }
2722                 dev_out = net->loopback_dev;
2723                 fl4->flowi4_oif = dev_out->ifindex;
2724                 res.fi = NULL;
2725                 flags |= RTCF_LOCAL;
2726                 goto make_route;
2727         }
2728
2729 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2730         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2731                 fib_select_multipath(&res);
2732         else
2733 #endif
2734         if (!res.prefixlen &&
2735             res.table->tb_num_default > 1 &&
2736             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2737                 fib_select_default(&res);
2738
2739         if (!fl4->saddr)
2740                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2741
2742         dev_out = FIB_RES_DEV(res);
2743         fl4->flowi4_oif = dev_out->ifindex;
2744
2745
2746 make_route:
2747         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2748                                tos, dev_out, flags);
2749         if (!IS_ERR(rth)) {
2750                 unsigned int hash;
2751
2752                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2753                                rt_genid(dev_net(dev_out)));
2754                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2755         }
2756
2757 out:
2758         rcu_read_unlock();
2759         return rth;
2760 }
2761
2762 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2763 {
2764         struct rtable *rth;
2765         unsigned int hash;
2766
2767         if (!rt_caching(net))
2768                 goto slow_output;
2769
2770         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2771
2772         rcu_read_lock_bh();
2773         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2774                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2775                 if (rth->rt_key_dst == flp4->daddr &&
2776                     rth->rt_key_src == flp4->saddr &&
2777                     rt_is_output_route(rth) &&
2778                     rth->rt_oif == flp4->flowi4_oif &&
2779                     rth->rt_mark == flp4->flowi4_mark &&
2780                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2781                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2782                     net_eq(dev_net(rth->dst.dev), net) &&
2783                     !rt_is_expired(rth)) {
2784                         dst_use(&rth->dst, jiffies);
2785                         RT_CACHE_STAT_INC(out_hit);
2786                         rcu_read_unlock_bh();
2787                         if (!flp4->saddr)
2788                                 flp4->saddr = rth->rt_src;
2789                         if (!flp4->daddr)
2790                                 flp4->daddr = rth->rt_dst;
2791                         return rth;
2792                 }
2793                 RT_CACHE_STAT_INC(out_hlist_search);
2794         }
2795         rcu_read_unlock_bh();
2796
2797 slow_output:
2798         return ip_route_output_slow(net, flp4);
2799 }
2800 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2801
2802 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2803 {
2804         return NULL;
2805 }
2806
2807 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2808 {
2809         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2810
2811         return mtu ? : dst->dev->mtu;
2812 }
2813
2814 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2815                                           struct sk_buff *skb, u32 mtu)
2816 {
2817 }
2818
2819 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2820                                        struct sk_buff *skb)
2821 {
2822 }
2823
2824 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2825                                           unsigned long old)
2826 {
2827         return NULL;
2828 }
2829
2830 static struct dst_ops ipv4_dst_blackhole_ops = {
2831         .family                 =       AF_INET,
2832         .protocol               =       cpu_to_be16(ETH_P_IP),
2833         .destroy                =       ipv4_dst_destroy,
2834         .check                  =       ipv4_blackhole_dst_check,
2835         .mtu                    =       ipv4_blackhole_mtu,
2836         .default_advmss         =       ipv4_default_advmss,
2837         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2838         .redirect               =       ipv4_rt_blackhole_redirect,
2839         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2840         .neigh_lookup           =       ipv4_neigh_lookup,
2841 };
2842
2843 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2844 {
2845         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2846         struct rtable *ort = (struct rtable *) dst_orig;
2847
2848         if (rt) {
2849                 struct dst_entry *new = &rt->dst;
2850
2851                 new->__use = 1;
2852                 new->input = dst_discard;
2853                 new->output = dst_discard;
2854
2855                 new->dev = ort->dst.dev;
2856                 if (new->dev)
2857                         dev_hold(new->dev);
2858
2859                 rt->rt_key_dst = ort->rt_key_dst;
2860                 rt->rt_key_src = ort->rt_key_src;
2861                 rt->rt_key_tos = ort->rt_key_tos;
2862                 rt->rt_route_iif = ort->rt_route_iif;
2863                 rt->rt_iif = ort->rt_iif;
2864                 rt->rt_oif = ort->rt_oif;
2865                 rt->rt_mark = ort->rt_mark;
2866                 rt->rt_pmtu = ort->rt_pmtu;
2867
2868                 rt->rt_genid = rt_genid(net);
2869                 rt->rt_flags = ort->rt_flags;
2870                 rt->rt_type = ort->rt_type;
2871                 rt->rt_dst = ort->rt_dst;
2872                 rt->rt_src = ort->rt_src;
2873                 rt->rt_gateway = ort->rt_gateway;
2874                 rt->fi = ort->fi;
2875                 if (rt->fi)
2876                         atomic_inc(&rt->fi->fib_clntref);
2877
2878                 dst_free(new);
2879         }
2880
2881         dst_release(dst_orig);
2882
2883         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2884 }
2885
2886 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2887                                     struct sock *sk)
2888 {
2889         struct rtable *rt = __ip_route_output_key(net, flp4);
2890
2891         if (IS_ERR(rt))
2892                 return rt;
2893
2894         if (flp4->flowi4_proto)
2895                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2896                                                    flowi4_to_flowi(flp4),
2897                                                    sk, 0);
2898
2899         return rt;
2900 }
2901 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2902
2903 static int rt_fill_info(struct net *net,
2904                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2905                         int nowait, unsigned int flags)
2906 {
2907         struct rtable *rt = skb_rtable(skb);
2908         struct rtmsg *r;
2909         struct nlmsghdr *nlh;
2910         unsigned long expires = 0;
2911         u32 error;
2912
2913         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2914         if (nlh == NULL)
2915                 return -EMSGSIZE;
2916
2917         r = nlmsg_data(nlh);
2918         r->rtm_family    = AF_INET;
2919         r->rtm_dst_len  = 32;
2920         r->rtm_src_len  = 0;
2921         r->rtm_tos      = rt->rt_key_tos;
2922         r->rtm_table    = RT_TABLE_MAIN;
2923         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2924                 goto nla_put_failure;
2925         r->rtm_type     = rt->rt_type;
2926         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2927         r->rtm_protocol = RTPROT_UNSPEC;
2928         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2929         if (rt->rt_flags & RTCF_NOTIFY)
2930                 r->rtm_flags |= RTM_F_NOTIFY;
2931
2932         if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
2933                 goto nla_put_failure;
2934         if (rt->rt_key_src) {
2935                 r->rtm_src_len = 32;
2936                 if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
2937                         goto nla_put_failure;
2938         }
2939         if (rt->dst.dev &&
2940             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2941                 goto nla_put_failure;
2942 #ifdef CONFIG_IP_ROUTE_CLASSID
2943         if (rt->dst.tclassid &&
2944             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2945                 goto nla_put_failure;
2946 #endif
2947         if (!rt_is_input_route(rt) &&
2948             rt->rt_src != rt->rt_key_src) {
2949                 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
2950                         goto nla_put_failure;
2951         }
2952         if (rt->rt_dst != rt->rt_gateway &&
2953             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2954                 goto nla_put_failure;
2955
2956         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2957                 goto nla_put_failure;
2958
2959         if (rt->rt_mark &&
2960             nla_put_be32(skb, RTA_MARK, rt->rt_mark))
2961                 goto nla_put_failure;
2962
2963         error = rt->dst.error;
2964         expires = rt->dst.expires;
2965         if (expires) {
2966                 if (time_before(jiffies, expires))
2967                         expires -= jiffies;
2968                 else
2969                         expires = 0;
2970         }
2971
2972         if (rt_is_input_route(rt)) {
2973 #ifdef CONFIG_IP_MROUTE
2974                 __be32 dst = rt->rt_dst;
2975
2976                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2977                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2978                         int err = ipmr_get_route(net, skb,
2979                                                  rt->rt_src, rt->rt_dst,
2980                                                  r, nowait);
2981                         if (err <= 0) {
2982                                 if (!nowait) {
2983                                         if (err == 0)
2984                                                 return 0;
2985                                         goto nla_put_failure;
2986                                 } else {
2987                                         if (err == -EMSGSIZE)
2988                                                 goto nla_put_failure;
2989                                         error = err;
2990                                 }
2991                         }
2992                 } else
2993 #endif
2994                         if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2995                                 goto nla_put_failure;
2996         }
2997
2998         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2999                 goto nla_put_failure;
3000
3001         return nlmsg_end(skb, nlh);
3002
3003 nla_put_failure:
3004         nlmsg_cancel(skb, nlh);
3005         return -EMSGSIZE;
3006 }
3007
3008 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
3009 {
3010         struct net *net = sock_net(in_skb->sk);
3011         struct rtmsg *rtm;
3012         struct nlattr *tb[RTA_MAX+1];
3013         struct rtable *rt = NULL;
3014         __be32 dst = 0;
3015         __be32 src = 0;
3016         u32 iif;
3017         int err;
3018         int mark;
3019         struct sk_buff *skb;
3020
3021         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3022         if (err < 0)
3023                 goto errout;
3024
3025         rtm = nlmsg_data(nlh);
3026
3027         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3028         if (skb == NULL) {
3029                 err = -ENOBUFS;
3030                 goto errout;
3031         }
3032
3033         /* Reserve room for dummy headers, this skb can pass
3034            through good chunk of routing engine.
3035          */
3036         skb_reset_mac_header(skb);
3037         skb_reset_network_header(skb);
3038
3039         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3040         ip_hdr(skb)->protocol = IPPROTO_ICMP;
3041         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3042
3043         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3044         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3045         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3046         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3047
3048         if (iif) {
3049                 struct net_device *dev;
3050
3051                 dev = __dev_get_by_index(net, iif);
3052                 if (dev == NULL) {
3053                         err = -ENODEV;
3054                         goto errout_free;
3055                 }
3056
3057                 skb->protocol   = htons(ETH_P_IP);
3058                 skb->dev        = dev;
3059                 skb->mark       = mark;
3060                 local_bh_disable();
3061                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3062                 local_bh_enable();
3063
3064                 rt = skb_rtable(skb);
3065                 if (err == 0 && rt->dst.error)
3066                         err = -rt->dst.error;
3067         } else {
3068                 struct flowi4 fl4 = {
3069                         .daddr = dst,
3070                         .saddr = src,
3071                         .flowi4_tos = rtm->rtm_tos,
3072                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3073                         .flowi4_mark = mark,
3074                 };
3075                 rt = ip_route_output_key(net, &fl4);
3076
3077                 err = 0;
3078                 if (IS_ERR(rt))
3079                         err = PTR_ERR(rt);
3080         }
3081
3082         if (err)
3083                 goto errout_free;
3084
3085         skb_dst_set(skb, &rt->dst);
3086         if (rtm->rtm_flags & RTM_F_NOTIFY)
3087                 rt->rt_flags |= RTCF_NOTIFY;
3088
3089         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3090                            RTM_NEWROUTE, 0, 0);
3091         if (err <= 0)
3092                 goto errout_free;
3093
3094         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3095 errout:
3096         return err;
3097
3098 errout_free:
3099         kfree_skb(skb);
3100         goto errout;
3101 }
3102
3103 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3104 {
3105         struct rtable *rt;
3106         int h, s_h;
3107         int idx, s_idx;
3108         struct net *net;
3109
3110         net = sock_net(skb->sk);
3111
3112         s_h = cb->args[0];
3113         if (s_h < 0)
3114                 s_h = 0;
3115         s_idx = idx = cb->args[1];
3116         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3117                 if (!rt_hash_table[h].chain)
3118                         continue;
3119                 rcu_read_lock_bh();
3120                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3121                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3122                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3123                                 continue;
3124                         if (rt_is_expired(rt))
3125                                 continue;
3126                         skb_dst_set_noref(skb, &rt->dst);
3127                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3128                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3129                                          1, NLM_F_MULTI) <= 0) {
3130                                 skb_dst_drop(skb);
3131                                 rcu_read_unlock_bh();
3132                                 goto done;
3133                         }
3134                         skb_dst_drop(skb);
3135                 }
3136                 rcu_read_unlock_bh();
3137         }
3138
3139 done:
3140         cb->args[0] = h;
3141         cb->args[1] = idx;
3142         return skb->len;
3143 }
3144
3145 void ip_rt_multicast_event(struct in_device *in_dev)
3146 {
3147         rt_cache_flush(dev_net(in_dev->dev), 0);
3148 }
3149
3150 #ifdef CONFIG_SYSCTL
3151 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3152                                         void __user *buffer,
3153                                         size_t *lenp, loff_t *ppos)
3154 {
3155         if (write) {
3156                 int flush_delay;
3157                 ctl_table ctl;
3158                 struct net *net;
3159
3160                 memcpy(&ctl, __ctl, sizeof(ctl));
3161                 ctl.data = &flush_delay;
3162                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3163
3164                 net = (struct net *)__ctl->extra1;
3165                 rt_cache_flush(net, flush_delay);
3166                 return 0;
3167         }
3168
3169         return -EINVAL;
3170 }
3171
3172 static ctl_table ipv4_route_table[] = {
3173         {
3174                 .procname       = "gc_thresh",
3175                 .data           = &ipv4_dst_ops.gc_thresh,
3176                 .maxlen         = sizeof(int),
3177                 .mode           = 0644,
3178                 .proc_handler   = proc_dointvec,
3179         },
3180         {
3181                 .procname       = "max_size",
3182                 .data           = &ip_rt_max_size,
3183                 .maxlen         = sizeof(int),
3184                 .mode           = 0644,
3185                 .proc_handler   = proc_dointvec,
3186         },
3187         {
3188                 /*  Deprecated. Use gc_min_interval_ms */
3189
3190                 .procname       = "gc_min_interval",
3191                 .data           = &ip_rt_gc_min_interval,
3192                 .maxlen         = sizeof(int),
3193                 .mode           = 0644,
3194                 .proc_handler   = proc_dointvec_jiffies,
3195         },
3196         {
3197                 .procname       = "gc_min_interval_ms",
3198                 .data           = &ip_rt_gc_min_interval,
3199                 .maxlen         = sizeof(int),
3200                 .mode           = 0644,
3201                 .proc_handler   = proc_dointvec_ms_jiffies,
3202         },
3203         {
3204                 .procname       = "gc_timeout",
3205                 .data           = &ip_rt_gc_timeout,
3206                 .maxlen         = sizeof(int),
3207                 .mode           = 0644,
3208                 .proc_handler   = proc_dointvec_jiffies,
3209         },
3210         {
3211                 .procname       = "gc_interval",
3212                 .data           = &ip_rt_gc_interval,
3213                 .maxlen         = sizeof(int),
3214                 .mode           = 0644,
3215                 .proc_handler   = proc_dointvec_jiffies,
3216         },
3217         {
3218                 .procname       = "redirect_load",
3219                 .data           = &ip_rt_redirect_load,
3220                 .maxlen         = sizeof(int),
3221                 .mode           = 0644,
3222                 .proc_handler   = proc_dointvec,
3223         },
3224         {
3225                 .procname       = "redirect_number",
3226                 .data           = &ip_rt_redirect_number,
3227                 .maxlen         = sizeof(int),
3228                 .mode           = 0644,
3229                 .proc_handler   = proc_dointvec,
3230         },
3231         {
3232                 .procname       = "redirect_silence",
3233                 .data           = &ip_rt_redirect_silence,
3234                 .maxlen         = sizeof(int),
3235                 .mode           = 0644,
3236                 .proc_handler   = proc_dointvec,
3237         },
3238         {
3239                 .procname       = "error_cost",
3240                 .data           = &ip_rt_error_cost,
3241                 .maxlen         = sizeof(int),
3242                 .mode           = 0644,
3243                 .proc_handler   = proc_dointvec,
3244         },
3245         {
3246                 .procname       = "error_burst",
3247                 .data           = &ip_rt_error_burst,
3248                 .maxlen         = sizeof(int),
3249                 .mode           = 0644,
3250                 .proc_handler   = proc_dointvec,
3251         },
3252         {
3253                 .procname       = "gc_elasticity",
3254                 .data           = &ip_rt_gc_elasticity,
3255                 .maxlen         = sizeof(int),
3256                 .mode           = 0644,
3257                 .proc_handler   = proc_dointvec,
3258         },
3259         {
3260                 .procname       = "mtu_expires",
3261                 .data           = &ip_rt_mtu_expires,
3262                 .maxlen         = sizeof(int),
3263                 .mode           = 0644,
3264                 .proc_handler   = proc_dointvec_jiffies,
3265         },
3266         {
3267                 .procname       = "min_pmtu",
3268                 .data           = &ip_rt_min_pmtu,
3269                 .maxlen         = sizeof(int),
3270                 .mode           = 0644,
3271                 .proc_handler   = proc_dointvec,
3272         },
3273         {
3274                 .procname       = "min_adv_mss",
3275                 .data           = &ip_rt_min_advmss,
3276                 .maxlen         = sizeof(int),
3277                 .mode           = 0644,
3278                 .proc_handler   = proc_dointvec,
3279         },
3280         { }
3281 };
3282
3283 static struct ctl_table ipv4_route_flush_table[] = {
3284         {
3285                 .procname       = "flush",
3286                 .maxlen         = sizeof(int),
3287                 .mode           = 0200,
3288                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3289         },
3290         { },
3291 };
3292
3293 static __net_init int sysctl_route_net_init(struct net *net)
3294 {
3295         struct ctl_table *tbl;
3296
3297         tbl = ipv4_route_flush_table;
3298         if (!net_eq(net, &init_net)) {
3299                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3300                 if (tbl == NULL)
3301                         goto err_dup;
3302         }
3303         tbl[0].extra1 = net;
3304
3305         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3306         if (net->ipv4.route_hdr == NULL)
3307                 goto err_reg;
3308         return 0;
3309
3310 err_reg:
3311         if (tbl != ipv4_route_flush_table)
3312                 kfree(tbl);
3313 err_dup:
3314         return -ENOMEM;
3315 }
3316
3317 static __net_exit void sysctl_route_net_exit(struct net *net)
3318 {
3319         struct ctl_table *tbl;
3320
3321         tbl = net->ipv4.route_hdr->ctl_table_arg;
3322         unregister_net_sysctl_table(net->ipv4.route_hdr);
3323         BUG_ON(tbl == ipv4_route_flush_table);
3324         kfree(tbl);
3325 }
3326
3327 static __net_initdata struct pernet_operations sysctl_route_ops = {
3328         .init = sysctl_route_net_init,
3329         .exit = sysctl_route_net_exit,
3330 };
3331 #endif
3332
3333 static __net_init int rt_genid_init(struct net *net)
3334 {
3335         get_random_bytes(&net->ipv4.rt_genid,
3336                          sizeof(net->ipv4.rt_genid));
3337         get_random_bytes(&net->ipv4.dev_addr_genid,
3338                          sizeof(net->ipv4.dev_addr_genid));
3339         return 0;
3340 }
3341
3342 static __net_initdata struct pernet_operations rt_genid_ops = {
3343         .init = rt_genid_init,
3344 };
3345
3346 static int __net_init ipv4_inetpeer_init(struct net *net)
3347 {
3348         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3349
3350         if (!bp)
3351                 return -ENOMEM;
3352         inet_peer_base_init(bp);
3353         net->ipv4.peers = bp;
3354         return 0;
3355 }
3356
3357 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3358 {
3359         struct inet_peer_base *bp = net->ipv4.peers;
3360
3361         net->ipv4.peers = NULL;
3362         inetpeer_invalidate_tree(bp);
3363         kfree(bp);
3364 }
3365
3366 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3367         .init   =       ipv4_inetpeer_init,
3368         .exit   =       ipv4_inetpeer_exit,
3369 };
3370
3371 #ifdef CONFIG_IP_ROUTE_CLASSID
3372 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3373 #endif /* CONFIG_IP_ROUTE_CLASSID */
3374
3375 static __initdata unsigned long rhash_entries;
3376 static int __init set_rhash_entries(char *str)
3377 {
3378         ssize_t ret;
3379
3380         if (!str)
3381                 return 0;
3382
3383         ret = kstrtoul(str, 0, &rhash_entries);
3384         if (ret)
3385                 return 0;
3386
3387         return 1;
3388 }
3389 __setup("rhash_entries=", set_rhash_entries);
3390
3391 int __init ip_rt_init(void)
3392 {
3393         int rc = 0;
3394
3395 #ifdef CONFIG_IP_ROUTE_CLASSID
3396         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3397         if (!ip_rt_acct)
3398                 panic("IP: failed to allocate ip_rt_acct\n");
3399 #endif
3400
3401         ipv4_dst_ops.kmem_cachep =
3402                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3403                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3404
3405         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3406
3407         if (dst_entries_init(&ipv4_dst_ops) < 0)
3408                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3409
3410         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3411                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3412
3413         rt_hash_table = (struct rt_hash_bucket *)
3414                 alloc_large_system_hash("IP route cache",
3415                                         sizeof(struct rt_hash_bucket),
3416                                         rhash_entries,
3417                                         (totalram_pages >= 128 * 1024) ?
3418                                         15 : 17,
3419                                         0,
3420                                         &rt_hash_log,
3421                                         &rt_hash_mask,
3422                                         0,
3423                                         rhash_entries ? 0 : 512 * 1024);
3424         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3425         rt_hash_lock_init();
3426
3427         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3428         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3429
3430         devinet_init();
3431         ip_fib_init();
3432
3433         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3434         expires_ljiffies = jiffies;
3435         schedule_delayed_work(&expires_work,
3436                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3437
3438         if (ip_rt_proc_init())
3439                 pr_err("Unable to create route proc files\n");
3440 #ifdef CONFIG_XFRM
3441         xfrm_init();
3442         xfrm4_init(ip_rt_max_size);
3443 #endif
3444         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3445
3446 #ifdef CONFIG_SYSCTL
3447         register_pernet_subsys(&sysctl_route_ops);
3448 #endif
3449         register_pernet_subsys(&rt_genid_ops);
3450         register_pernet_subsys(&ipv4_inetpeer_ops);
3451         return rc;
3452 }
3453
3454 #ifdef CONFIG_SYSCTL
3455 /*
3456  * We really need to sanitize the damn ipv4 init order, then all
3457  * this nonsense will go away.
3458  */
3459 void __init ip_static_sysctl_init(void)
3460 {
3461         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3462 }
3463 #endif