/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamping.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD;
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/prefetch.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#include <net/secure_seq.h>
#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
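/*
 * Illustrative sketch, not part of the build: RT_FL_TOS keeps only the TOS
 * bits that matter for routing, plus the private RTO_ONLINK flag. Assuming
 * IPTOS_RT_MASK == 0x1c and RTO_ONLINK == 0x01 (their values in this tree's
 * headers), a flowi4_tos of 0x13 is reduced to 0x11:
 *
 *	struct flowi4 fl4 = { .flowi4_tos = 0x13 };
 *	unsigned int tos = RT_FL_TOS(&fl4);	// 0x13 & 0x1d == 0x11
 */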
#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int rt_chain_length_max __read_mostly	= 20;

static struct delayed_work expires_work;
static unsigned long expires_ljiffies;
/*
 *	Interface to generic destination cache.
 */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

	peer = rt_get_peer_create(rt, rt->rt_dst);
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else if (rt->fi) {
			fib_info_put(rt->fi);
			rt->fi = NULL;
		}
	}
	return p;
}
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr);
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};
#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    RCU protection.
 */
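/*
 * A minimal sketch of the reader side implied by the scheme above
 * (illustrative only; match() is a placeholder for the key comparison):
 * readers walk a bucket chain under rcu_read_lock_bh() and take a
 * reference with an atomic increment before leaving the critical section.
 *
 *	rcu_read_lock_bh();
 *	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference_bh(rth->dst.rt_next)) {
 *		if (match(rth)) {
 *			dst_hold(&rth->dst);	// atomic refcount increment
 *			break;
 *		}
 *	}
 *	rcu_read_unlock_bh();
 */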
struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of
 * spinlocks. The size of this table is a power of two and depends on the
 * number of CPUs. (on lockdep we have a quite big spinlock_t, so keep the
 * size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
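/*
 * Sketch of the striping arithmetic (illustrative): many hash buckets share
 * one of RT_HASH_LOCK_SZ spinlocks, picked by masking the bucket index, so
 * the lock table stays small while contention is spread out. With
 * RT_HASH_LOCK_SZ == 256, buckets 5, 261, 517, ... all map to lock 5:
 *
 *	spinlock_t *lock = rt_hash_lock_addr(261);  // == &rt_hash_locks[5]
 */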
static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
				GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
static unsigned int		rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log  __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid)
		& rt_hash_mask;
}
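/*
 * Usage sketch (illustrative): the bucket index mixes both addresses, the
 * interface index and the per-namespace generation id, so a genid bump
 * effectively invalidates every previously hashed entry.
 *
 *	unsigned int hash = rt_hash(iph->daddr, iph->saddr, dev->ifindex,
 *				    rt_genid(dev_net(dev)));
 *	struct rtable *rth = rcu_dereference(rt_hash_table[hash].chain);
 */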
static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}
#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};
static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}
static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}
static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}
static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;

	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}
static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		int len;

		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			   "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			   r->dst.dev ? r->dst.dev->name : "*",
			   (__force u32)r->rt_dst,
			   (__force u32)r->rt_gateway,
			   r->rt_flags, atomic_read(&r->dst.__refcnt),
			   r->dst.__use, 0, (__force u32)r->rt_src,
			   dst_metric_advmss(&r->dst) + 40,
			   dst_metric(&r->dst, RTAX_WINDOW),
			   (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
				 dst_metric(&r->dst, RTAX_RTTVAR)),
			   r->rt_key_tos, -1, 0, 0, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}
static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};
static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}
static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}
static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit, st->in_slow_tot, st->in_slow_mc,
		   st->in_no_route, st->in_brd, st->in_martian_dst,
		   st->in_martian_src, st->out_hit, st->out_slow_tot,
		   st->out_slow_mc, st->gc_total, st->gc_ignored,
		   st->gc_goal_miss, st->gc_dst_overflow,
		   st->in_hlist_search, st->out_hlist_search);
	return 0;
}
static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};
static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}
static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}
static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}
static const struct file_operations rt_acct_proc_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_acct_proc_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};
#endif
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}
static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}
static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};
static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */
static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}
static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}
static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		(rt_has_peer(rth) && rt_peer_ptr(rth)->pmtu_expires);
}
static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->dst.__refcnt))
		goto out;

	age = jiffies - rth->dst.lastuse;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}
/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (rt_is_output_route(rt) ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}
static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}
static inline bool compare_hash_inputs(const struct rtable *rt1,
				       const struct rtable *rt2)
{
	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
}

static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
{
	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_mark ^ rt2->rt_mark) |
		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
}
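/*
 * Design note with a small worked example: both helpers above OR together
 * the XOR of each field pair and compare the result with zero, so the whole
 * multi-field comparison compiles to straight-line code with a single
 * branch instead of one branch per field. For two fields a and b:
 *
 *	((a1 ^ a2) | (b1 ^ b2)) == 0   <=>   a1 == a2 && b1 == b2
 */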
static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}
static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}
/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to be rescheduled if necessary.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		if (process_context && need_resched())
			cond_resched();
		rth = rcu_access_pointer(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}
/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This is to have an estimation of rt_chain_length_max:
 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for fractional part, and 29 (or 61) for magnitude.
 */
#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)
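/*
 * Worked example of the fixed-point format (illustrative), assuming
 * FRACT_BITS == 3: a chain holding two unique entries is counted as
 * length = 2 * ONE = 16. Four such chains give sum = 64 and sum2 = 1024,
 * hence, in rt_check_expire() below:
 *
 *	avg = 64 / 4 = 16
 *	sd  = int_sqrt(1024 / 4 - 16 * 16) = 0
 *	rt_chain_length_max = max(elasticity, (16 + 4 * 0) >> 3) = 2 entries
 */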
/*
 * Given a hash chain and an item in this hash chain,
 * find if a previous entry has the same hash_inputs
 * (but differs on tos, mark or oif).
 * Returns 0 if an alias is found.
 * Returns ONE if rth has no alias before itself.
 */
static int has_noalias(const struct rtable *head, const struct rtable *rth)
{
	const struct rtable *aux = head;

	while (aux != rth) {
		if (compare_hash_inputs(aux, rth))
			return 0;
		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
	}
	return ONE;
}
static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;
	unsigned long delta;
	u64 mult;

	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		samples++;

		if (rcu_dereference_raw(*rthp) == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
			prefetch(rth->dst.rt_next);
			if (rt_is_expired(rth) ||
			    rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				continue;
			}

			/* We only count entries on a chain with equal
			 * hash inputs once so that entries for
			 * different QOS levels, and other non-hash
			 * input attributes don't unfairly skew
			 * the length computation
			 */
			tmo >>= 1;
			rthp = &rth->dst.rt_next;
			length += has_noalias(rt_hash_table[i].chain, rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					    ip_rt_gc_elasticity,
					    (avg + 4*sd) >> FRACT_BITS);
	}
	rover = i;
}
/*
 * rt_worker_func() is run in process context.
 * we call rt_check_expire() to scan part of the hash table
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}
/*
 * Perturbation of rt_genid by a small quantity [1..256].
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without repeating a recent rt_genid.
 * Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
	inetpeer_invalidate_family(AF_INET);
}
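/*
 * Worked example (illustrative): adding shuffle + 1, with shuffle drawn
 * from [0, 255], steps rt_genid by a value in [1, 256]. Since rt_hash()
 * folds the genid into every bucket index, old entries simply stop being
 * found; e.g. a genid of 7 bumped by 42 makes rt_is_expired() true for
 * every entry that still carries rt_genid == 7.
 */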
/*
 * delay < 0  : invalidate cache (fast: entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}
/* Flush previous cache invalidated entries from the cache */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}
static void rt_emergency_hash_rebuild(struct net *net)
{
	net_warn_ratelimited("Route hash chain too long!\n");
	rt_cache_invalidate(net);
}
/*
 * Short description of GC goals.
 *
 * We want to build an algorithm which will keep the routing cache
 * at some equilibrium point, where the number of aged-off entries
 * is kept approximately equal to the number of newly generated ones.
 *
 * The current expiration strength is the variable "expire".
 * We try to adjust it dynamically, so that when the network
 * is idle, expire is large enough to keep enough warm entries,
 * and when load increases, it shrinks to limit the cache size.
 */
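/*
 * A small numeric sketch of the equilibrium logic below (illustrative):
 * with gc_thresh == 256 and 1024 cached entries, equilibrium starts at
 * 256, is raised by min(goal/2, hash size) to spread the work out, and
 * goal becomes entries - equilibrium; the scan then expires entries until
 * goal reaches zero, or gives up and halves "expire" for the next pass.
 */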
static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in a dangerous area. Try to reduce the cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop the process if:

		   - expire is reduced to zero; otherwise, expire is halved.
		   - the table is not full.
		   - we are called from interrupt.
		   - the jiffies check is just a fallback/debug loop breaker.
		     We will not spin here for a long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	net_warn_ratelimited("dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
out:	return 0;
}
/*
 * Returns number of entries in a hash chain that have different hash_inputs
 */
static int slow_chain_length(const struct rtable *head)
{
	int length = 0;
	const struct rtable *rth = head;

	while (rth) {
		length += has_noalias(head, rth);
		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
	}
	return length >> FRACT_BITS;
}
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr)
{
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;
	struct neighbour *n;

	rt = (const struct rtable *) dst;
	if (rt->rt_gateway)
		pkey = (const __be32 *) &rt->rt_gateway;
	else if (skb)
		pkey = &ip_hdr(skb)->daddr;

	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(&arp_tbl, pkey, dev);
}
static int rt_bind_neighbour(struct rtable *rt)
{
	struct neighbour *n = ipv4_neigh_lookup(&rt->dst, NULL, &rt->rt_gateway);

	if (IS_ERR(n))
		return PTR_ERR(n);
	dst_set_neighbour(&rt->dst, n);

	return 0;
}
static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
				     struct sk_buff *skb, int ifindex)
{
	struct rtable	*rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long	now;
	u32		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev)) || (rt->dst.flags & DST_NOCACHE)) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route. The
		 * caller holds the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching. Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route.
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it.
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting for a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
			int err = rt_bind_neighbour(rt);
			if (err) {
				net_warn_ratelimited("Neighbour table failure & not caching routes\n");
				ip_rt_put(rt);
				return ERR_PTR(err);
			}
		}

		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (rt_is_expired(rth)) {
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			if (skb)
				skb_dst_set(skb, &rth->dst);
			return rth;
		}

		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be the average chain
		 * length; when exceeded, gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind route to arp only if it is an output
	   route or on the unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = rt_bind_neighbour(rt);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return ERR_PTR(err);
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink the route cache;
			   most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			net_warn_ratelimited("Neighbour table overflow\n");
			rt_drop(rt);
			return ERR_PTR(-ENOBUFS);
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUs.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (skb)
		skb_dst_set(skb, &rt->dst);
	return rt;
}
static atomic_t __rt_peer_genid = ATOMIC_INIT(0);

static u32 rt_peer_genid(void)
{
	return atomic_read(&__rt_peer_genid);
}
void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
{
	struct inet_peer_base *base;
	struct inet_peer *peer;

	base = inetpeer_base_ptr(rt->_peer);
	if (!base)
		return;

	peer = inet_getpeer_v4(base, daddr, create);
	if (peer) {
		if (!rt_set_peer(rt, peer))
			inet_putpeer(peer);
		else
			rt->rt_peer_genid = rt_peer_genid();
	}
}
/*
 * Peer allocation may fail only in serious out-of-memory conditions. However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that is unique within a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt && !(rt->dst.flags & DST_NOPEER)) {
		struct inet_peer *peer = rt_get_peer_create(rt, rt->rt_dst);

		/* If peer is attached to destination, it is never detached,
		   so we need not grab a lock to dereference it.
		 */
		if (peer) {
			iph->id = htons(inet_getid(peer, more));
			return;
		}
	} else if (!rt)
		pr_debug("rt_bind_peer(0) @%p\n", __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);
static void rt_del(unsigned int hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}
static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
{
	struct rtable *rt = (struct rtable *) dst;
	__be32 orig_gw = rt->rt_gateway;
	struct neighbour *n, *old_n;

	dst_confirm(&rt->dst);

	rt->rt_gateway = peer->redirect_learned.a4;

	n = ipv4_neigh_lookup(&rt->dst, NULL, &rt->rt_gateway);
	if (IS_ERR(n)) {
		rt->rt_gateway = orig_gw;
		return;
	}
	old_n = xchg(&rt->dst._neighbour, n);
	if (old_n)
		neigh_release(old_n);
	if (!(n->nud_state & NUD_VALID)) {
		neigh_event_send(n, NULL);
	} else {
		rt->rt_flags |= RTCF_REDIRECTED;
		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
	}
}
/* called in rcu_read_lock() section */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int s, i;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	__be32 skeys[2] = { saddr, 0 };
	int    ikeys[2] = { dev->ifindex, 0 };
	struct inet_peer *peer;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (s = 0; s < 2; s++) {
		for (i = 0; i < 2; i++) {
			unsigned int hash;
			struct rtable __rcu **rthp;
			struct rtable *rt;

			hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));

			rthp = &rt_hash_table[hash].chain;

			while ((rt = rcu_dereference(*rthp)) != NULL) {
				rthp = &rt->dst.rt_next;

				if (rt->rt_key_dst != daddr ||
				    rt->rt_key_src != skeys[s] ||
				    rt->rt_oif != ikeys[i] ||
				    rt_is_input_route(rt) ||
				    rt_is_expired(rt) ||
				    !net_eq(dev_net(rt->dst.dev), net) ||
				    rt->dst.error ||
				    rt->dst.dev != dev ||
				    rt->rt_gateway != old_gw)
					continue;

				peer = rt_get_peer_create(rt, rt->rt_dst);
				if (peer) {
					if (peer->redirect_learned.a4 != new_gw) {
						peer->redirect_learned.a4 = new_gw;
						atomic_inc(&__rt_peer_genid);
					}
					check_peer_redir(&rt->dst, peer);
				}
			}
		}
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     "  Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
#endif
	;
}
static bool peer_pmtu_expired(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       time_after_eq(jiffies, orig) &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}

static bool peer_pmtu_cleaned(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if (rt->rt_flags & RTCF_REDIRECTED) {
			unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
						rt->rt_oif,
						rt_genid(dev_net(dst->dev)));
			rt_del(hash, rt);
			ret = NULL;
		} else if (rt_has_peer(rt)) {
			struct inet_peer *peer = rt_peer_ptr(rt);
			if (peer_pmtu_expired(peer))
				dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
		}
	}
	return ret;
}
/*
 * Algorithm:
 * 1. The first ip_rt_redirect_number redirects are sent
 *    with exponential backoff, then we stop sending them at all,
 *    assuming that the host ignores our redirects.
 * 2. If we did not see packets requiring redirects
 *    during ip_rt_redirect_silence, we assume that the host
 *    has forgotten the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	peer = rt_get_peer_create(rt, rt->rt_dst);
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything;
	 * set rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect packet.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, rt->rt_iif,
					     &rt->rt_dst, &rt->rt_gateway);
#endif
	}
}
static int ip_error(struct sk_buff *skb)
{
	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	struct net *net;
	bool send;
	int code;

	net = dev_net(rt->dst.dev);
	if (!IN_DEV_FORWARD(in_dev)) {
		switch (rt->dst.error) {
		case EHOSTUNREACH:
			IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
			break;

		case ENETUNREACH:
			IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
			break;
		}
		goto out;
	}

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	peer = rt_get_peer_create(rt, rt->rt_dst);

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}
static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
{
	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);

	if (!expires)
		return;
	if (time_before(jiffies, expires)) {
		u32 orig_dst_mtu = dst_mtu(dst);
		if (peer->pmtu_learned < orig_dst_mtu) {
			if (!peer->pmtu_orig)
				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
		}
	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
}
static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;

	dst_confirm(dst);

	peer = rt_get_peer_create(rt, rt->rt_dst);
	if (peer) {
		unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!pmtu_expires || mtu < peer->pmtu_learned) {

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;

			atomic_inc(&__rt_peer_genid);
			rt->rt_peer_genid = rt_peer_genid();
		}
		check_peer_pmtu(dst, peer);
	}
}
void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
		      int oif, u32 mark, u8 protocol, int flow_flags)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
			   protocol, flow_flags | FLOWI_FLAG_PRECOW_METRICS,
			   iph->daddr, iph->saddr, 0, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		ip_rt_update_pmtu(&rt->dst, mtu);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
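/*
 * Usage sketch (illustrative; the caller and new_mtu value are hypothetical):
 * a caller that has just parsed an ICMP FRAG_NEEDED message for a TCP flow
 * could propagate the new MTU with something like
 *
 *	ipv4_update_pmtu(skb, dev_net(skb->dev), new_mtu,
 *			 0, 0, IPPROTO_TCP, 0);
 *
 * which rebuilds the flow key from the inner IP header at skb->data and
 * updates the matching cached route's MTU metric.
 */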
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct inet_sock *inet = inet_sk(sk);

	return ipv4_update_pmtu(skb, sock_net(sk), mtu,
				sk->sk_bound_dev_if, sk->sk_mark,
				inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
				inet_sk_flowi_flags(sk));
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
static void ipv4_validate_peer(struct rtable *rt)
{
	if (rt->rt_peer_genid != rt_peer_genid()) {
		struct inet_peer *peer = rt_get_peer(rt, rt->rt_dst);

		if (peer) {
			check_peer_pmtu(&rt->dst, peer);

			if (peer->redirect_learned.a4 &&
			    peer->redirect_learned.a4 != rt->rt_gateway)
				check_peer_redir(&rt->dst, peer);
		}

		rt->rt_peer_genid = rt_peer_genid();
	}
}
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt_is_expired(rt))
		return NULL;
	ipv4_validate_peer(rt);
	return dst;
}
static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt->fi) {
		fib_info_put(rt->fi);
		rt->fi = NULL;
	}
	if (rt_has_peer(rt)) {
		struct inet_peer *peer = rt_peer_ptr(rt);
		inet_putpeer(peer);
	}
}
static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt && rt_has_peer(rt)) {
		struct inet_peer *peer = rt_peer_ptr(rt);
		if (peer_pmtu_cleaned(peer))
			dst_metric_set(&rt->dst, RTAX_MTU, peer->pmtu_orig);
	}
}
static int ip_rt_bug(struct sk_buff *skb)
{
	pr_debug("%s: %pI4 -> %pI4, %s\n",
		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		 skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}
/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by IP RR, TS and SRR options,
 * so it is out of the fast path.
 *
 * BTW remember: "addr" is allowed to be unaligned
 * in IP options!
 */
void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}
#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif
static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

	if (advmss == 0) {
		advmss = max_t(unsigned int, dst->dev->mtu - 40,
			       ip_rt_min_advmss);
		if (advmss > 65535 - 40)
			advmss = 65535 - 40;
	}
	return advmss;
}
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu && rt_is_output_route(rt))
		return mtu;

	mtu = dst->dev->mtu;

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {

		if (rt->rt_gateway != rt->rt_dst && mtu > 576)
			mtu = 576;
	}

	if (mtu > IP_MAX_MTU)
		mtu = IP_MAX_MTU;

	return mtu;
}
static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
			    struct fib_info *fi)
{
	struct inet_peer_base *base;
	struct inet_peer *peer;
	int create = 0;

	/* If a peer entry exists for this destination, we must hook
	 * it up in order to get at cached metrics.
	 */
	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
		create = 1;

	base = inetpeer_base_ptr(rt->_peer);
	BUG_ON(!base);

	peer = inet_getpeer_v4(base, rt->rt_dst, create);
	if (peer) {
		__rt_set_peer(rt, peer);
		rt->rt_peer_genid = rt_peer_genid();
		if (inet_metrics_new(peer))
			memcpy(peer->metrics, fi->fib_metrics,
			       sizeof(u32) * RTAX_MAX);
		dst_init_metrics(&rt->dst, peer->metrics, false);

		check_peer_pmtu(&rt->dst, peer);

		if (peer->redirect_learned.a4 &&
		    peer->redirect_learned.a4 != rt->rt_gateway) {
			rt->rt_gateway = peer->redirect_learned.a4;
			rt->rt_flags |= RTCF_REDIRECTED;
		}
	} else {
		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
			rt->fi = fi;
			atomic_inc(&fi->fib_clntref);
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
	}
}
static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
			   const struct fib_result *res,
			   struct fib_info *fi, u16 type, u32 itag)
{
	struct dst_entry *dst = &rt->dst;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		rt_init_metrics(rt, fl4, fi);
#ifdef CONFIG_IP_ROUTE_CLASSID
		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	}

	if (dst_mtu(dst) > IP_MAX_MTU)
		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
}
static struct rtable *rt_dst_alloc(struct net_device *dev,
				   bool nopolicy, bool noxfrm)
{
	return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
			 DST_HOST |
			 (nopolicy ? DST_NOPOLICY : 0) |
			 (noxfrm ? DST_NOXFRM : 0));
}
/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	unsigned int hash;
	struct rtable *rth;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(saddr))
			goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto e_err;
	}
	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;

	rth->rt_key_dst = daddr;
	rth->rt_key_src = saddr;
	rth->rt_genid = rt_genid(dev_net(dev));
	rth->rt_flags = RTCF_MULTICAST;
	rth->rt_type = RTN_MULTICAST;
	rth->rt_key_tos = tos;
	rth->rt_dst = daddr;
	rth->rt_src = saddr;
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif = dev->ifindex;
	rth->rt_oif = 0;
	rth->rt_mark = skb->mark;
	rth->rt_gateway = daddr;
	rth->rt_peer_genid = 0;
	rt_init_peer(rth, dev_net(dev)->ipv4.peers);
	rth->fi = NULL;
	if (our) {
		rth->dst.input = ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
	return IS_ERR(rth) ? PTR_ERR(rth) : 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 * Per the RFC 1812 recommendation, if the source is martian,
		 * the only hint is the MAC header.
		 */
		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			print_hex_dump(KERN_WARNING, "ll header: ",
				       DUMP_PREFIX_OFFSET, 16, 1,
				       skb_mac_header(skb),
				       dev->hard_header_len, true);
		}
	}
#endif
}
/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy ARP feature has been extended to allow ARP
		 * replies back on the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM));
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_key_dst = daddr;
	rth->rt_key_src = saddr;
	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
	rth->rt_flags = flags;
	rth->rt_type = res->type;
	rth->rt_key_tos = tos;
	rth->rt_dst = daddr;
	rth->rt_src = saddr;
	rth->rt_route_iif = in_dev->dev->ifindex;
	rth->rt_iif = in_dev->dev->ifindex;
	rth->rt_oif = 0;
	rth->rt_mark = skb->mark;
	rth->rt_gateway = daddr;
	rth->rt_peer_genid = 0;
	rt_init_peer(rth, &res->table->tb_peers);
	rth->fi = NULL;

	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);

	*result = rth;
	err = 0;
 cleanup:
	return err;
}
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi4 *fl4,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable *rth = NULL;
	int err;
	unsigned int hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1)
		fib_select_multipath(res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
		       rt_genid(dev_net(rth->dst.dev)));
	rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
	if (IS_ERR(rth))
		return PTR_ERR(rth);
	return 0;
}
/*
 * NOTE. We drop all the packets that have local source
 * addresses, because every properly looped back packet
 * must have correct destination already attached by output routine.
 *
 * Such an approach solves two big problems:
 * 1. Non-simplex devices are handled properly.
 * 2. IP spoofing attempts are filtered with 100% guarantee.
 *
 * called with rcu_read_lock()
 */
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi4	fl4;
	unsigned int	flags = 0;
	u32		itag = 0;
	struct rtable	*rth;
	unsigned int	hash;
	int		err = -EINVAL;
	struct net    *net = dev_net(dev);

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	 * by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
		goto martian_source;

	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I am not even sure whether to fix this or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr))
		goto martian_destination;

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
		if (ipv4_is_loopback(daddr))
			goto martian_destination;

		if (ipv4_is_loopback(saddr))
			goto martian_source;
	}

	/*
	 *	Now we are ready to route packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res);
	if (err != 0)
		goto no_route;

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  net->loopback_dev->ifindex,
					  dev, in_dev, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (!ipv4_is_zeronet(saddr)) {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	rth = rt_dst_alloc(net->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

	rth->dst.input = ip_local_deliver;
	rth->dst.output = ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif

	rth->rt_key_dst = daddr;
	rth->rt_key_src = saddr;
	rth->rt_genid = rt_genid(net);
	rth->rt_flags = flags|RTCF_LOCAL;
	rth->rt_type = res.type;
	rth->rt_key_tos = tos;
	rth->rt_dst = daddr;
	rth->rt_src = saddr;
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif = dev->ifindex;
	rth->rt_oif = 0;
	rth->rt_mark = skb->mark;
	rth->rt_gateway = daddr;
	rth->rt_peer_genid = 0;
	rt_init_peer(rth, net->ipv4.peers);
	rth->fi = NULL;
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input = ip_error;
		rth->dst.error = -err;
		rth->rt_flags &= ~RTCF_LOCAL;
	}
	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
	err = 0;
	if (IS_ERR(rth))
		err = PTR_ERR(rth);
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
				     &daddr, &saddr, dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto out;

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev, bool noref)
{
	struct rtable	*rth;
	unsigned int	hash;
	int iif = dev->ifindex;
	struct net *net;
	int res;

	net = dev_net(dev);

	rcu_read_lock();

	if (!rt_caching(net))
		goto skip_cache;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif, rt_genid(net));

	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->dst.rt_next)) {
		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
		     (rth->rt_route_iif ^ iif) |
		     (rth->rt_key_tos ^ tos)) == 0 &&
		    rth->rt_mark == skb->mark &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			ipv4_validate_peer(rth);
			if (noref) {
				dst_use_noref(&rth->dst, jiffies);
				skb_dst_set_noref(skb, &rth->dst);
			} else {
				dst_use(&rth->dst, jiffies);
				skb_dst_set(skb, &rth->dst);
			}
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}

skip_cache:
	/* Multicast recognition logic is moved from route cache to here.
	 * The problem was that too many Ethernet cards have broken/missing
	 * hardware multicast filters :-( As a result, a host on a multicast
	 * network may acquire a lot of useless route cache entries, sort of
	 * SDR messages from all the world. Now we try to get rid of them.
	 * Really, provided the software IP multicast filter is organized
	 * reasonably (at least, hashed), it does not result in a slowdown
	 * compared with route cache reject entries.
	 * Note that multicast routers are not affected, because a
	 * route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
						  ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return res;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_common);
2460 /* called with rcu_read_lock() */
2461 static struct rtable *__mkroute_output(const struct fib_result *res,
2462 const struct flowi4 *fl4,
2463 __be32 orig_daddr, __be32 orig_saddr,
2464 int orig_oif, __u8 orig_rtos,
2465 struct net_device *dev_out,
2468 struct fib_info *fi = res->fi;
2469 struct in_device *in_dev;
2470 u16 type = res->type;
2473 in_dev = __in_dev_get_rcu(dev_out);
2475 return ERR_PTR(-EINVAL);
2477 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2478 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2479 return ERR_PTR(-EINVAL);
2481 if (ipv4_is_lbcast(fl4->daddr))
2482 type = RTN_BROADCAST;
2483 else if (ipv4_is_multicast(fl4->daddr))
2484 type = RTN_MULTICAST;
2485 else if (ipv4_is_zeronet(fl4->daddr))
2486 return ERR_PTR(-EINVAL);
2488 if (dev_out->flags & IFF_LOOPBACK)
2489 flags |= RTCF_LOCAL;
2491 if (type == RTN_BROADCAST) {
2492 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2494 } else if (type == RTN_MULTICAST) {
2495 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2496 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2498 flags &= ~RTCF_LOCAL;
2499 /* If a multicast route does not exist, use the
2500 * default one, but do not gateway in this case.
2503 if (fi && res->prefixlen < 4)
2507 rth = rt_dst_alloc(dev_out,
2508 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2509 IN_DEV_CONF_GET(in_dev, NOXFRM));
2511 return ERR_PTR(-ENOBUFS);
2513 rth->dst.output = ip_output;
2515 rth->rt_key_dst = orig_daddr;
2516 rth->rt_key_src = orig_saddr;
2517 rth->rt_genid = rt_genid(dev_net(dev_out));
2518 rth->rt_flags = flags;
2519 rth->rt_type = type;
2520 rth->rt_key_tos = orig_rtos;
2521 rth->rt_dst = fl4->daddr;
2522 rth->rt_src = fl4->saddr;
2523 rth->rt_route_iif = 0;
2524 rth->rt_iif = orig_oif ? : dev_out->ifindex;
2525 rth->rt_oif = orig_oif;
2526 rth->rt_mark = fl4->flowi4_mark;
2527 rth->rt_gateway = fl4->daddr;
2528 rth->rt_peer_genid = 0;
2529 rt_init_peer(rth, (res->table ?
2530 &res->table->tb_peers :
2531 dev_net(dev_out)->ipv4.peers));
2534 RT_CACHE_STAT_INC(out_slow_tot);
2536 if (flags & RTCF_LOCAL)
2537 rth->dst.input = ip_local_deliver;
2538 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2539 if (flags & RTCF_LOCAL &&
2540 !(dev_out->flags & IFF_LOOPBACK)) {
2541 rth->dst.output = ip_mc_output;
2542 RT_CACHE_STAT_INC(out_slow_mc);
2544 #ifdef CONFIG_IP_MROUTE
2545 if (type == RTN_MULTICAST) {
2546 if (IN_DEV_MFORWARD(in_dev) &&
2547 !ipv4_is_local_multicast(fl4->daddr)) {
2548 rth->dst.input = ip_mr_input;
2549 rth->dst.output = ip_mc_output;
2555 rt_set_nexthop(rth, fl4, res, fi, type, 0);
2557 if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE)
2558 rth->dst.flags |= DST_NOCACHE;
2564 * Major route resolver routine.
2565 * Called with rcu_read_lock().
2568 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2570 struct net_device *dev_out = NULL;
2571 __u8 tos = RT_FL_TOS(fl4);
2572 unsigned int flags = 0;
2573 struct fib_result res;
2581 #ifdef CONFIG_IP_MULTIPLE_TABLES
2585 orig_daddr = fl4->daddr;
2586 orig_saddr = fl4->saddr;
2587 orig_oif = fl4->flowi4_oif;
2589 fl4->flowi4_iif = net->loopback_dev->ifindex;
2590 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2591 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2592 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2596 rth = ERR_PTR(-EINVAL);
2597 if (ipv4_is_multicast(fl4->saddr) ||
2598 ipv4_is_lbcast(fl4->saddr) ||
2599 ipv4_is_zeronet(fl4->saddr))
2602 /* I removed the check for oif == dev_out->oif here.
2603 It was wrong for two reasons:
2604 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2605 is assigned to multiple interfaces.
2606 2. Moreover, we are allowed to send packets with a saddr
2607 of another iface. --ANK
2610 if (fl4->flowi4_oif == 0 &&
2611 (ipv4_is_multicast(fl4->daddr) ||
2612 ipv4_is_lbcast(fl4->daddr))) {
2613 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2614 dev_out = __ip_dev_find(net, fl4->saddr, false);
2615 if (dev_out == NULL)
2618 /* Special hack: the user can direct multicasts
2619 and limited broadcast via the necessary interface
2620 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2621 This hack is not just for fun, it allows
2622 vic, vat and friends to work.
2623 They bind a socket to loopback, set the ttl to zero
2624 and expect that it will work.
2625 From the viewpoint of the routing cache they are broken,
2626 because we are not allowed to build a multicast path
2627 with a loopback source addr (look, the routing cache
2628 cannot know that the ttl is zero, so that the packet
2629 will not leave this host and the route is valid).
2630 Luckily, this hack is a good workaround.
2633 fl4->flowi4_oif = dev_out->ifindex;
2637 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2638 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2639 if (!__ip_dev_find(net, fl4->saddr, false))
2645 if (fl4->flowi4_oif) {
2646 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2647 rth = ERR_PTR(-ENODEV);
2648 if (dev_out == NULL)
2651 /* RACE: Check return value of inet_select_addr instead. */
2652 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2653 rth = ERR_PTR(-ENETUNREACH);
2656 if (ipv4_is_local_multicast(fl4->daddr) ||
2657 ipv4_is_lbcast(fl4->daddr)) {
2659 fl4->saddr = inet_select_addr(dev_out, 0,
2664 if (ipv4_is_multicast(fl4->daddr))
2665 fl4->saddr = inet_select_addr(dev_out, 0,
2667 else if (!fl4->daddr)
2668 fl4->saddr = inet_select_addr(dev_out, 0,
2674 fl4->daddr = fl4->saddr;
2676 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2677 dev_out = net->loopback_dev;
2678 fl4->flowi4_oif = net->loopback_dev->ifindex;
2679 res.type = RTN_LOCAL;
2680 flags |= RTCF_LOCAL;
2684 if (fib_lookup(net, fl4, &res)) {
2687 if (fl4->flowi4_oif) {
2688 /* Apparently, the routing tables are wrong. Assume
2689 that the destination is on-link.
2692 Because we are allowed to send to an iface
2693 even if it has NO routes and NO assigned
2694 addresses. When oif is specified, the routing
2695 tables are looked up with only one purpose:
2696 to catch if the destination is gatewayed, rather than
2697 direct. Moreover, if MSG_DONTROUTE is set,
2698 we send the packet, ignoring both the routing tables
2699 and the ifaddr state. --ANK
2702 We could do this even if oif is unknown,
2703 as IPv6 likely does, but we do not.
2706 if (fl4->saddr == 0)
2707 fl4->saddr = inet_select_addr(dev_out, 0,
2709 res.type = RTN_UNICAST;
2712 rth = ERR_PTR(-ENETUNREACH);
2716 if (res.type == RTN_LOCAL) {
2718 if (res.fi->fib_prefsrc)
2719 fl4->saddr = res.fi->fib_prefsrc;
2721 fl4->saddr = fl4->daddr;
2723 dev_out = net->loopback_dev;
2724 fl4->flowi4_oif = dev_out->ifindex;
2726 flags |= RTCF_LOCAL;
2730 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2731 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2732 fib_select_multipath(&res);
2735 if (!res.prefixlen &&
2736 res.table->tb_num_default > 1 &&
2737 res.type == RTN_UNICAST && !fl4->flowi4_oif)
2738 fib_select_default(&res);
2741 fl4->saddr = FIB_RES_PREFSRC(net, res);
2743 dev_out = FIB_RES_DEV(res);
2744 fl4->flowi4_oif = dev_out->ifindex;
2748 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2749 tos, dev_out, flags);
2753 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2754 rt_genid(dev_net(dev_out)));
2755 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2763 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2768 if (!rt_caching(net))
2771 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2774 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2775 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2776 if (rth->rt_key_dst == flp4->daddr &&
2777 rth->rt_key_src == flp4->saddr &&
2778 rt_is_output_route(rth) &&
2779 rth->rt_oif == flp4->flowi4_oif &&
2780 rth->rt_mark == flp4->flowi4_mark &&
2781 !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2782 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2783 net_eq(dev_net(rth->dst.dev), net) &&
2784 !rt_is_expired(rth)) {
2785 ipv4_validate_peer(rth);
2786 dst_use(&rth->dst, jiffies);
2787 RT_CACHE_STAT_INC(out_hit);
2788 rcu_read_unlock_bh();
2790 flp4->saddr = rth->rt_src;
2792 flp4->daddr = rth->rt_dst;
2795 RT_CACHE_STAT_INC(out_hlist_search);
2797 rcu_read_unlock_bh();
2800 return ip_route_output_slow(net, flp4);
2802 EXPORT_SYMBOL_GPL(__ip_route_output_key);
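/*
 * Observability note: the out_hit/out_hlist_search counters above (and
 * their input-path siblings) are per-cpu statistics exported through
 * /proc/net/stat/rt_cache, so cache effectiveness can be checked from
 * userspace with e.g. "cat /proc/net/stat/rt_cache".
 */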
2804 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2809 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2811 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
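/* GNU "?:": fall back to the device MTU when no RTAX_MTU metric
 * is set (dst_metric_raw() then returns 0). */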
2813 return mtu ? : dst->dev->mtu;
2816 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2820 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2826 static struct dst_ops ipv4_dst_blackhole_ops = {
2828 .protocol = cpu_to_be16(ETH_P_IP),
2829 .destroy = ipv4_dst_destroy,
2830 .check = ipv4_blackhole_dst_check,
2831 .mtu = ipv4_blackhole_mtu,
2832 .default_advmss = ipv4_default_advmss,
2833 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2834 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2835 .neigh_lookup = ipv4_neigh_lookup,
2838 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2840 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2841 struct rtable *ort = (struct rtable *) dst_orig;
2844 struct dst_entry *new = &rt->dst;
2847 new->input = dst_discard;
2848 new->output = dst_discard;
2849 dst_copy_metrics(new, &ort->dst);
2851 new->dev = ort->dst.dev;
2855 rt->rt_key_dst = ort->rt_key_dst;
2856 rt->rt_key_src = ort->rt_key_src;
2857 rt->rt_key_tos = ort->rt_key_tos;
2858 rt->rt_route_iif = ort->rt_route_iif;
2859 rt->rt_iif = ort->rt_iif;
2860 rt->rt_oif = ort->rt_oif;
2861 rt->rt_mark = ort->rt_mark;
2863 rt->rt_genid = rt_genid(net);
2864 rt->rt_flags = ort->rt_flags;
2865 rt->rt_type = ort->rt_type;
2866 rt->rt_dst = ort->rt_dst;
2867 rt->rt_src = ort->rt_src;
2868 rt->rt_gateway = ort->rt_gateway;
2869 rt_transfer_peer(rt, ort);
2872 atomic_inc(&rt->fi->fib_clntref);
2877 dst_release(dst_orig);
2879 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
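/*
 * Illustrative sketch, assuming the xfrm-layer usage via the address
 * family's blackhole_route hook: an already-resolved route is swapped
 * for a blackhole copy whose handlers silently discard traffic.
 */
#if 0	/* example only */
	dst = ipv4_blackhole_route(net, dst);	/* releases the old dst ref */
	if (IS_ERR(dst))
		return PTR_ERR(dst);
	/* dst->output == dst_discard: packets routed here go nowhere. */
#endif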
2882 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2885 struct rtable *rt = __ip_route_output_key(net, flp4);
2890 if (flp4->flowi4_proto)
2891 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2892 flowi4_to_flowi(flp4),
2897 EXPORT_SYMBOL_GPL(ip_route_output_flow);
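/*
 * Illustrative sketch, not part of the original file: a typical output
 * lookup fills a flowi4 key, calls ip_route_output_flow(), and drops the
 * reference with ip_rt_put() when finished. The helper name and values
 * are example assumptions.
 */
#if 0	/* example only */
static int example_output_lookup(struct net *net, __be32 daddr)
{
	struct flowi4 fl4 = {
		.daddr		= daddr,
		.flowi4_proto	= IPPROTO_UDP,	/* non-zero => xfrm_lookup runs */
	};
	struct rtable *rt = ip_route_output_flow(net, &fl4, NULL);

	if (IS_ERR(rt))
		return PTR_ERR(rt);
	/* fl4.saddr now holds the source address chosen by the resolver */
	ip_rt_put(rt);
	return 0;
}
#endif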
2899 static int rt_fill_info(struct net *net,
2900 struct sk_buff *skb, u32 pid, u32 seq, int event,
2901 int nowait, unsigned int flags)
2903 struct rtable *rt = skb_rtable(skb);
2905 struct nlmsghdr *nlh;
2906 unsigned long expires = 0;
2907 u32 id = 0, ts = 0, tsage = 0, error;
2909 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2913 r = nlmsg_data(nlh);
2914 r->rtm_family = AF_INET;
2915 r->rtm_dst_len = 32;
2917 r->rtm_tos = rt->rt_key_tos;
2918 r->rtm_table = RT_TABLE_MAIN;
2919 if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2920 goto nla_put_failure;
2921 r->rtm_type = rt->rt_type;
2922 r->rtm_scope = RT_SCOPE_UNIVERSE;
2923 r->rtm_protocol = RTPROT_UNSPEC;
2924 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2925 if (rt->rt_flags & RTCF_NOTIFY)
2926 r->rtm_flags |= RTM_F_NOTIFY;
2928 if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
2929 goto nla_put_failure;
2930 if (rt->rt_key_src) {
2931 r->rtm_src_len = 32;
2932 if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
2933 goto nla_put_failure;
2936 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2937 goto nla_put_failure;
2938 #ifdef CONFIG_IP_ROUTE_CLASSID
2939 if (rt->dst.tclassid &&
2940 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2941 goto nla_put_failure;
2943 if (!rt_is_input_route(rt) &&
2944 rt->rt_src != rt->rt_key_src) {
2945 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
2946 goto nla_put_failure;
2948 if (rt->rt_dst != rt->rt_gateway &&
2949 nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2950 goto nla_put_failure;
2952 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2953 goto nla_put_failure;
2956 nla_put_be32(skb, RTA_MARK, rt->rt_mark))
2957 goto nla_put_failure;
2959 error = rt->dst.error;
2960 if (rt_has_peer(rt)) {
2961 const struct inet_peer *peer = rt_peer_ptr(rt);
2962 inet_peer_refcheck(peer);
2963 id = atomic_read(&peer->ip_id_count) & 0xffff;
2964 if (peer->tcp_ts_stamp) {
2966 tsage = get_seconds() - peer->tcp_ts_stamp;
2968 expires = ACCESS_ONCE(peer->pmtu_expires);
2970 if (time_before(jiffies, expires))
2977 if (rt_is_input_route(rt)) {
2978 #ifdef CONFIG_IP_MROUTE
2979 __be32 dst = rt->rt_dst;
2981 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2982 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2983 int err = ipmr_get_route(net, skb,
2984 rt->rt_src, rt->rt_dst,
2990 goto nla_put_failure;
2992 if (err == -EMSGSIZE)
2993 goto nla_put_failure;
2999 if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
3000 goto nla_put_failure;
3003 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
3004 expires, error) < 0)
3005 goto nla_put_failure;
3007 return nlmsg_end(skb, nlh);
3010 nlmsg_cancel(skb, nlh);
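/*
 * Note: the RTM_NEWROUTE message assembled above is what userspace sees
 * for "ip route get"; rtnl_put_cacheinfo() contributes the RTA_CACHEINFO
 * block (id, timestamps, expiry, error) that iproute2 renders on its
 * "cache" line.
 */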
3014 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
3016 struct net *net = sock_net(in_skb->sk);
3018 struct nlattr *tb[RTA_MAX+1];
3019 struct rtable *rt = NULL;
3025 struct sk_buff *skb;
3027 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3031 rtm = nlmsg_data(nlh);
3033 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3039 /* Reserve room for dummy headers; this skb can pass
3040 through a good chunk of the routing engine.
3042 skb_reset_mac_header(skb);
3043 skb_reset_network_header(skb);
3045 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3046 ip_hdr(skb)->protocol = IPPROTO_ICMP;
3047 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3049 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3050 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3051 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3052 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3055 struct net_device *dev;
3057 dev = __dev_get_by_index(net, iif);
3063 skb->protocol = htons(ETH_P_IP);
3067 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3070 rt = skb_rtable(skb);
3071 if (err == 0 && rt->dst.error)
3072 err = -rt->dst.error;
3074 struct flowi4 fl4 = {
3077 .flowi4_tos = rtm->rtm_tos,
3078 .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3079 .flowi4_mark = mark,
3081 rt = ip_route_output_key(net, &fl4);
3091 skb_dst_set(skb, &rt->dst);
3092 if (rtm->rtm_flags & RTM_F_NOTIFY)
3093 rt->rt_flags |= RTCF_NOTIFY;
3095 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3096 RTM_NEWROUTE, 0, 0);
3100 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
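/*
 * Illustrative sketch, not part of the original file: the request this
 * handler services can be issued over a raw rtnetlink socket roughly as
 * follows (userspace code, standard rtnetlink ABI; the snippet itself is
 * an example assumption).
 */
#if 0	/* userspace example only */
	struct {
		struct nlmsghdr nh;
		struct rtmsg	rtm;
		struct rtattr	rta;
		__be32		dst;
	} req = {
		.nh.nlmsg_len	= sizeof(req),
		.nh.nlmsg_type	= RTM_GETROUTE,
		.nh.nlmsg_flags	= NLM_F_REQUEST,
		.rtm.rtm_family	= AF_INET,
		.rta.rta_type	= RTA_DST,
		.rta.rta_len	= RTA_LENGTH(sizeof(__be32)),
		.dst		= htonl(0x08080808),	/* 8.8.8.8 */
	};
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	send(fd, &req, sizeof(req), 0);
	/* The RTM_NEWROUTE reply is built by rt_fill_info() above. */
#endif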
3109 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3116 net = sock_net(skb->sk);
3121 s_idx = idx = cb->args[1];
3122 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3123 if (!rt_hash_table[h].chain)
3126 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3127 rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3128 if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3130 if (rt_is_expired(rt))
3132 skb_dst_set_noref(skb, &rt->dst);
3133 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3134 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3135 1, NLM_F_MULTI) <= 0) {
3137 rcu_read_unlock_bh();
3142 rcu_read_unlock_bh();
3151 void ip_rt_multicast_event(struct in_device *in_dev)
3153 rt_cache_flush(dev_net(in_dev->dev), 0);
3156 #ifdef CONFIG_SYSCTL
3157 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3158 void __user *buffer,
3159 size_t *lenp, loff_t *ppos)
3166 memcpy(&ctl, __ctl, sizeof(ctl));
3167 ctl.data = &flush_delay;
3168 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3170 net = (struct net *)__ctl->extra1;
3171 rt_cache_flush(net, flush_delay);
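/*
 * Usage note: this handler runs when userspace writes to
 * /proc/sys/net/ipv4/route/flush, e.g.
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * and the written integer is handed to rt_cache_flush() as the delay
 * argument.
 */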
3178 static ctl_table ipv4_route_table[] = {
3180 .procname = "gc_thresh",
3181 .data = &ipv4_dst_ops.gc_thresh,
3182 .maxlen = sizeof(int),
3184 .proc_handler = proc_dointvec,
3187 .procname = "max_size",
3188 .data = &ip_rt_max_size,
3189 .maxlen = sizeof(int),
3191 .proc_handler = proc_dointvec,
3194 /* Deprecated. Use gc_min_interval_ms */
3196 .procname = "gc_min_interval",
3197 .data = &ip_rt_gc_min_interval,
3198 .maxlen = sizeof(int),
3200 .proc_handler = proc_dointvec_jiffies,
3203 .procname = "gc_min_interval_ms",
3204 .data = &ip_rt_gc_min_interval,
3205 .maxlen = sizeof(int),
3207 .proc_handler = proc_dointvec_ms_jiffies,
3210 .procname = "gc_timeout",
3211 .data = &ip_rt_gc_timeout,
3212 .maxlen = sizeof(int),
3214 .proc_handler = proc_dointvec_jiffies,
3217 .procname = "gc_interval",
3218 .data = &ip_rt_gc_interval,
3219 .maxlen = sizeof(int),
3221 .proc_handler = proc_dointvec_jiffies,
3224 .procname = "redirect_load",
3225 .data = &ip_rt_redirect_load,
3226 .maxlen = sizeof(int),
3228 .proc_handler = proc_dointvec,
3231 .procname = "redirect_number",
3232 .data = &ip_rt_redirect_number,
3233 .maxlen = sizeof(int),
3235 .proc_handler = proc_dointvec,
3238 .procname = "redirect_silence",
3239 .data = &ip_rt_redirect_silence,
3240 .maxlen = sizeof(int),
3242 .proc_handler = proc_dointvec,
3245 .procname = "error_cost",
3246 .data = &ip_rt_error_cost,
3247 .maxlen = sizeof(int),
3249 .proc_handler = proc_dointvec,
3252 .procname = "error_burst",
3253 .data = &ip_rt_error_burst,
3254 .maxlen = sizeof(int),
3256 .proc_handler = proc_dointvec,
3259 .procname = "gc_elasticity",
3260 .data = &ip_rt_gc_elasticity,
3261 .maxlen = sizeof(int),
3263 .proc_handler = proc_dointvec,
3266 .procname = "mtu_expires",
3267 .data = &ip_rt_mtu_expires,
3268 .maxlen = sizeof(int),
3270 .proc_handler = proc_dointvec_jiffies,
3273 .procname = "min_pmtu",
3274 .data = &ip_rt_min_pmtu,
3275 .maxlen = sizeof(int),
3277 .proc_handler = proc_dointvec,
3280 .procname = "min_adv_mss",
3281 .data = &ip_rt_min_advmss,
3282 .maxlen = sizeof(int),
3284 .proc_handler = proc_dointvec,
3289 static struct ctl_table ipv4_route_flush_table[] = {
3291 .procname = "flush",
3292 .maxlen = sizeof(int),
3294 .proc_handler = ipv4_sysctl_rtcache_flush,
3299 static __net_init int sysctl_route_net_init(struct net *net)
3301 struct ctl_table *tbl;
3303 tbl = ipv4_route_flush_table;
3304 if (!net_eq(net, &init_net)) {
3305 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3309 tbl[0].extra1 = net;
3311 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3312 if (net->ipv4.route_hdr == NULL)
3317 if (tbl != ipv4_route_flush_table)
3323 static __net_exit void sysctl_route_net_exit(struct net *net)
3325 struct ctl_table *tbl;
3327 tbl = net->ipv4.route_hdr->ctl_table_arg;
3328 unregister_net_sysctl_table(net->ipv4.route_hdr);
3329 BUG_ON(tbl == ipv4_route_flush_table);
3333 static __net_initdata struct pernet_operations sysctl_route_ops = {
3334 .init = sysctl_route_net_init,
3335 .exit = sysctl_route_net_exit,
3339 static __net_init int rt_genid_init(struct net *net)
3341 get_random_bytes(&net->ipv4.rt_genid,
3342 sizeof(net->ipv4.rt_genid));
3343 get_random_bytes(&net->ipv4.dev_addr_genid,
3344 sizeof(net->ipv4.dev_addr_genid));
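/*
 * Note: rt_genid is the per-netns generation count stamped into every
 * cached route; rt_is_expired() compares the stamp against the current
 * value, so bumping the generation lazily invalidates all existing
 * cache entries without walking the hash table.
 */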
3348 static __net_initdata struct pernet_operations rt_genid_ops = {
3349 .init = rt_genid_init,
3352 static int __net_init ipv4_inetpeer_init(struct net *net)
3354 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3358 inet_peer_base_init(bp);
3359 net->ipv4.peers = bp;
3363 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3365 struct inet_peer_base *bp = net->ipv4.peers;
3367 net->ipv4.peers = NULL;
3368 inetpeer_invalidate_tree(bp);
3372 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3373 .init = ipv4_inetpeer_init,
3374 .exit = ipv4_inetpeer_exit,
3377 #ifdef CONFIG_IP_ROUTE_CLASSID
3378 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3379 #endif /* CONFIG_IP_ROUTE_CLASSID */
3381 static __initdata unsigned long rhash_entries;
3382 static int __init set_rhash_entries(char *str)
3389 ret = kstrtoul(str, 0, &rhash_entries);
3395 __setup("rhash_entries=", set_rhash_entries);
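/*
 * Usage note: the route-cache hash size can be pinned on the kernel
 * command line, e.g.
 *
 *	rhash_entries=131072
 *
 * otherwise alloc_large_system_hash() below sizes the table from the
 * amount of available memory.
 */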
3397 int __init ip_rt_init(void)
3401 #ifdef CONFIG_IP_ROUTE_CLASSID
3402 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3404 panic("IP: failed to allocate ip_rt_acct\n");
3407 ipv4_dst_ops.kmem_cachep =
3408 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3409 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3411 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3413 if (dst_entries_init(&ipv4_dst_ops) < 0)
3414 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3416 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3417 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3419 rt_hash_table = (struct rt_hash_bucket *)
3420 alloc_large_system_hash("IP route cache",
3421 sizeof(struct rt_hash_bucket),
3423 (totalram_pages >= 128 * 1024) ?
3429 rhash_entries ? 0 : 512 * 1024);
3430 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3431 rt_hash_lock_init();
3433 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3434 ip_rt_max_size = (rt_hash_mask + 1) * 16;
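/*
 * Worked example with illustrative numbers: a table of 2^19 = 524288
 * buckets gives rt_hash_mask = 524287, an initial gc_thresh of 524288
 * entries and an ip_rt_max_size of 524288 * 16 = 8388608 entries.
 */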
3439 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3440 expires_ljiffies = jiffies;
3441 schedule_delayed_work(&expires_work,
3442 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3444 if (ip_rt_proc_init())
3445 pr_err("Unable to create route proc files\n");
3448 xfrm4_init(ip_rt_max_size);
3450 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3452 #ifdef CONFIG_SYSCTL
3453 register_pernet_subsys(&sysctl_route_ops);
3455 register_pernet_subsys(&rt_genid_ops);
3456 register_pernet_subsys(&ipv4_inetpeer_ops);
3460 #ifdef CONFIG_SYSCTL
3462 * We really need to sanitize the damn ipv4 init order, then all
3463 * this nonsense will go away.
3465 void __init ip_static_sysctl_init(void)
3467 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);