net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <linux/bitops.h>
  77 #include <linux/capability.h>
  78 #include <linux/cpu.h>
  79 #include <linux/types.h>
  80 #include <linux/kernel.h>
  81 #include <linux/hash.h>
  82 #include <linux/slab.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <net/net_namespace.h>
  98 #include <net/sock.h>
  99 #include <linux/rtnetlink.h>
 100 #include <linux/stat.h>
 101 #include <net/dst.h>
 102 #include <net/pkt_sched.h>
 103 #include <net/checksum.h>
 104 #include <net/xfrm.h>
 105 #include <linux/highmem.h>
 106 #include <linux/init.h>
 107 #include <linux/module.h>
 108 #include <linux/netpoll.h>
 109 #include <linux/rcupdate.h>
 110 #include <linux/delay.h>
 111 #include <net/iw_handler.h>
 112 #include <asm/current.h>
 113 #include <linux/audit.h>
 114 #include <linux/dmaengine.h>
 115 #include <linux/err.h>
 116 #include <linux/ctype.h>
 117 #include <linux/if_arp.h>
 118 #include <linux/if_vlan.h>
 119 #include <linux/ip.h>
 120 #include <net/ip.h>
 121 #include <net/mpls.h>
 122 #include <linux/ipv6.h>
 123 #include <linux/in.h>
 124 #include <linux/jhash.h>
 125 #include <linux/random.h>
 126 #include <trace/events/napi.h>
 127 #include <trace/events/net.h>
 128 #include <trace/events/skb.h>
 129 #include <linux/pci.h>
 130 #include <linux/inetdevice.h>
 131 #include <linux/cpu_rmap.h>
 132 #include <linux/static_key.h>
 133 #include <linux/hashtable.h>
 134 #include <linux/vmalloc.h>
 135 #include <linux/if_macvlan.h>
 136 #include <linux/errqueue.h>
 137 #include <linux/hrtimer.h>
 138
 139 #include "net-sysfs.h"
 140
 141 /* Instead of increasing this, you should create a hash table. */
 142 #define MAX_GRO_SKBS 8
 143
 144 /* This should be increased if a protocol with a bigger head is added. */
 145 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 146
 147 static DEFINE_SPINLOCK(ptype_lock);
 148 static DEFINE_SPINLOCK(offload_lock);
 149 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 150 struct list_head ptype_all __read_mostly;       /* Taps */
 151 static struct list_head offload_base __read_mostly;
 152
 153 static int netif_rx_internal(struct sk_buff *skb);
 154 static int call_netdevice_notifiers_info(unsigned long val,
 155                                          struct net_device *dev,
 156                                          struct netdev_notifier_info *info);
 157
 158 /*
 159  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 160  * semaphore.
 161  *
 162  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 163  *
 164  * Writers must hold the rtnl semaphore while they loop through the
 165  * dev_base_head list, and hold dev_base_lock for writing when they do the
 166  * actual updates.  This allows pure readers to access the list even
 167  * while a writer is preparing to update it.
 168  *
 169  * To put it another way, dev_base_lock is held for writing only to
 170  * protect against pure readers; the rtnl semaphore provides the
 171  * protection against other writers.
 172  *
 173  * See, for example usages, register_netdevice() and
 174  * unregister_netdevice(), which must be called with the rtnl
 175  * semaphore held.
 176  */
 177 DEFINE_RWLOCK(dev_base_lock);
 178 EXPORT_SYMBOL(dev_base_lock);
 179
 180 /* protects napi_hash addition/deletion and napi_gen_id */
 181 static DEFINE_SPINLOCK(napi_hash_lock);
 182
 183 static unsigned int napi_gen_id;
 184 static DEFINE_HASHTABLE(napi_hash, 8);
 185
 186 static seqcount_t devnet_rename_seq;
 187
 188 static inline void dev_base_seq_inc(struct net *net)
 189 {
 190         while (++net->dev_base_seq == 0);
 191 }
 192
 193 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 194 {
 195         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 196
 197         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 198 }
 199
 200 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 201 {
 202         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 203 }
 204
 205 static inline void rps_lock(struct softnet_data *sd)
 206 {
 207 #ifdef CONFIG_RPS
 208         spin_lock(&sd->input_pkt_queue.lock);
 209 #endif
 210 }
 211
 212 static inline void rps_unlock(struct softnet_data *sd)
 213 {
 214 #ifdef CONFIG_RPS
 215         spin_unlock(&sd->input_pkt_queue.lock);
 216 #endif
 217 }
 218
 219 /* Device list insertion */
 220 static void list_netdevice(struct net_device *dev)
 221 {
 222         struct net *net = dev_net(dev);
 223
 224         ASSERT_RTNL();
 225
 226         write_lock_bh(&dev_base_lock);
 227         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 228         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 229         hlist_add_head_rcu(&dev->index_hlist,
 230                            dev_index_hash(net, dev->ifindex));
 231         write_unlock_bh(&dev_base_lock);
 232
 233         dev_base_seq_inc(net);
 234 }
 235
 236 /* Device list removal
 237  * caller must respect a RCU grace period before freeing/reusing dev
 238  */
 239 static void unlist_netdevice(struct net_device *dev)
 240 {
 241         ASSERT_RTNL();
 242
 243         /* Unlink dev from the device chain */
 244         write_lock_bh(&dev_base_lock);
 245         list_del_rcu(&dev->dev_list);
 246         hlist_del_rcu(&dev->name_hlist);
 247         hlist_del_rcu(&dev->index_hlist);
 248         write_unlock_bh(&dev_base_lock);
 249
 250         dev_base_seq_inc(dev_net(dev));
 251 }
 252
 253 /*
 254  *      Our notifier list
 255  */
 256
 257 static RAW_NOTIFIER_HEAD(netdev_chain);
 258
 259 /*
 260  *      Device drivers call our routines to queue packets here. We empty the
 261  *      queue in the local softnet handler.
 262  */
 263
 264 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 265 EXPORT_PER_CPU_SYMBOL(softnet_data);
 266
 267 #ifdef CONFIG_LOCKDEP
 268 /*
 269  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 270  * according to dev->type
 271  */
 272 static const unsigned short netdev_lock_type[] =
 273         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 274          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 275          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 276          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 277          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 278          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 279          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 280          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 281          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 282          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 283          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 284          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 285          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 286          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 287          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 288
 289 static const char *const netdev_lock_name[] =
 290         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 291          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 292          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 293          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 294          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 295          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 296          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 297          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 298          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 299          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 300          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 301          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 302          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 303          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 304          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 305
 306 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 307 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 308
 309 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 310 {
 311         int i;
 312
 313         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 314                 if (netdev_lock_type[i] == dev_type)
 315                         return i;
 316         /* the last key is used by default */
 317         return ARRAY_SIZE(netdev_lock_type) - 1;
 318 }
 319
 320 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 321                                                  unsigned short dev_type)
 322 {
 323         int i;
 324
 325         i = netdev_lock_pos(dev_type);
 326         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 327                                    netdev_lock_name[i]);
 328 }
 329
 330 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 331 {
 332         int i;
 333
 334         i = netdev_lock_pos(dev->type);
 335         lockdep_set_class_and_name(&dev->addr_list_lock,
 336                                    &netdev_addr_lock_key[i],
 337                                    netdev_lock_name[i]);
 338 }
 339 #else
 340 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 341                                                  unsigned short dev_type)
 342 {
 343 }
 344 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 345 {
 346 }
 347 #endif
 348
 349 /*******************************************************************************
 350
 351                 Protocol management and registration routines
 352
 353 *******************************************************************************/
 354
 355 /*
 356  *      Add a protocol ID to the list. Now that the input handler is
 357  *      smarter we can dispense with all the messy stuff that used to be
 358  *      here.
 359  *
 360  *      BEWARE!!! Protocol handlers, mangling input packets,
 361  *      MUST BE last in hash buckets and checking protocol handlers
 362  *      MUST start from promiscuous ptype_all chain in net_bh.
 363  *      It is true now, do not change it.
 364  *      Explanation follows: if protocol handler, mangling packet, will
 365  *      be the first on list, it is not able to sense, that packet
 366  *      is cloned and should be copied-on-write, so that it will
 367  *      change it and subsequent readers will get broken packet.
 368  *                                                      --ANK (980803)
 369  */
 370
 371 static inline struct list_head *ptype_head(const struct packet_type *pt)
 372 {
 373         if (pt->type == htons(ETH_P_ALL))
 374                 return pt->dev ? &pt->dev->ptype_all : &ptype_all;
 375         else
 376                 return pt->dev ? &pt->dev->ptype_specific :
 377                                  &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 378 }
 379
 380 /**
 381  *      dev_add_pack - add packet handler
 382  *      @pt: packet type declaration
 383  *
 384  *      Add a protocol handler to the networking stack. The passed &packet_type
 385  *      is linked into kernel lists and may not be freed until it has been
 386  *      removed from the kernel lists.
 387  *
 388  *      This call does not sleep therefore it can not
 389  *      guarantee all CPU's that are in middle of receiving packets
 390  *      will see the new packet type (until the next received packet).
 391  */
 392
 393 void dev_add_pack(struct packet_type *pt)
 394 {
 395         struct list_head *head = ptype_head(pt);
 396
 397         spin_lock(&ptype_lock);
 398         list_add_rcu(&pt->list, head);
 399         spin_unlock(&ptype_lock);
 400 }
 401 EXPORT_SYMBOL(dev_add_pack);
 402
 403 /**
 404  *      __dev_remove_pack        - remove packet handler
 405  *      @pt: packet type declaration
 406  *
 407  *      Remove a protocol handler that was previously added to the kernel
 408  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 409  *      from the kernel lists and can be freed or reused once this function
 410  *      returns.
 411  *
 412  *      The packet type might still be in use by receivers
 413  *      and must not be freed until after all the CPU's have gone
 414  *      through a quiescent state.
 415  */
 416 void __dev_remove_pack(struct packet_type *pt)
 417 {
 418         struct list_head *head = ptype_head(pt);
 419         struct packet_type *pt1;
 420
 421         spin_lock(&ptype_lock);
 422
 423         list_for_each_entry(pt1, head, list) {
 424                 if (pt == pt1) {
 425                         list_del_rcu(&pt->list);
 426                         goto out;
 427                 }
 428         }
 429
 430         pr_warn("dev_remove_pack: %p not found\n", pt);
 431 out:
 432         spin_unlock(&ptype_lock);
 433 }
 434 EXPORT_SYMBOL(__dev_remove_pack);
 435
 436 /**
 437  *      dev_remove_pack  - remove packet handler
 438  *      @pt: packet type declaration
 439  *
 440  *      Remove a protocol handler that was previously added to the kernel
 441  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 442  *      from the kernel lists and can be freed or reused once this function
 443  *      returns.
 444  *
 445  *      This call sleeps to guarantee that no CPU is looking at the packet
 446  *      type after return.
 447  */
 448 void dev_remove_pack(struct packet_type *pt)
 449 {
 450         __dev_remove_pack(pt);
 451
 452         synchronize_net();
 453 }
 454 EXPORT_SYMBOL(dev_remove_pack);
 455
 456
 457 /**
 458  *      dev_add_offload - register offload handlers
 459  *      @po: protocol offload declaration
 460  *
 461  *      Add protocol offload handlers to the networking stack. The passed
 462  *      &proto_offload is linked into kernel lists and may not be freed until
 463  *      it has been removed from the kernel lists.
 464  *
 465  *      This call does not sleep therefore it can not
 466  *      guarantee all CPU's that are in middle of receiving packets
 467  *      will see the new offload handlers (until the next received packet).
 468  */
 469 void dev_add_offload(struct packet_offload *po)
 470 {
 471         struct list_head *head = &offload_base;
 472
 473         spin_lock(&offload_lock);
 474         list_add_rcu(&po->list, head);
 475         spin_unlock(&offload_lock);
 476 }
 477 EXPORT_SYMBOL(dev_add_offload);
 478
 479 /**
 480  *      __dev_remove_offload     - remove offload handler
 481  *      @po: packet offload declaration
 482  *
 483  *      Remove a protocol offload handler that was previously added to the
 484  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 485  *      is removed from the kernel lists and can be freed or reused once this
 486  *      function returns.
 487  *
 488  *      The packet type might still be in use by receivers
 489  *      and must not be freed until after all the CPU's have gone
 490  *      through a quiescent state.
 491  */
 492 static void __dev_remove_offload(struct packet_offload *po)
 493 {
 494         struct list_head *head = &offload_base;
 495         struct packet_offload *po1;
 496
 497         spin_lock(&offload_lock);
 498
 499         list_for_each_entry(po1, head, list) {
 500                 if (po == po1) {
 501                         list_del_rcu(&po->list);
 502                         goto out;
 503                 }
 504         }
 505
 506         pr_warn("dev_remove_offload: %p not found\n", po);
 507 out:
 508         spin_unlock(&offload_lock);
 509 }
 510
 511 /**
 512  *      dev_remove_offload       - remove packet offload handler
 513  *      @po: packet offload declaration
 514  *
 515  *      Remove a packet offload handler that was previously added to the kernel
 516  *      offload handlers by dev_add_offload(). The passed &offload_type is
 517  *      removed from the kernel lists and can be freed or reused once this
 518  *      function returns.
 519  *
 520  *      This call sleeps to guarantee that no CPU is looking at the packet
 521  *      type after return.
 522  */
 523 void dev_remove_offload(struct packet_offload *po)
 524 {
 525         __dev_remove_offload(po);
 526
 527         synchronize_net();
 528 }
 529 EXPORT_SYMBOL(dev_remove_offload);
 530
 531 /******************************************************************************
 532
 533                       Device Boot-time Settings Routines
 534
 535 *******************************************************************************/
 536
 537 /* Boot time configuration table */
 538 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 539
 540 /**
 541  *      netdev_boot_setup_add   - add new setup entry
 542  *      @name: name of the device
 543  *      @map: configured settings for the device
 544  *
 545  *      Adds new setup entry to the dev_boot_setup list.  The function
 546  *      returns 0 on error and 1 on success.  This is a generic routine to
 547  *      all netdevices.
 548  */
 549 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 550 {
 551         struct netdev_boot_setup *s;
 552         int i;
 553
 554         s = dev_boot_setup;
 555         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 556                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 557                         memset(s[i].name, 0, sizeof(s[i].name));
 558                         strlcpy(s[i].name, name, IFNAMSIZ);
 559                         memcpy(&s[i].map, map, sizeof(s[i].map));
 560                         break;
 561                 }
 562         }
 563
 564         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 565 }
 566
 567 /**
 568  *      netdev_boot_setup_check - check boot time settings
 569  *      @dev: the netdevice
 570  *
 571  *      Check boot time settings for the device.
 572  *      The found settings are set for the device to be used
 573  *      later in the device probing.
 574  *      Returns 0 if no settings found, 1 if they are.
 575  */
 576 int netdev_boot_setup_check(struct net_device *dev)
 577 {
 578         struct netdev_boot_setup *s = dev_boot_setup;
 579         int i;
 580
 581         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 582                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 583                     !strcmp(dev->name, s[i].name)) {
 584                         dev->irq        = s[i].map.irq;
 585                         dev->base_addr  = s[i].map.base_addr;
 586                         dev->mem_start  = s[i].map.mem_start;
 587                         dev->mem_end    = s[i].map.mem_end;
 588                         return 1;
 589                 }
 590         }
 591         return 0;
 592 }
 593 EXPORT_SYMBOL(netdev_boot_setup_check);
 594
 595
 596 /**
 597  *      netdev_boot_base        - get address from boot time settings
 598  *      @prefix: prefix for network device
 599  *      @unit: id for network device
 600  *
 601  *      Check boot time settings for the base address of device.
 602  *      The found settings are set for the device to be used
 603  *      later in the device probing.
 604  *      Returns 0 if no settings found.
 605  */
 606 unsigned long netdev_boot_base(const char *prefix, int unit)
 607 {
 608         const struct netdev_boot_setup *s = dev_boot_setup;
 609         char name[IFNAMSIZ];
 610         int i;
 611
 612         sprintf(name, "%s%d", prefix, unit);
 613
 614         /*
 615          * If device already registered then return base of 1
 616          * to indicate not to probe for this interface
 617          */
 618         if (__dev_get_by_name(&init_net, name))
 619                 return 1;
 620
 621         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 622                 if (!strcmp(name, s[i].name))
 623                         return s[i].map.base_addr;
 624         return 0;
 625 }
 626
 627 /*
 628  * Saves at boot time configured settings for any netdevice.
 629  */
 630 int __init netdev_boot_setup(char *str)
 631 {
 632         int ints[5];
 633         struct ifmap map;
 634
 635         str = get_options(str, ARRAY_SIZE(ints), ints);
 636         if (!str || !*str)
 637                 return 0;
 638
 639         /* Save settings */
 640         memset(&map, 0, sizeof(map));
 641         if (ints[0] > 0)
 642                 map.irq = ints[1];
 643         if (ints[0] > 1)
 644                 map.base_addr = ints[2];
 645         if (ints[0] > 2)
 646                 map.mem_start = ints[3];
 647         if (ints[0] > 3)
 648                 map.mem_end = ints[4];
 649
 650         /* Add new entry to the list */
 651         return netdev_boot_setup_add(str, &map);
 652 }
 653
 654 __setup("netdev=", netdev_boot_setup);
 655
 656 /*******************************************************************************
 657
 658                             Device Interface Subroutines
 659
 660 *******************************************************************************/
 661
 662 /**
 663  *      __dev_get_by_name       - find a device by its name
 664  *      @net: the applicable net namespace
 665  *      @name: name to find
 666  *
 667  *      Find an interface by name. Must be called under RTNL semaphore
 668  *      or @dev_base_lock. If the name is found a pointer to the device
 669  *      is returned. If the name is not found then %NULL is returned. The
 670  *      reference counters are not incremented so the caller must be
 671  *      careful with locks.
 672  */
 673
 674 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 675 {
 676         struct net_device *dev;
 677         struct hlist_head *head = dev_name_hash(net, name);
 678
 679         hlist_for_each_entry(dev, head, name_hlist)
 680                 if (!strncmp(dev->name, name, IFNAMSIZ))
 681                         return dev;
 682
 683         return NULL;
 684 }
 685 EXPORT_SYMBOL(__dev_get_by_name);
 686
 687 /**
 688  *      dev_get_by_name_rcu     - find a device by its name
 689  *      @net: the applicable net namespace
 690  *      @name: name to find
 691  *
 692  *      Find an interface by name.
 693  *      If the name is found a pointer to the device is returned.
 694  *      If the name is not found then %NULL is returned.
 695  *      The reference counters are not incremented so the caller must be
 696  *      careful with locks. The caller must hold RCU lock.
 697  */
 698
 699 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 700 {
 701         struct net_device *dev;
 702         struct hlist_head *head = dev_name_hash(net, name);
 703
 704         hlist_for_each_entry_rcu(dev, head, name_hlist)
 705                 if (!strncmp(dev->name, name, IFNAMSIZ))
 706                         return dev;
 707
 708         return NULL;
 709 }
 710 EXPORT_SYMBOL(dev_get_by_name_rcu);
 711
 712 /**
 713  *      dev_get_by_name         - find a device by its name
 714  *      @net: the applicable net namespace
 715  *      @name: name to find
 716  *
 717  *      Find an interface by name. This can be called from any
 718  *      context and does its own locking. The returned handle has
 719  *      the usage count incremented and the caller must use dev_put() to
 720  *      release it when it is no longer needed. %NULL is returned if no
 721  *      matching device is found.
 722  */
 723
 724 struct net_device *dev_get_by_name(struct net *net, const char *name)
 725 {
 726         struct net_device *dev;
 727
 728         rcu_read_lock();
 729         dev = dev_get_by_name_rcu(net, name);
 730         if (dev)
 731                 dev_hold(dev);
 732         rcu_read_unlock();
 733         return dev;
 734 }
 735 EXPORT_SYMBOL(dev_get_by_name);
 736
 737 /**
 738  *      __dev_get_by_index - find a device by its ifindex
 739  *      @net: the applicable net namespace
 740  *      @ifindex: index of device
 741  *
 742  *      Search for an interface by index. Returns %NULL if the device
 743  *      is not found or a pointer to the device. The device has not
 744  *      had its reference counter increased so the caller must be careful
 745  *      about locking. The caller must hold either the RTNL semaphore
 746  *      or @dev_base_lock.
 747  */
 748
 749 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 750 {
 751         struct net_device *dev;
 752         struct hlist_head *head = dev_index_hash(net, ifindex);
 753
 754         hlist_for_each_entry(dev, head, index_hlist)
 755                 if (dev->ifindex == ifindex)
 756                         return dev;
 757
 758         return NULL;
 759 }
 760 EXPORT_SYMBOL(__dev_get_by_index);
 761
 762 /**
 763  *      dev_get_by_index_rcu - find a device by its ifindex
 764  *      @net: the applicable net namespace
 765  *      @ifindex: index of device
 766  *
 767  *      Search for an interface by index. Returns %NULL if the device
 768  *      is not found or a pointer to the device. The device has not
 769  *      had its reference counter increased so the caller must be careful
 770  *      about locking. The caller must hold RCU lock.
 771  */
 772
 773 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 774 {
 775         struct net_device *dev;
 776         struct hlist_head *head = dev_index_hash(net, ifindex);
 777
 778         hlist_for_each_entry_rcu(dev, head, index_hlist)
 779                 if (dev->ifindex == ifindex)
 780                         return dev;
 781
 782         return NULL;
 783 }
 784 EXPORT_SYMBOL(dev_get_by_index_rcu);
 785
 786
 787 /**
 788  *      dev_get_by_index - find a device by its ifindex
 789  *      @net: the applicable net namespace
 790  *      @ifindex: index of device
 791  *
 792  *      Search for an interface by index. Returns NULL if the device
 793  *      is not found or a pointer to the device. The device returned has
 794  *      had a reference added and the pointer is safe until the user calls
 795  *      dev_put to indicate they have finished with it.
 796  */
 797
 798 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 799 {
 800         struct net_device *dev;
 801
 802         rcu_read_lock();
 803         dev = dev_get_by_index_rcu(net, ifindex);
 804         if (dev)
 805                 dev_hold(dev);
 806         rcu_read_unlock();
 807         return dev;
 808 }
 809 EXPORT_SYMBOL(dev_get_by_index);
 810
 811 /**
 812  *      netdev_get_name - get a netdevice name, knowing its ifindex.
 813  *      @net: network namespace
 814  *      @name: a pointer to the buffer where the name will be stored.
 815  *      @ifindex: the ifindex of the interface to get the name from.
 816  *
 817  *      The use of raw_seqcount_begin() and cond_resched() before
 818  *      retrying is required as we want to give the writers a chance
 819  *      to complete when CONFIG_PREEMPT is not set.
 820  */
 821 int netdev_get_name(struct net *net, char *name, int ifindex)
 822 {
 823         struct net_device *dev;
 824         unsigned int seq;
 825
 826 retry:
 827         seq = raw_seqcount_begin(&devnet_rename_seq);
 828         rcu_read_lock();
 829         dev = dev_get_by_index_rcu(net, ifindex);
 830         if (!dev) {
 831                 rcu_read_unlock();
 832                 return -ENODEV;
 833         }
 834
 835         strcpy(name, dev->name);
 836         rcu_read_unlock();
 837         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 838                 cond_resched();
 839                 goto retry;
 840         }
 841
 842         return 0;
 843 }
 844
 845 /**
 846  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 847  *      @net: the applicable net namespace
 848  *      @type: media type of device
 849  *      @ha: hardware address
 850  *
 851  *      Search for an interface by MAC address. Returns NULL if the device
 852  *      is not found or a pointer to the device.
 853  *      The caller must hold RCU or RTNL.
 854  *      The returned device has not had its ref count increased
 855  *      and the caller must therefore be careful about locking
 856  *
 857  */
 858
 859 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 860                                        const char *ha)
 861 {
 862         struct net_device *dev;
 863
 864         for_each_netdev_rcu(net, dev)
 865                 if (dev->type == type &&
 866                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 867                         return dev;
 868
 869         return NULL;
 870 }
 871 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 872
 873 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 874 {
 875         struct net_device *dev;
 876
 877         ASSERT_RTNL();
 878         for_each_netdev(net, dev)
 879                 if (dev->type == type)
 880                         return dev;
 881
 882         return NULL;
 883 }
 884 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 885
 886 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 887 {
 888         struct net_device *dev, *ret = NULL;
 889
 890         rcu_read_lock();
 891         for_each_netdev_rcu(net, dev)
 892                 if (dev->type == type) {
 893                         dev_hold(dev);
 894                         ret = dev;
 895                         break;
 896                 }
 897         rcu_read_unlock();
 898         return ret;
 899 }
 900 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 901
 902 /**
 903  *      __dev_get_by_flags - find any device with given flags
 904  *      @net: the applicable net namespace
 905  *      @if_flags: IFF_* values
 906  *      @mask: bitmask of bits in if_flags to check
 907  *
 908  *      Search for any interface with the given flags. Returns NULL if a device
 909  *      is not found or a pointer to the device. Must be called inside
 910  *      rtnl_lock(), and result refcount is unchanged.
 911  */
 912
 913 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 914                                       unsigned short mask)
 915 {
 916         struct net_device *dev, *ret;
 917
 918         ASSERT_RTNL();
 919
 920         ret = NULL;
 921         for_each_netdev(net, dev) {
 922                 if (((dev->flags ^ if_flags) & mask) == 0) {
 923                         ret = dev;
 924                         break;
 925                 }
 926         }
 927         return ret;
 928 }
 929 EXPORT_SYMBOL(__dev_get_by_flags);
 930
 931 /**
 932  *      dev_valid_name - check if name is okay for network device
 933  *      @name: name string
 934  *
 935  *      Network device names need to be valid file names to
 936  *      to allow sysfs to work.  We also disallow any kind of
 937  *      whitespace.
 938  */
 939 bool dev_valid_name(const char *name)
 940 {
 941         if (*name == '\0')
 942                 return false;
 943         if (strlen(name) >= IFNAMSIZ)
 944                 return false;
 945         if (!strcmp(name, ".") || !strcmp(name, ".."))
 946                 return false;
 947
 948         while (*name) {
 949                 if (*name == '/' || isspace(*name))
 950                         return false;
 951                 name++;
 952         }
 953         return true;
 954 }
 955 EXPORT_SYMBOL(dev_valid_name);
 956
 957 /**
 958  *      __dev_alloc_name - allocate a name for a device
 959  *      @net: network namespace to allocate the device name in
 960  *      @name: name format string
 961  *      @buf:  scratch buffer and result name string
 962  *
 963  *      Passed a format string - eg "lt%d" it will try and find a suitable
 964  *      id. It scans list of devices to build up a free map, then chooses
 965  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 966  *      while allocating the name and adding the device in order to avoid
 967  *      duplicates.
 968  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 969  *      Returns the number of the unit assigned or a negative errno code.
 970  */
 971
 972 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 973 {
 974         int i = 0;
 975         const char *p;
 976         const int max_netdevices = 8*PAGE_SIZE;
 977         unsigned long *inuse;
 978         struct net_device *d;
 979
 980         p = strnchr(name, IFNAMSIZ-1, '%');
 981         if (p) {
 982                 /*
 983                  * Verify the string as this thing may have come from
 984                  * the user.  There must be either one "%d" and no other "%"
 985                  * characters.
 986                  */
 987                 if (p[1] != 'd' || strchr(p + 2, '%'))
 988                         return -EINVAL;
 989
 990                 /* Use one page as a bit array of possible slots */
 991                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 992                 if (!inuse)
 993                         return -ENOMEM;
 994
 995                 for_each_netdev(net, d) {
 996                         if (!sscanf(d->name, name, &i))
 997                                 continue;
 998                         if (i < 0 || i >= max_netdevices)
 999                                 continue;
1000
1001                         /*  avoid cases where sscanf is not exact inverse of printf */
1002                         snprintf(buf, IFNAMSIZ, name, i);
1003                         if (!strncmp(buf, d->name, IFNAMSIZ))
1004                                 set_bit(i, inuse);
1005                 }
1006
1007                 i = find_first_zero_bit(inuse, max_netdevices);
1008                 free_page((unsigned long) inuse);
1009         }
1010
1011         if (buf != name)
1012                 snprintf(buf, IFNAMSIZ, name, i);
1013         if (!__dev_get_by_name(net, buf))
1014                 return i;
1015
1016         /* It is possible to run out of possible slots
1017          * when the name is long and there isn't enough space left
1018          * for the digits, or if all bits are used.
1019          */
1020         return -ENFILE;
1021 }
1022
1023 /**
1024  *      dev_alloc_name - allocate a name for a device
1025  *      @dev: device
1026  *      @name: name format string
1027  *
1028  *      Passed a format string - eg "lt%d" it will try and find a suitable
1029  *      id. It scans list of devices to build up a free map, then chooses
1030  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1031  *      while allocating the name and adding the device in order to avoid
1032  *      duplicates.
1033  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1034  *      Returns the number of the unit assigned or a negative errno code.
1035  */
1036
1037 int dev_alloc_name(struct net_device *dev, const char *name)
1038 {
1039         char buf[IFNAMSIZ];
1040         struct net *net;
1041         int ret;
1042
1043         BUG_ON(!dev_net(dev));
1044         net = dev_net(dev);
1045         ret = __dev_alloc_name(net, name, buf);
1046         if (ret >= 0)
1047                 strlcpy(dev->name, buf, IFNAMSIZ);
1048         return ret;
1049 }
1050 EXPORT_SYMBOL(dev_alloc_name);
1051
1052 static int dev_alloc_name_ns(struct net *net,
1053                              struct net_device *dev,
1054                              const char *name)
1055 {
1056         char buf[IFNAMSIZ];
1057         int ret;
1058
1059         ret = __dev_alloc_name(net, name, buf);
1060         if (ret >= 0)
1061                 strlcpy(dev->name, buf, IFNAMSIZ);
1062         return ret;
1063 }
1064
1065 static int dev_get_valid_name(struct net *net,
1066                               struct net_device *dev,
1067                               const char *name)
1068 {
1069         BUG_ON(!net);
1070
1071         if (!dev_valid_name(name))
1072                 return -EINVAL;
1073
1074         if (strchr(name, '%'))
1075                 return dev_alloc_name_ns(net, dev, name);
1076         else if (__dev_get_by_name(net, name))
1077                 return -EEXIST;
1078         else if (dev->name != name)
1079                 strlcpy(dev->name, name, IFNAMSIZ);
1080
1081         return 0;
1082 }
1083
1084 /**
1085  *      dev_change_name - change name of a device
1086  *      @dev: device
1087  *      @newname: name (or format string) must be at least IFNAMSIZ
1088  *
1089  *      Change name of a device, can pass format strings "eth%d".
1090  *      for wildcarding.
1091  */
1092 int dev_change_name(struct net_device *dev, const char *newname)
1093 {
1094         unsigned char old_assign_type;
1095         char oldname[IFNAMSIZ];
1096         int err = 0;
1097         int ret;
1098         struct net *net;
1099
1100         ASSERT_RTNL();
1101         BUG_ON(!dev_net(dev));
1102
1103         net = dev_net(dev);
1104         if (dev->flags & IFF_UP)
1105                 return -EBUSY;
1106
1107         write_seqcount_begin(&devnet_rename_seq);
1108
1109         if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1110                 write_seqcount_end(&devnet_rename_seq);
1111                 return 0;
1112         }
1113
1114         memcpy(oldname, dev->name, IFNAMSIZ);
1115
1116         err = dev_get_valid_name(net, dev, newname);
1117         if (err < 0) {
1118                 write_seqcount_end(&devnet_rename_seq);
1119                 return err;
1120         }
1121
1122         if (oldname[0] && !strchr(oldname, '%'))
1123                 netdev_info(dev, "renamed from %s\n", oldname);
1124
1125         old_assign_type = dev->name_assign_type;
1126         dev->name_assign_type = NET_NAME_RENAMED;
1127
1128 rollback:
1129         ret = device_rename(&dev->dev, dev->name);
1130         if (ret) {
1131                 memcpy(dev->name, oldname, IFNAMSIZ);
1132                 dev->name_assign_type = old_assign_type;
1133                 write_seqcount_end(&devnet_rename_seq);
1134                 return ret;
1135         }
1136
1137         write_seqcount_end(&devnet_rename_seq);
1138
1139         netdev_adjacent_rename_links(dev, oldname);
1140
1141         write_lock_bh(&dev_base_lock);
1142         hlist_del_rcu(&dev->name_hlist);
1143         write_unlock_bh(&dev_base_lock);
1144
1145         synchronize_rcu();
1146
1147         write_lock_bh(&dev_base_lock);
1148         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1149         write_unlock_bh(&dev_base_lock);
1150
1151         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1152         ret = notifier_to_errno(ret);
1153
1154         if (ret) {
1155                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1156                 if (err >= 0) {
1157                         err = ret;
1158                         write_seqcount_begin(&devnet_rename_seq);
1159                         memcpy(dev->name, oldname, IFNAMSIZ);
1160                         memcpy(oldname, newname, IFNAMSIZ);
1161                         dev->name_assign_type = old_assign_type;
1162                         old_assign_type = NET_NAME_RENAMED;
1163                         goto rollback;
1164                 } else {
1165                         pr_err("%s: name change rollback failed: %d\n",
1166                                dev->name, ret);
1167                 }
1168         }
1169
1170         return err;
1171 }
1172
1173 /**
1174  *      dev_set_alias - change ifalias of a device
1175  *      @dev: device
1176  *      @alias: name up to IFALIASZ
1177  *      @len: limit of bytes to copy from info
1178  *
1179  *      Set ifalias for a device,
1180  */
1181 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1182 {
1183         char *new_ifalias;
1184
1185         ASSERT_RTNL();
1186
1187         if (len >= IFALIASZ)
1188                 return -EINVAL;
1189
1190         if (!len) {
1191                 kfree(dev->ifalias);
1192                 dev->ifalias = NULL;
1193                 return 0;
1194         }
1195
1196         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1197         if (!new_ifalias)
1198                 return -ENOMEM;
1199         dev->ifalias = new_ifalias;
1200
1201         strlcpy(dev->ifalias, alias, len+1);
1202         return len;
1203 }
1204
1205
1206 /**
1207  *      netdev_features_change - device changes features
1208  *      @dev: device to cause notification
1209  *
1210  *      Called to indicate a device has changed features.
1211  */
1212 void netdev_features_change(struct net_device *dev)
1213 {
1214         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1215 }
1216 EXPORT_SYMBOL(netdev_features_change);
1217
1218 /**
1219  *      netdev_state_change - device changes state
1220  *      @dev: device to cause notification
1221  *
1222  *      Called to indicate a device has changed state. This function calls
1223  *      the notifier chains for netdev_chain and sends a NEWLINK message
1224  *      to the routing socket.
1225  */
1226 void netdev_state_change(struct net_device *dev)
1227 {
1228         if (dev->flags & IFF_UP) {
1229                 struct netdev_notifier_change_info change_info;
1230
1231                 change_info.flags_changed = 0;
1232                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1233                                               &change_info.info);
1234                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1235         }
1236 }
1237 EXPORT_SYMBOL(netdev_state_change);
1238
1239 /**
1240  *      netdev_notify_peers - notify network peers about existence of @dev
1241  *      @dev: network device
1242  *
1243  * Generate traffic such that interested network peers are aware of
1244  * @dev, such as by generating a gratuitous ARP. This may be used when
1245  * a device wants to inform the rest of the network about some sort of
1246  * reconfiguration such as a failover event or virtual machine
1247  * migration.
1248  */
1249 void netdev_notify_peers(struct net_device *dev)
1250 {
1251         rtnl_lock();
1252         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1253         rtnl_unlock();
1254 }
1255 EXPORT_SYMBOL(netdev_notify_peers);
1256
1257 static int __dev_open(struct net_device *dev)
1258 {
1259         const struct net_device_ops *ops = dev->netdev_ops;
1260         int ret;
1261
1262         ASSERT_RTNL();
1263
1264         if (!netif_device_present(dev))
1265                 return -ENODEV;
1266
1267         /* Block netpoll from trying to do any rx path servicing.
1268          * If we don't do this there is a chance ndo_poll_controller
1269          * or ndo_poll may be running while we open the device
1270          */
1271         netpoll_poll_disable(dev);
1272
1273         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1274         ret = notifier_to_errno(ret);
1275         if (ret)
1276                 return ret;
1277
1278         set_bit(__LINK_STATE_START, &dev->state);
1279
1280         if (ops->ndo_validate_addr)
1281                 ret = ops->ndo_validate_addr(dev);
1282
1283         if (!ret && ops->ndo_open)
1284                 ret = ops->ndo_open(dev);
1285
1286         netpoll_poll_enable(dev);
1287
1288         if (ret)
1289                 clear_bit(__LINK_STATE_START, &dev->state);
1290         else {
1291                 dev->flags |= IFF_UP;
1292                 dev_set_rx_mode(dev);
1293                 dev_activate(dev);
1294                 add_device_randomness(dev->dev_addr, dev->addr_len);
1295         }
1296
1297         return ret;
1298 }
1299
1300 /**
1301  *      dev_open        - prepare an interface for use.
1302  *      @dev:   device to open
1303  *
1304  *      Takes a device from down to up state. The device's private open
1305  *      function is invoked and then the multicast lists are loaded. Finally
1306  *      the device is moved into the up state and a %NETDEV_UP message is
1307  *      sent to the netdev notifier chain.
1308  *
1309  *      Calling this function on an active interface is a nop. On a failure
1310  *      a negative errno code is returned.
1311  */
1312 int dev_open(struct net_device *dev)
1313 {
1314         int ret;
1315
1316         if (dev->flags & IFF_UP)
1317                 return 0;
1318
1319         ret = __dev_open(dev);
1320         if (ret < 0)
1321                 return ret;
1322
1323         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1324         call_netdevice_notifiers(NETDEV_UP, dev);
1325
1326         return ret;
1327 }
1328 EXPORT_SYMBOL(dev_open);
1329
1330 static int __dev_close_many(struct list_head *head)
1331 {
1332         struct net_device *dev;
1333
1334         ASSERT_RTNL();
1335         might_sleep();
1336
1337         list_for_each_entry(dev, head, close_list) {
1338                 /* Temporarily disable netpoll until the interface is down */
1339                 netpoll_poll_disable(dev);
1340
1341                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1342
1343                 clear_bit(__LINK_STATE_START, &dev->state);
1344
1345                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1346                  * can be even on different cpu. So just clear netif_running().
1347                  *
1348                  * dev->stop() will invoke napi_disable() on all of it's
1349                  * napi_struct instances on this device.
1350                  */
1351                 smp_mb__after_atomic(); /* Commit netif_running(). */
1352         }
1353
1354         dev_deactivate_many(head);
1355
1356         list_for_each_entry(dev, head, close_list) {
1357                 const struct net_device_ops *ops = dev->netdev_ops;
1358
1359                 /*
1360                  *      Call the device specific close. This cannot fail.
1361                  *      Only if device is UP
1362                  *
1363                  *      We allow it to be called even after a DETACH hot-plug
1364                  *      event.
1365                  */
1366                 if (ops->ndo_stop)
1367                         ops->ndo_stop(dev);
1368
1369                 dev->flags &= ~IFF_UP;
1370                 netpoll_poll_enable(dev);
1371         }
1372
1373         return 0;
1374 }
1375
1376 static int __dev_close(struct net_device *dev)
1377 {
1378         int retval;
1379         LIST_HEAD(single);
1380
1381         list_add(&dev->close_list, &single);
1382         retval = __dev_close_many(&single);
1383         list_del(&single);
1384
1385         return retval;
1386 }
1387
1388 static int dev_close_many(struct list_head *head)
1389 {
1390         struct net_device *dev, *tmp;
1391
1392         /* Remove the devices that don't need to be closed */
1393         list_for_each_entry_safe(dev, tmp, head, close_list)
1394                 if (!(dev->flags & IFF_UP))
1395                         list_del_init(&dev->close_list);
1396
1397         __dev_close_many(head);
1398
1399         list_for_each_entry_safe(dev, tmp, head, close_list) {
1400                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1401                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1402                 list_del_init(&dev->close_list);
1403         }
1404
1405         return 0;
1406 }
1407
1408 /**
1409  *      dev_close - shutdown an interface.
1410  *      @dev: device to shutdown
1411  *
1412  *      This function moves an active device into down state. A
1413  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1414  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1415  *      chain.
1416  */
1417 int dev_close(struct net_device *dev)
1418 {
1419         if (dev->flags & IFF_UP) {
1420                 LIST_HEAD(single);
1421
1422                 list_add(&dev->close_list, &single);
1423                 dev_close_many(&single);
1424                 list_del(&single);
1425         }
1426         return 0;
1427 }
1428 EXPORT_SYMBOL(dev_close);
1429
1430
1431 /**
1432  *      dev_disable_lro - disable Large Receive Offload on a device
1433  *      @dev: device
1434  *
1435  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1436  *      called under RTNL.  This is needed if received packets may be
1437  *      forwarded to another interface.
1438  */
1439 void dev_disable_lro(struct net_device *dev)
1440 {
1441         struct net_device *lower_dev;
1442         struct list_head *iter;
1443
1444         dev->wanted_features &= ~NETIF_F_LRO;
1445         netdev_update_features(dev);
1446
1447         if (unlikely(dev->features & NETIF_F_LRO))
1448                 netdev_WARN(dev, "failed to disable LRO!\n");
1449
1450         netdev_for_each_lower_dev(dev, lower_dev, iter)
1451                 dev_disable_lro(lower_dev);
1452 }
1453 EXPORT_SYMBOL(dev_disable_lro);
1454
1455 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1456                                    struct net_device *dev)
1457 {
1458         struct netdev_notifier_info info;
1459
1460         netdev_notifier_info_init(&info, dev);
1461         return nb->notifier_call(nb, val, &info);
1462 }
1463
1464 static int dev_boot_phase = 1;
1465
1466 /**
1467  *      register_netdevice_notifier - register a network notifier block
1468  *      @nb: notifier
1469  *
1470  *      Register a notifier to be called when network device events occur.
1471  *      The notifier passed is linked into the kernel structures and must
1472  *      not be reused until it has been unregistered. A negative errno code
1473  *      is returned on a failure.
1474  *
1475  *      When registered all registration and up events are replayed
1476  *      to the new notifier to allow device to have a race free
1477  *      view of the network device list.
1478  */
1479
1480 int register_netdevice_notifier(struct notifier_block *nb)
1481 {
1482         struct net_device *dev;
1483         struct net_device *last;
1484         struct net *net;
1485         int err;
1486
1487         rtnl_lock();
1488         err = raw_notifier_chain_register(&netdev_chain, nb);
1489         if (err)
1490                 goto unlock;
1491         if (dev_boot_phase)
1492                 goto unlock;
1493         for_each_net(net) {
1494                 for_each_netdev(net, dev) {
1495                         err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1496                         err = notifier_to_errno(err);
1497                         if (err)
1498                                 goto rollback;
1499
1500                         if (!(dev->flags & IFF_UP))
1501                                 continue;
1502
1503                         call_netdevice_notifier(nb, NETDEV_UP, dev);
1504                 }
1505         }
1506
1507 unlock:
1508         rtnl_unlock();
1509         return err;
1510
1511 rollback:
1512         last = dev;
1513         for_each_net(net) {
1514                 for_each_netdev(net, dev) {
1515                         if (dev == last)
1516                                 goto outroll;
1517
1518                         if (dev->flags & IFF_UP) {
1519                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1520                                                         dev);
1521                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1522                         }
1523                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1524                 }
1525         }
1526
1527 outroll:
1528         raw_notifier_chain_unregister(&netdev_chain, nb);
1529         goto unlock;
1530 }
1531 EXPORT_SYMBOL(register_netdevice_notifier);
1532
1533 /**
1534  *      unregister_netdevice_notifier - unregister a network notifier block
1535  *      @nb: notifier
1536  *
1537  *      Unregister a notifier previously registered by
1538  *      register_netdevice_notifier(). The notifier is unlinked into the
1539  *      kernel structures and may then be reused. A negative errno code
1540  *      is returned on a failure.
1541  *
1542  *      After unregistering unregister and down device events are synthesized
1543  *      for all devices on the device list to the removed notifier to remove
1544  *      the need for special case cleanup code.
1545  */
1546
1547 int unregister_netdevice_notifier(struct notifier_block *nb)
1548 {
1549         struct net_device *dev;
1550         struct net *net;
1551         int err;
1552
1553         rtnl_lock();
1554         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1555         if (err)
1556                 goto unlock;
1557
1558         for_each_net(net) {
1559                 for_each_netdev(net, dev) {
1560                         if (dev->flags & IFF_UP) {
1561                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1562                                                         dev);
1563                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1564                         }
1565                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1566                 }
1567         }
1568 unlock:
1569         rtnl_unlock();
1570         return err;
1571 }
1572 EXPORT_SYMBOL(unregister_netdevice_notifier);
1573
1574 /**
1575  *      call_netdevice_notifiers_info - call all network notifier blocks
1576  *      @val: value passed unmodified to notifier function
1577  *      @dev: net_device pointer passed unmodified to notifier function
1578  *      @info: notifier information data
1579  *
1580  *      Call all network notifier blocks.  Parameters and return value
1581  *      are as for raw_notifier_call_chain().
1582  */
1583
1584 static int call_netdevice_notifiers_info(unsigned long val,
1585                                          struct net_device *dev,
1586                                          struct netdev_notifier_info *info)
1587 {
1588         ASSERT_RTNL();
1589         netdev_notifier_info_init(info, dev);
1590         return raw_notifier_call_chain(&netdev_chain, val, info);
1591 }
1592
1593 /**
1594  *      call_netdevice_notifiers - call all network notifier blocks
1595  *      @val: value passed unmodified to notifier function
1596  *      @dev: net_device pointer passed unmodified to notifier function
1597  *
1598  *      Call all network notifier blocks.  Parameters and return value
1599  *      are as for raw_notifier_call_chain().
1600  */
1601
1602 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1603 {
1604         struct netdev_notifier_info info;
1605
1606         return call_netdevice_notifiers_info(val, dev, &info);
1607 }
1608 EXPORT_SYMBOL(call_netdevice_notifiers);
1609
1610 static struct static_key netstamp_needed __read_mostly;
1611 #ifdef HAVE_JUMP_LABEL
1612 /* We are not allowed to call static_key_slow_dec() from irq context
1613  * If net_disable_timestamp() is called from irq context, defer the
1614  * static_key_slow_dec() calls.
1615  */
1616 static atomic_t netstamp_needed_deferred;
1617 #endif
1618
1619 void net_enable_timestamp(void)
1620 {
1621 #ifdef HAVE_JUMP_LABEL
1622         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1623
1624         if (deferred) {
1625                 while (--deferred)
1626                         static_key_slow_dec(&netstamp_needed);
1627                 return;
1628         }
1629 #endif
1630         static_key_slow_inc(&netstamp_needed);
1631 }
1632 EXPORT_SYMBOL(net_enable_timestamp);
1633
1634 void net_disable_timestamp(void)
1635 {
1636 #ifdef HAVE_JUMP_LABEL
1637         if (in_interrupt()) {
1638                 atomic_inc(&netstamp_needed_deferred);
1639                 return;
1640         }
1641 #endif
1642         static_key_slow_dec(&netstamp_needed);
1643 }
1644 EXPORT_SYMBOL(net_disable_timestamp);
1645
1646 static inline void net_timestamp_set(struct sk_buff *skb)
1647 {
1648         skb->tstamp.tv64 = 0;
1649         if (static_key_false(&netstamp_needed))
1650                 __net_timestamp(skb);
1651 }
1652
1653 #define net_timestamp_check(COND, SKB)                  \
1654         if (static_key_false(&netstamp_needed)) {               \
1655                 if ((COND) && !(SKB)->tstamp.tv64)      \
1656                         __net_timestamp(SKB);           \
1657         }                                               \
1658
1659 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1660 {
1661         unsigned int len;
1662
1663         if (!(dev->flags & IFF_UP))
1664                 return false;
1665
1666         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1667         if (skb->len <= len)
1668                 return true;
1669
1670         /* if TSO is enabled, we don't care about the length as the packet
1671          * could be forwarded without being segmented before
1672          */
1673         if (skb_is_gso(skb))
1674                 return true;
1675
1676         return false;
1677 }
1678 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1679
1680 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1681 {
1682         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1683                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1684                         atomic_long_inc(&dev->rx_dropped);
1685                         kfree_skb(skb);
1686                         return NET_RX_DROP;
1687                 }
1688         }
1689
1690         if (unlikely(!is_skb_forwardable(dev, skb))) {
1691                 atomic_long_inc(&dev->rx_dropped);
1692                 kfree_skb(skb);
1693                 return NET_RX_DROP;
1694         }
1695
1696         skb_scrub_packet(skb, true);
1697         skb->protocol = eth_type_trans(skb, dev);
1698         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1699
1700         return 0;
1701 }
1702 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1703
1704 /**
1705  * dev_forward_skb - loopback an skb to another netif
1706  *
1707  * @dev: destination network device
1708  * @skb: buffer to forward
1709  *
1710  * return values:
1711  *      NET_RX_SUCCESS  (no congestion)
1712  *      NET_RX_DROP     (packet was dropped, but freed)
1713  *
1714  * dev_forward_skb can be used for injecting an skb from the
1715  * start_xmit function of one device into the receive queue
1716  * of another device.
1717  *
1718  * The receiving device may be in another namespace, so
1719  * we have to clear all information in the skb that could
1720  * impact namespace isolation.
1721  */
1722 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1723 {
1724         return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1725 }
1726 EXPORT_SYMBOL_GPL(dev_forward_skb);
1727
1728 static inline int deliver_skb(struct sk_buff *skb,
1729                               struct packet_type *pt_prev,
1730                               struct net_device *orig_dev)
1731 {
1732         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1733                 return -ENOMEM;
1734         atomic_inc(&skb->users);
1735         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1736 }
1737
1738 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1739                                           struct packet_type **pt,
1740                                           struct net_device *dev, __be16 type,
1741                                           struct list_head *ptype_list)
1742 {
1743         struct packet_type *ptype, *pt_prev = *pt;
1744
1745         list_for_each_entry_rcu(ptype, ptype_list, list) {
1746                 if (ptype->type != type)
1747                         continue;
1748                 if (pt_prev)
1749                         deliver_skb(skb, pt_prev, dev);
1750                 pt_prev = ptype;
1751         }
1752         *pt = pt_prev;
1753 }
1754
1755 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1756 {
1757         if (!ptype->af_packet_priv || !skb->sk)
1758                 return false;
1759
1760         if (ptype->id_match)
1761                 return ptype->id_match(ptype, skb->sk);
1762         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1763                 return true;
1764
1765         return false;
1766 }
1767
1768 /*
1769  *      Support routine. Sends outgoing frames to any network
1770  *      taps currently in use.
1771  */
1772
1773 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1774 {
1775         struct packet_type *ptype;
1776         struct sk_buff *skb2 = NULL;
1777         struct packet_type *pt_prev = NULL;
1778         struct list_head *ptype_list = &ptype_all;
1779
1780         rcu_read_lock();
1781 again:
1782         list_for_each_entry_rcu(ptype, ptype_list, list) {
1783                 /* Never send packets back to the socket
1784                  * they originated from - MvS (miquels@drinkel.ow.org)
1785                  */
1786                 if (skb_loop_sk(ptype, skb))
1787                         continue;
1788
1789                 if (pt_prev) {
1790                         deliver_skb(skb2, pt_prev, skb->dev);
1791                         pt_prev = ptype;
1792                         continue;
1793                 }
1794
1795                 /* need to clone skb, done only once */
1796                 skb2 = skb_clone(skb, GFP_ATOMIC);
1797                 if (!skb2)
1798                         goto out_unlock;
1799
1800                 net_timestamp_set(skb2);
1801
1802                 /* skb->nh should be correctly
1803                  * set by sender, so that the second statement is
1804                  * just protection against buggy protocols.
1805                  */
1806                 skb_reset_mac_header(skb2);
1807
1808                 if (skb_network_header(skb2) < skb2->data ||
1809                     skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1810                         net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1811                                              ntohs(skb2->protocol),
1812                                              dev->name);
1813                         skb_reset_network_header(skb2);
1814                 }
1815
1816                 skb2->transport_header = skb2->network_header;
1817                 skb2->pkt_type = PACKET_OUTGOING;
1818                 pt_prev = ptype;
1819         }
1820
1821         if (ptype_list == &ptype_all) {
1822                 ptype_list = &dev->ptype_all;
1823                 goto again;
1824         }
1825 out_unlock:
1826         if (pt_prev)
1827                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1828         rcu_read_unlock();
1829 }
1830
1831 /**
1832  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1833  * @dev: Network device
1834  * @txq: number of queues available
1835  *
1836  * If real_num_tx_queues is changed the tc mappings may no longer be
1837  * valid. To resolve this verify the tc mapping remains valid and if
1838  * not NULL the mapping. With no priorities mapping to this
1839  * offset/count pair it will no longer be used. In the worst case TC0
1840  * is invalid nothing can be done so disable priority mappings. If is
1841  * expected that drivers will fix this mapping if they can before
1842  * calling netif_set_real_num_tx_queues.
1843  */
1844 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1845 {
1846         int i;
1847         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1848
1849         /* If TC0 is invalidated disable TC mapping */
1850         if (tc->offset + tc->count > txq) {
1851                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1852                 dev->num_tc = 0;
1853                 return;
1854         }
1855
1856         /* Invalidated prio to tc mappings set to TC0 */
1857         for (i = 1; i < TC_BITMASK + 1; i++) {
1858                 int q = netdev_get_prio_tc_map(dev, i);
1859
1860                 tc = &dev->tc_to_txq[q];
1861                 if (tc->offset + tc->count > txq) {
1862                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1863                                 i, q);
1864                         netdev_set_prio_tc_map(dev, i, 0);
1865                 }
1866         }
1867 }
1868
1869 #ifdef CONFIG_XPS
1870 static DEFINE_MUTEX(xps_map_mutex);
1871 #define xmap_dereference(P)             \
1872         rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1873
1874 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1875                                         int cpu, u16 index)
1876 {
1877         struct xps_map *map = NULL;
1878         int pos;
1879
1880         if (dev_maps)
1881                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1882
1883         for (pos = 0; map && pos < map->len; pos++) {
1884                 if (map->queues[pos] == index) {
1885                         if (map->len > 1) {
1886                                 map->queues[pos] = map->queues[--map->len];
1887                         } else {
1888                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1889                                 kfree_rcu(map, rcu);
1890                                 map = NULL;
1891                         }
1892                         break;
1893                 }
1894         }
1895
1896         return map;
1897 }
1898
1899 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1900 {
1901         struct xps_dev_maps *dev_maps;
1902         int cpu, i;
1903         bool active = false;
1904
1905         mutex_lock(&xps_map_mutex);
1906         dev_maps = xmap_dereference(dev->xps_maps);
1907
1908         if (!dev_maps)
1909                 goto out_no_maps;
1910
1911         for_each_possible_cpu(cpu) {
1912                 for (i = index; i < dev->num_tx_queues; i++) {
1913                         if (!remove_xps_queue(dev_maps, cpu, i))
1914                                 break;
1915                 }
1916                 if (i == dev->num_tx_queues)
1917                         active = true;
1918         }
1919
1920         if (!active) {
1921                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1922                 kfree_rcu(dev_maps, rcu);
1923         }
1924
1925         for (i = index; i < dev->num_tx_queues; i++)
1926                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1927                                              NUMA_NO_NODE);
1928
1929 out_no_maps:
1930         mutex_unlock(&xps_map_mutex);
1931 }
1932
1933 static struct xps_map *expand_xps_map(struct xps_map *map,
1934                                       int cpu, u16 index)
1935 {
1936         struct xps_map *new_map;
1937         int alloc_len = XPS_MIN_MAP_ALLOC;
1938         int i, pos;
1939
1940         for (pos = 0; map && pos < map->len; pos++) {
1941                 if (map->queues[pos] != index)
1942                         continue;
1943                 return map;
1944         }
1945
1946         /* Need to add queue to this CPU's existing map */
1947         if (map) {
1948                 if (pos < map->alloc_len)
1949                         return map;
1950
1951                 alloc_len = map->alloc_len * 2;
1952         }
1953
1954         /* Need to allocate new map to store queue on this CPU's map */
1955         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1956                                cpu_to_node(cpu));
1957         if (!new_map)
1958                 return NULL;
1959
1960         for (i = 0; i < pos; i++)
1961                 new_map->queues[i] = map->queues[i];
1962         new_map->alloc_len = alloc_len;
1963         new_map->len = pos;
1964
1965         return new_map;
1966 }
1967
1968 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
1969                         u16 index)
1970 {
1971         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1972         struct xps_map *map, *new_map;
1973         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1974         int cpu, numa_node_id = -2;
1975         bool active = false;
1976
1977         mutex_lock(&xps_map_mutex);
1978
1979         dev_maps = xmap_dereference(dev->xps_maps);
1980
1981         /* allocate memory for queue storage */
1982         for_each_online_cpu(cpu) {
1983                 if (!cpumask_test_cpu(cpu, mask))
1984                         continue;
1985
1986                 if (!new_dev_maps)
1987                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1988                 if (!new_dev_maps) {
1989                         mutex_unlock(&xps_map_mutex);
1990                         return -ENOMEM;
1991                 }
1992
1993                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1994                                  NULL;
1995
1996                 map = expand_xps_map(map, cpu, index);
1997                 if (!map)
1998                         goto error;
1999
2000                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2001         }
2002
2003         if (!new_dev_maps)
2004                 goto out_no_new_maps;
2005
2006         for_each_possible_cpu(cpu) {
2007                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2008                         /* add queue to CPU maps */
2009                         int pos = 0;
2010
2011                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2012                         while ((pos < map->len) && (map->queues[pos] != index))
2013                                 pos++;
2014
2015                         if (pos == map->len)
2016                                 map->queues[map->len++] = index;
2017 #ifdef CONFIG_NUMA
2018                         if (numa_node_id == -2)
2019                                 numa_node_id = cpu_to_node(cpu);
2020                         else if (numa_node_id != cpu_to_node(cpu))
2021                                 numa_node_id = -1;
2022 #endif
2023                 } else if (dev_maps) {
2024                         /* fill in the new device map from the old device map */
2025                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2026                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2027                 }
2028
2029         }
2030
2031         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2032
2033         /* Cleanup old maps */
2034         if (dev_maps) {
2035                 for_each_possible_cpu(cpu) {
2036                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2037                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2038                         if (map && map != new_map)
2039                                 kfree_rcu(map, rcu);
2040                 }
2041
2042                 kfree_rcu(dev_maps, rcu);
2043         }
2044
2045         dev_maps = new_dev_maps;
2046         active = true;
2047
2048 out_no_new_maps:
2049         /* update Tx queue numa node */
2050         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2051                                      (numa_node_id >= 0) ? numa_node_id :
2052                                      NUMA_NO_NODE);
2053
2054         if (!dev_maps)
2055                 goto out_no_maps;
2056
2057         /* removes queue from unused CPUs */
2058         for_each_possible_cpu(cpu) {
2059                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2060                         continue;
2061
2062                 if (remove_xps_queue(dev_maps, cpu, index))
2063                         active = true;
2064         }
2065
2066         /* free map if not active */
2067         if (!active) {
2068                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2069                 kfree_rcu(dev_maps, rcu);
2070         }
2071
2072 out_no_maps:
2073         mutex_unlock(&xps_map_mutex);
2074
2075         return 0;
2076 error:
2077         /* remove any maps that we added */
2078         for_each_possible_cpu(cpu) {
2079                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2080                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2081                                  NULL;
2082                 if (new_map && new_map != map)
2083                         kfree(new_map);
2084         }
2085
2086         mutex_unlock(&xps_map_mutex);
2087
2088         kfree(new_dev_maps);
2089         return -ENOMEM;
2090 }
2091 EXPORT_SYMBOL(netif_set_xps_queue);
2092
2093 #endif
2094 /*
2095  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2096  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2097  */
2098 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2099 {
2100         int rc;
2101
2102         if (txq < 1 || txq > dev->num_tx_queues)
2103                 return -EINVAL;
2104
2105         if (dev->reg_state == NETREG_REGISTERED ||
2106             dev->reg_state == NETREG_UNREGISTERING) {
2107                 ASSERT_RTNL();
2108
2109                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2110                                                   txq);
2111                 if (rc)
2112                         return rc;
2113
2114                 if (dev->num_tc)
2115                         netif_setup_tc(dev, txq);
2116
2117                 if (txq < dev->real_num_tx_queues) {
2118                         qdisc_reset_all_tx_gt(dev, txq);
2119 #ifdef CONFIG_XPS
2120                         netif_reset_xps_queues_gt(dev, txq);
2121 #endif
2122                 }
2123         }
2124
2125         dev->real_num_tx_queues = txq;
2126         return 0;
2127 }
2128 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2129
2130 #ifdef CONFIG_SYSFS
2131 /**
2132  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2133  *      @dev: Network device
2134  *      @rxq: Actual number of RX queues
2135  *
2136  *      This must be called either with the rtnl_lock held or before
2137  *      registration of the net device.  Returns 0 on success, or a
2138  *      negative error code.  If called before registration, it always
2139  *      succeeds.
2140  */
2141 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2142 {
2143         int rc;
2144
2145         if (rxq < 1 || rxq > dev->num_rx_queues)
2146                 return -EINVAL;
2147
2148         if (dev->reg_state == NETREG_REGISTERED) {
2149                 ASSERT_RTNL();
2150
2151                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2152                                                   rxq);
2153                 if (rc)
2154                         return rc;
2155         }
2156
2157         dev->real_num_rx_queues = rxq;
2158         return 0;
2159 }
2160 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2161 #endif
2162
2163 /**
2164  * netif_get_num_default_rss_queues - default number of RSS queues
2165  *
2166  * This routine should set an upper limit on the number of RSS queues
2167  * used by default by multiqueue devices.
2168  */
2169 int netif_get_num_default_rss_queues(void)
2170 {
2171         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2172 }
2173 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2174
2175 static inline void __netif_reschedule(struct Qdisc *q)
2176 {
2177         struct softnet_data *sd;
2178         unsigned long flags;
2179
2180         local_irq_save(flags);
2181         sd = this_cpu_ptr(&softnet_data);
2182         q->next_sched = NULL;
2183         *sd->output_queue_tailp = q;
2184         sd->output_queue_tailp = &q->next_sched;
2185         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2186         local_irq_restore(flags);
2187 }
2188
2189 void __netif_schedule(struct Qdisc *q)
2190 {
2191         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2192                 __netif_reschedule(q);
2193 }
2194 EXPORT_SYMBOL(__netif_schedule);
2195
2196 struct dev_kfree_skb_cb {
2197         enum skb_free_reason reason;
2198 };
2199
2200 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2201 {
2202         return (struct dev_kfree_skb_cb *)skb->cb;
2203 }
2204
2205 void netif_schedule_queue(struct netdev_queue *txq)
2206 {
2207         rcu_read_lock();
2208         if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2209                 struct Qdisc *q = rcu_dereference(txq->qdisc);
2210
2211                 __netif_schedule(q);
2212         }
2213         rcu_read_unlock();
2214 }
2215 EXPORT_SYMBOL(netif_schedule_queue);
2216
2217 /**
2218  *      netif_wake_subqueue - allow sending packets on subqueue
2219  *      @dev: network device
2220  *      @queue_index: sub queue index
2221  *
2222  * Resume individual transmit queue of a device with multiple transmit queues.
2223  */
2224 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2225 {
2226         struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2227
2228         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2229                 struct Qdisc *q;
2230
2231                 rcu_read_lock();
2232                 q = rcu_dereference(txq->qdisc);
2233                 __netif_schedule(q);
2234                 rcu_read_unlock();
2235         }
2236 }
2237 EXPORT_SYMBOL(netif_wake_subqueue);
2238
2239 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2240 {
2241         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2242                 struct Qdisc *q;
2243
2244                 rcu_read_lock();
2245                 q = rcu_dereference(dev_queue->qdisc);
2246                 __netif_schedule(q);
2247                 rcu_read_unlock();
2248         }
2249 }
2250 EXPORT_SYMBOL(netif_tx_wake_queue);
2251
2252 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2253 {
2254         unsigned long flags;
2255
2256         if (likely(atomic_read(&skb->users) == 1)) {
2257                 smp_rmb();
2258                 atomic_set(&skb->users, 0);
2259         } else if (likely(!atomic_dec_and_test(&skb->users))) {
2260                 return;
2261         }
2262         get_kfree_skb_cb(skb)->reason = reason;
2263         local_irq_save(flags);
2264         skb->next = __this_cpu_read(softnet_data.completion_queue);
2265         __this_cpu_write(softnet_data.completion_queue, skb);
2266         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2267         local_irq_restore(flags);
2268 }
2269 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2270
2271 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2272 {
2273         if (in_irq() || irqs_disabled())
2274                 __dev_kfree_skb_irq(skb, reason);
2275         else
2276                 dev_kfree_skb(skb);
2277 }
2278 EXPORT_SYMBOL(__dev_kfree_skb_any);
2279
2280
2281 /**
2282  * netif_device_detach - mark device as removed
2283  * @dev: network device
2284  *
2285  * Mark device as removed from system and therefore no longer available.
2286  */
2287 void netif_device_detach(struct net_device *dev)
2288 {
2289         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2290             netif_running(dev)) {
2291                 netif_tx_stop_all_queues(dev);
2292         }
2293 }
2294 EXPORT_SYMBOL(netif_device_detach);
2295
2296 /**
2297  * netif_device_attach - mark device as attached
2298  * @dev: network device
2299  *
2300  * Mark device as attached from system and restart if needed.
2301  */
2302 void netif_device_attach(struct net_device *dev)
2303 {
2304         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2305             netif_running(dev)) {
2306                 netif_tx_wake_all_queues(dev);
2307                 __netdev_watchdog_up(dev);
2308         }
2309 }
2310 EXPORT_SYMBOL(netif_device_attach);
2311
2312 static void skb_warn_bad_offload(const struct sk_buff *skb)
2313 {
2314         static const netdev_features_t null_features = 0;
2315         struct net_device *dev = skb->dev;
2316         const char *driver = "";
2317
2318         if (!net_ratelimit())
2319                 return;
2320
2321         if (dev && dev->dev.parent)
2322                 driver = dev_driver_string(dev->dev.parent);
2323
2324         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2325              "gso_type=%d ip_summed=%d\n",
2326              driver, dev ? &dev->features : &null_features,
2327              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2328              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2329              skb_shinfo(skb)->gso_type, skb->ip_summed);
2330 }
2331
2332 /*
2333  * Invalidate hardware checksum when packet is to be mangled, and
2334  * complete checksum manually on outgoing path.
2335  */
2336 int skb_checksum_help(struct sk_buff *skb)
2337 {
2338         __wsum csum;
2339         int ret = 0, offset;
2340
2341         if (skb->ip_summed == CHECKSUM_COMPLETE)
2342                 goto out_set_summed;
2343
2344         if (unlikely(skb_shinfo(skb)->gso_size)) {
2345                 skb_warn_bad_offload(skb);
2346                 return -EINVAL;
2347         }
2348
2349         /* Before computing a checksum, we should make sure no frag could
2350          * be modified by an external entity : checksum could be wrong.
2351          */
2352         if (skb_has_shared_frag(skb)) {
2353                 ret = __skb_linearize(skb);
2354                 if (ret)
2355                         goto out;
2356         }
2357
2358         offset = skb_checksum_start_offset(skb);
2359         BUG_ON(offset >= skb_headlen(skb));
2360         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2361
2362         offset += skb->csum_offset;
2363         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2364
2365         if (skb_cloned(skb) &&
2366             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2367                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2368                 if (ret)
2369                         goto out;
2370         }
2371
2372         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2373 out_set_summed:
2374         skb->ip_summed = CHECKSUM_NONE;
2375 out:
2376         return ret;
2377 }
2378 EXPORT_SYMBOL(skb_checksum_help);
2379
2380 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2381 {
2382         unsigned int vlan_depth = skb->mac_len;
2383         __be16 type = skb->protocol;
2384
2385         /* Tunnel gso handlers can set protocol to ethernet. */
2386         if (type == htons(ETH_P_TEB)) {
2387                 struct ethhdr *eth;
2388
2389                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2390                         return 0;
2391
2392                 eth = (struct ethhdr *)skb_mac_header(skb);
2393                 type = eth->h_proto;
2394         }
2395
2396         /* if skb->protocol is 802.1Q/AD then the header should already be
2397          * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
2398          * ETH_HLEN otherwise
2399          */
2400         if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2401                 if (vlan_depth) {
2402                         if (WARN_ON(vlan_depth < VLAN_HLEN))
2403                                 return 0;
2404                         vlan_depth -= VLAN_HLEN;
2405                 } else {
2406                         vlan_depth = ETH_HLEN;
2407                 }
2408                 do {
2409                         struct vlan_hdr *vh;
2410
2411                         if (unlikely(!pskb_may_pull(skb,
2412                                                     vlan_depth + VLAN_HLEN)))
2413                                 return 0;
2414
2415                         vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2416                         type = vh->h_vlan_encapsulated_proto;
2417                         vlan_depth += VLAN_HLEN;
2418                 } while (type == htons(ETH_P_8021Q) ||
2419                          type == htons(ETH_P_8021AD));
2420         }
2421
2422         *depth = vlan_depth;
2423
2424         return type;
2425 }
2426
2427 /**
2428  *      skb_mac_gso_segment - mac layer segmentation handler.
2429  *      @skb: buffer to segment
2430  *      @features: features for the output path (see dev->features)
2431  */
2432 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2433                                     netdev_features_t features)
2434 {
2435         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2436         struct packet_offload *ptype;
2437         int vlan_depth = skb->mac_len;
2438         __be16 type = skb_network_protocol(skb, &vlan_depth);
2439
2440         if (unlikely(!type))
2441                 return ERR_PTR(-EINVAL);
2442
2443         __skb_pull(skb, vlan_depth);
2444
2445         rcu_read_lock();
2446         list_for_each_entry_rcu(ptype, &offload_base, list) {
2447                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2448                         segs = ptype->callbacks.gso_segment(skb, features);
2449                         break;
2450                 }
2451         }
2452         rcu_read_unlock();
2453
2454         __skb_push(skb, skb->data - skb_mac_header(skb));
2455
2456         return segs;
2457 }
2458 EXPORT_SYMBOL(skb_mac_gso_segment);
2459
2460
2461 /* openvswitch calls this on rx path, so we need a different check.
2462  */
2463 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2464 {
2465         if (tx_path)
2466                 return skb->ip_summed != CHECKSUM_PARTIAL;
2467         else
2468                 return skb->ip_summed == CHECKSUM_NONE;
2469 }
2470
2471 /**
2472  *      __skb_gso_segment - Perform segmentation on skb.
2473  *      @skb: buffer to segment
2474  *      @features: features for the output path (see dev->features)
2475  *      @tx_path: whether it is called in TX path
2476  *
2477  *      This function segments the given skb and returns a list of segments.
2478  *
2479  *      It may return NULL if the skb requires no segmentation.  This is
2480  *      only possible when GSO is used for verifying header integrity.
2481  */
2482 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2483                                   netdev_features_t features, bool tx_path)
2484 {
2485         if (unlikely(skb_needs_check(skb, tx_path))) {
2486                 int err;
2487
2488                 skb_warn_bad_offload(skb);
2489
2490                 err = skb_cow_head(skb, 0);
2491                 if (err < 0)
2492                         return ERR_PTR(err);
2493         }
2494
2495         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2496         SKB_GSO_CB(skb)->encap_level = 0;
2497
2498         skb_reset_mac_header(skb);
2499         skb_reset_mac_len(skb);
2500
2501         return skb_mac_gso_segment(skb, features);
2502 }
2503 EXPORT_SYMBOL(__skb_gso_segment);
2504
2505 /* Take action when hardware reception checksum errors are detected. */
2506 #ifdef CONFIG_BUG
2507 void netdev_rx_csum_fault(struct net_device *dev)
2508 {
2509         if (net_ratelimit()) {
2510                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2511                 dump_stack();
2512         }
2513 }
2514 EXPORT_SYMBOL(netdev_rx_csum_fault);
2515 #endif
2516
2517 /* Actually, we should eliminate this check as soon as we know, that:
2518  * 1. IOMMU is present and allows to map all the memory.
2519  * 2. No high memory really exists on this machine.
2520  */
2521
2522 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2523 {
2524 #ifdef CONFIG_HIGHMEM
2525         int i;
2526         if (!(dev->features & NETIF_F_HIGHDMA)) {
2527                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2528                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2529                         if (PageHighMem(skb_frag_page(frag)))
2530                                 return 1;
2531                 }
2532         }
2533
2534         if (PCI_DMA_BUS_IS_PHYS) {
2535                 struct device *pdev = dev->dev.parent;
2536
2537                 if (!pdev)
2538                         return 0;
2539                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2540                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2541                         dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2542                         if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2543                                 return 1;
2544                 }
2545         }
2546 #endif
2547         return 0;
2548 }
2549
2550 /* If MPLS offload request, verify we are testing hardware MPLS features
2551  * instead of standard features for the netdev.
2552  */
2553 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2554 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2555                                            netdev_features_t features,
2556                                            __be16 type)
2557 {
2558         if (eth_p_mpls(type))
2559                 features &= skb->dev->mpls_features;
2560
2561         return features;
2562 }
2563 #else
2564 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2565                                            netdev_features_t features,
2566                                            __be16 type)
2567 {
2568         return features;
2569 }
2570 #endif
2571
2572 static netdev_features_t harmonize_features(struct sk_buff *skb,
2573         netdev_features_t features)
2574 {
2575         int tmp;
2576         __be16 type;
2577
2578         type = skb_network_protocol(skb, &tmp);
2579         features = net_mpls_features(skb, features, type);
2580
2581         if (skb->ip_summed != CHECKSUM_NONE &&
2582             !can_checksum_protocol(features, type)) {
2583                 features &= ~NETIF_F_ALL_CSUM;
2584         } else if (illegal_highdma(skb->dev, skb)) {
2585                 features &= ~NETIF_F_SG;
2586         }
2587
2588         return features;
2589 }
2590
2591 netdev_features_t netif_skb_features(struct sk_buff *skb)
2592 {
2593         struct net_device *dev = skb->dev;
2594         netdev_features_t features = dev->features;
2595         u16 gso_segs = skb_shinfo(skb)->gso_segs;
2596         __be16 protocol = skb->protocol;
2597
2598         if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
2599                 features &= ~NETIF_F_GSO_MASK;
2600
2601         /* If encapsulation offload request, verify we are testing
2602          * hardware encapsulation features instead of standard
2603          * features for the netdev
2604          */
2605         if (skb->encapsulation)
2606                 features &= dev->hw_enc_features;
2607
2608         if (!skb_vlan_tag_present(skb)) {
2609                 if (unlikely(protocol == htons(ETH_P_8021Q) ||
2610                              protocol == htons(ETH_P_8021AD))) {
2611                         struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2612                         protocol = veh->h_vlan_encapsulated_proto;
2613                 } else {
2614                         goto finalize;
2615                 }
2616         }
2617
2618         features = netdev_intersect_features(features,
2619                                              dev->vlan_features |
2620                                              NETIF_F_HW_VLAN_CTAG_TX |
2621                                              NETIF_F_HW_VLAN_STAG_TX);
2622
2623         if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
2624                 features = netdev_intersect_features(features,
2625                                                      NETIF_F_SG |
2626                                                      NETIF_F_HIGHDMA |
2627                                                      NETIF_F_FRAGLIST |
2628                                                      NETIF_F_GEN_CSUM |
2629                                                      NETIF_F_HW_VLAN_CTAG_TX |
2630                                                      NETIF_F_HW_VLAN_STAG_TX);
2631
2632 finalize:
2633         if (dev->netdev_ops->ndo_features_check)
2634                 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2635                                                                 features);
2636
2637         return harmonize_features(skb, features);
2638 }
2639 EXPORT_SYMBOL(netif_skb_features);
2640
2641 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2642                     struct netdev_queue *txq, bool more)
2643 {
2644         unsigned int len;
2645         int rc;
2646
2647         if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2648                 dev_queue_xmit_nit(skb, dev);
2649
2650         len = skb->len;
2651         trace_net_dev_start_xmit(skb, dev);
2652         rc = netdev_start_xmit(skb, dev, txq, more);
2653         trace_net_dev_xmit(skb, rc, dev, len);
2654
2655         return rc;
2656 }
2657
2658 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2659                                     struct netdev_queue *txq, int *ret)
2660 {
2661         struct sk_buff *skb = first;
2662         int rc = NETDEV_TX_OK;
2663
2664         while (skb) {
2665                 struct sk_buff *next = skb->next;
2666
2667                 skb->next = NULL;
2668                 rc = xmit_one(skb, dev, txq, next != NULL);
2669                 if (unlikely(!dev_xmit_complete(rc))) {
2670                         skb->next = next;
2671                         goto out;
2672                 }
2673
2674                 skb = next;
2675                 if (netif_xmit_stopped(txq) && skb) {
2676                         rc = NETDEV_TX_BUSY;
2677                         break;
2678                 }
2679         }
2680
2681 out:
2682         *ret = rc;
2683         return skb;
2684 }
2685
2686 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2687                                           netdev_features_t features)
2688 {
2689         if (skb_vlan_tag_present(skb) &&
2690             !vlan_hw_offload_capable(features, skb->vlan_proto))
2691                 skb = __vlan_hwaccel_push_inside(skb);
2692         return skb;
2693 }
2694
2695 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2696 {
2697         netdev_features_t features;
2698
2699         if (skb->next)
2700                 return skb;
2701
2702         features = netif_skb_features(skb);
2703         skb = validate_xmit_vlan(skb, features);
2704         if (unlikely(!skb))
2705                 goto out_null;
2706
2707         if (netif_needs_gso(dev, skb, features)) {
2708                 struct sk_buff *segs;
2709
2710                 segs = skb_gso_segment(skb, features);
2711                 if (IS_ERR(segs)) {
2712                         goto out_kfree_skb;
2713                 } else if (segs) {
2714                         consume_skb(skb);
2715                         skb = segs;
2716                 }
2717         } else {
2718                 if (skb_needs_linearize(skb, features) &&
2719                     __skb_linearize(skb))
2720                         goto out_kfree_skb;
2721
2722                 /* If packet is not checksummed and device does not
2723                  * support checksumming for this protocol, complete
2724                  * checksumming here.
2725                  */
2726                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2727                         if (skb->encapsulation)
2728                                 skb_set_inner_transport_header(skb,
2729                                                                skb_checksum_start_offset(skb));
2730                         else
2731                                 skb_set_transport_header(skb,
2732                                                          skb_checksum_start_offset(skb));
2733                         if (!(features & NETIF_F_ALL_CSUM) &&
2734                             skb_checksum_help(skb))
2735                                 goto out_kfree_skb;
2736                 }
2737         }
2738
2739         return skb;
2740
2741 out_kfree_skb:
2742         kfree_skb(skb);
2743 out_null:
2744         return NULL;
2745 }
2746
2747 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2748 {
2749         struct sk_buff *next, *head = NULL, *tail;
2750
2751         for (; skb != NULL; skb = next) {
2752                 next = skb->next;
2753                 skb->next = NULL;
2754
2755                 /* in case skb wont be segmented, point to itself */
2756                 skb->prev = skb;
2757
2758                 skb = validate_xmit_skb(skb, dev);
2759                 if (!skb)
2760                         continue;
2761
2762                 if (!head)
2763                         head = skb;
2764                 else
2765                         tail->next = skb;
2766                 /* If skb was segmented, skb->prev points to
2767                  * the last segment. If not, it still contains skb.
2768                  */
2769                 tail = skb->prev;
2770         }
2771         return head;
2772 }
2773
2774 static void qdisc_pkt_len_init(struct sk_buff *skb)
2775 {
2776         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2777
2778         qdisc_skb_cb(skb)->pkt_len = skb->len;
2779
2780         /* To get more precise estimation of bytes sent on wire,
2781          * we add to pkt_len the headers size of all segments
2782          */
2783         if (shinfo->gso_size)  {
2784                 unsigned int hdr_len;
2785                 u16 gso_segs = shinfo->gso_segs;
2786
2787                 /* mac layer + network layer */
2788                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2789
2790                 /* + transport layer */
2791                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2792                         hdr_len += tcp_hdrlen(skb);
2793                 else
2794                         hdr_len += sizeof(struct udphdr);
2795
2796                 if (shinfo->gso_type & SKB_GSO_DODGY)
2797                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2798                                                 shinfo->gso_size);
2799
2800                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2801         }
2802 }
2803
2804 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2805                                  struct net_device *dev,
2806                                  struct netdev_queue *txq)
2807 {
2808         spinlock_t *root_lock = qdisc_lock(q);
2809         bool contended;
2810         int rc;
2811
2812         qdisc_pkt_len_init(skb);
2813         qdisc_calculate_pkt_len(skb, q);
2814         /*
2815          * Heuristic to force contended enqueues to serialize on a
2816          * separate lock before trying to get qdisc main lock.
2817          * This permits __QDISC___STATE_RUNNING owner to get the lock more
2818          * often and dequeue packets faster.
2819          */
2820         contended = qdisc_is_running(q);
2821         if (unlikely(contended))
2822                 spin_lock(&q->busylock);
2823
2824         spin_lock(root_lock);
2825         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2826                 kfree_skb(skb);
2827                 rc = NET_XMIT_DROP;
2828         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2829                    qdisc_run_begin(q)) {
2830                 /*
2831                  * This is a work-conserving queue; there are no old skbs
2832                  * waiting to be sent out; and the qdisc is not running -
2833                  * xmit the skb directly.
2834                  */
2835
2836                 qdisc_bstats_update(q, skb);
2837
2838                 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
2839                         if (unlikely(contended)) {
2840                                 spin_unlock(&q->busylock);
2841                                 contended = false;
2842                         }
2843                         __qdisc_run(q);
2844                 } else
2845                         qdisc_run_end(q);
2846
2847                 rc = NET_XMIT_SUCCESS;
2848         } else {
2849                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2850                 if (qdisc_run_begin(q)) {
2851                         if (unlikely(contended)) {
2852                                 spin_unlock(&q->busylock);
2853                                 contended = false;
2854                         }
2855                         __qdisc_run(q);
2856                 }
2857         }
2858         spin_unlock(root_lock);
2859         if (unlikely(contended))
2860                 spin_unlock(&q->busylock);
2861         return rc;
2862 }
2863
2864 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2865 static void skb_update_prio(struct sk_buff *skb)
2866 {
2867         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2868
2869         if (!skb->priority && skb->sk && map) {
2870                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2871
2872                 if (prioidx < map->priomap_len)
2873                         skb->priority = map->priomap[prioidx];
2874         }
2875 }
2876 #else
2877 #define skb_update_prio(skb)
2878 #endif
2879
2880 static DEFINE_PER_CPU(int, xmit_recursion);
2881 #define RECURSION_LIMIT 10
2882
2883 /**
2884  *      dev_loopback_xmit - loop back @skb
2885  *      @skb: buffer to transmit
2886  */
2887 int dev_loopback_xmit(struct sk_buff *skb)
2888 {
2889         skb_reset_mac_header(skb);
2890         __skb_pull(skb, skb_network_offset(skb));
2891         skb->pkt_type = PACKET_LOOPBACK;
2892         skb->ip_summed = CHECKSUM_UNNECESSARY;
2893         WARN_ON(!skb_dst(skb));
2894         skb_dst_force(skb);
2895         netif_rx_ni(skb);
2896         return 0;
2897 }
2898 EXPORT_SYMBOL(dev_loopback_xmit);
2899
2900 /**
2901  *      __dev_queue_xmit - transmit a buffer
2902  *      @skb: buffer to transmit
2903  *      @accel_priv: private data used for L2 forwarding offload
2904  *
2905  *      Queue a buffer for transmission to a network device. The caller must
2906  *      have set the device and priority and built the buffer before calling
2907  *      this function. The function can be called from an interrupt.
2908  *
2909  *      A negative errno code is returned on a failure. A success does not
2910  *      guarantee the frame will be transmitted as it may be dropped due
2911  *      to congestion or traffic shaping.
2912  *
2913  * -----------------------------------------------------------------------------------
2914  *      I notice this method can also return errors from the queue disciplines,
2915  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2916  *      be positive.
2917  *
2918  *      Regardless of the return value, the skb is consumed, so it is currently
2919  *      difficult to retry a send to this method.  (You can bump the ref count
2920  *      before sending to hold a reference for retry if you are careful.)
2921  *
2922  *      When calling this method, interrupts MUST be enabled.  This is because
2923  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2924  *          --BLG
2925  */
2926 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
2927 {
2928         struct net_device *dev = skb->dev;
2929         struct netdev_queue *txq;
2930         struct Qdisc *q;
2931         int rc = -ENOMEM;
2932
2933         skb_reset_mac_header(skb);
2934
2935         if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
2936                 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
2937
2938         /* Disable soft irqs for various locks below. Also
2939          * stops preemption for RCU.
2940          */
2941         rcu_read_lock_bh();
2942
2943         skb_update_prio(skb);
2944
2945         /* If device/qdisc don't need skb->dst, release it right now while
2946          * its hot in this cpu cache.
2947          */
2948         if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2949                 skb_dst_drop(skb);
2950         else
2951                 skb_dst_force(skb);
2952
2953         txq = netdev_pick_tx(dev, skb, accel_priv);
2954         q = rcu_dereference_bh(txq->qdisc);
2955
2956 #ifdef CONFIG_NET_CLS_ACT
2957         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2958 #endif
2959         trace_net_dev_queue(skb);
2960         if (q->enqueue) {
2961                 rc = __dev_xmit_skb(skb, q, dev, txq);
2962                 goto out;
2963         }
2964
2965         /* The device has no queue. Common case for software devices:
2966            loopback, all the sorts of tunnels...
2967
2968            Really, it is unlikely that netif_tx_lock protection is necessary
2969            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2970            counters.)
2971            However, it is possible, that they rely on protection
2972            made by us here.
2973
2974            Check this and shot the lock. It is not prone from deadlocks.
2975            Either shot noqueue qdisc, it is even simpler 8)
2976          */
2977         if (dev->flags & IFF_UP) {
2978                 int cpu = smp_processor_id(); /* ok because BHs are off */
2979
2980                 if (txq->xmit_lock_owner != cpu) {
2981
2982                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2983                                 goto recursion_alert;
2984
2985                         skb = validate_xmit_skb(skb, dev);
2986                         if (!skb)
2987                                 goto drop;
2988
2989                         HARD_TX_LOCK(dev, txq, cpu);
2990
2991                         if (!netif_xmit_stopped(txq)) {
2992                                 __this_cpu_inc(xmit_recursion);
2993                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
2994                                 __this_cpu_dec(xmit_recursion);
2995                                 if (dev_xmit_complete(rc)) {
2996                                         HARD_TX_UNLOCK(dev, txq);
2997                                         goto out;
2998                                 }
2999                         }
3000                         HARD_TX_UNLOCK(dev, txq);
3001                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3002                                              dev->name);
3003                 } else {
3004                         /* Recursion is detected! It is possible,
3005                          * unfortunately
3006                          */
3007 recursion_alert:
3008                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3009                                              dev->name);
3010                 }
3011         }
3012
3013         rc = -ENETDOWN;
3014 drop:
3015         rcu_read_unlock_bh();
3016
3017         atomic_long_inc(&dev->tx_dropped);
3018         kfree_skb_list(skb);
3019         return rc;
3020 out:
3021         rcu_read_unlock_bh();
3022         return rc;
3023 }
3024
3025 int dev_queue_xmit(struct sk_buff *skb)
3026 {
3027         return __dev_queue_xmit(skb, NULL);
3028 }
3029 EXPORT_SYMBOL(dev_queue_xmit);
3030
3031 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3032 {
3033         return __dev_queue_xmit(skb, accel_priv);
3034 }
3035 EXPORT_SYMBOL(dev_queue_xmit_accel);
3036
3037
3038 /*=======================================================================
3039                         Receiver routines
3040   =======================================================================*/
3041
3042 int netdev_max_backlog __read_mostly = 1000;
3043 EXPORT_SYMBOL(netdev_max_backlog);
3044
3045 int netdev_tstamp_prequeue __read_mostly = 1;
3046 int netdev_budget __read_mostly = 300;
3047 int weight_p __read_mostly = 64;            /* old backlog weight */
3048
3049 /* Called with irq disabled */
3050 static inline void ____napi_schedule(struct softnet_data *sd,
3051                                      struct napi_struct *napi)
3052 {
3053         list_add_tail(&napi->poll_list, &sd->poll_list);
3054         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3055 }
3056
3057 #ifdef CONFIG_RPS
3058
3059 /* One global table that all flow-based protocols share. */
3060 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3061 EXPORT_SYMBOL(rps_sock_flow_table);
3062
3063 struct static_key rps_needed __read_mostly;
3064
3065 static struct rps_dev_flow *
3066 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3067             struct rps_dev_flow *rflow, u16 next_cpu)
3068 {
3069         if (next_cpu != RPS_NO_CPU) {
3070 #ifdef CONFIG_RFS_ACCEL
3071                 struct netdev_rx_queue *rxqueue;
3072                 struct rps_dev_flow_table *flow_table;
3073                 struct rps_dev_flow *old_rflow;
3074                 u32 flow_id;
3075                 u16 rxq_index;
3076                 int rc;
3077
3078                 /* Should we steer this flow to a different hardware queue? */
3079                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3080                     !(dev->features & NETIF_F_NTUPLE))
3081                         goto out;
3082                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3083                 if (rxq_index == skb_get_rx_queue(skb))
3084                         goto out;
3085
3086                 rxqueue = dev->_rx + rxq_index;
3087                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3088                 if (!flow_table)
3089                         goto out;
3090                 flow_id = skb_get_hash(skb) & flow_table->mask;
3091                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3092                                                         rxq_index, flow_id);
3093                 if (rc < 0)
3094                         goto out;
3095                 old_rflow = rflow;
3096                 rflow = &flow_table->flows[flow_id];
3097                 rflow->filter = rc;
3098                 if (old_rflow->filter == rflow->filter)
3099                         old_rflow->filter = RPS_NO_FILTER;
3100         out:
3101 #endif
3102                 rflow->last_qtail =
3103                         per_cpu(softnet_data, next_cpu).input_queue_head;
3104         }
3105
3106         rflow->cpu = next_cpu;
3107         return rflow;
3108 }
3109
3110 /*
3111  * get_rps_cpu is called from netif_receive_skb and returns the target
3112  * CPU from the RPS map of the receiving queue for a given skb.
3113  * rcu_read_lock must be held on entry.
3114  */
3115 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3116                        struct rps_dev_flow **rflowp)
3117 {
3118         struct netdev_rx_queue *rxqueue;
3119         struct rps_map *map;
3120         struct rps_dev_flow_table *flow_table;
3121         struct rps_sock_flow_table *sock_flow_table;
3122         int cpu = -1;
3123         u16 tcpu;
3124         u32 hash;
3125
3126         if (skb_rx_queue_recorded(skb)) {
3127                 u16 index = skb_get_rx_queue(skb);
3128                 if (unlikely(index >= dev->real_num_rx_queues)) {
3129                         WARN_ONCE(dev->real_num_rx_queues > 1,
3130                                   "%s received packet on queue %u, but number "
3131                                   "of RX queues is %u\n",
3132                                   dev->name, index, dev->real_num_rx_queues);
3133                         goto done;
3134                 }
3135                 rxqueue = dev->_rx + index;
3136         } else
3137                 rxqueue = dev->_rx;
3138
3139         map = rcu_dereference(rxqueue->rps_map);
3140         if (map) {
3141                 if (map->len == 1 &&
3142                     !rcu_access_pointer(rxqueue->rps_flow_table)) {
3143                         tcpu = map->cpus[0];
3144                         if (cpu_online(tcpu))
3145                                 cpu = tcpu;
3146                         goto done;
3147                 }
3148         } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
3149                 goto done;
3150         }
3151
3152         skb_reset_network_header(skb);
3153         hash = skb_get_hash(skb);
3154         if (!hash)
3155                 goto done;
3156
3157         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3158         sock_flow_table = rcu_dereference(rps_sock_flow_table);
3159         if (flow_table && sock_flow_table) {
3160                 u16 next_cpu;
3161                 struct rps_dev_flow *rflow;
3162
3163                 rflow = &flow_table->flows[hash & flow_table->mask];
3164                 tcpu = rflow->cpu;
3165
3166                 next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
3167
3168                 /*
3169                  * If the desired CPU (where last recvmsg was done) is
3170                  * different from current CPU (one in the rx-queue flow
3171                  * table entry), switch if one of the following holds:
3172                  *   - Current CPU is unset (equal to RPS_NO_CPU).
3173                  *   - Current CPU is offline.
3174                  *   - The current CPU's queue tail has advanced beyond the
3175                  *     last packet that was enqueued using this table entry.
3176                  *     This guarantees that all previous packets for the flow
3177                  *     have been dequeued, thus preserving in order delivery.
3178                  */
3179                 if (unlikely(tcpu != next_cpu) &&
3180                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3181                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3182                       rflow->last_qtail)) >= 0)) {
3183                         tcpu = next_cpu;
3184                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3185                 }
3186
3187                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3188                         *rflowp = rflow;
3189                         cpu = tcpu;
3190                         goto done;
3191                 }
3192         }
3193
3194         if (map) {
3195                 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3196                 if (cpu_online(tcpu)) {
3197                         cpu = tcpu;
3198                         goto done;
3199                 }
3200         }
3201
3202 done:
3203         return cpu;
3204 }
3205
3206 #ifdef CONFIG_RFS_ACCEL
3207
3208 /**
3209  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3210  * @dev: Device on which the filter was set
3211  * @rxq_index: RX queue index
3212  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3213  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3214  *
3215  * Drivers that implement ndo_rx_flow_steer() should periodically call
3216  * this function for each installed filter and remove the filters for
3217  * which it returns %true.
3218  */
3219 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3220                          u32 flow_id, u16 filter_id)
3221 {
3222         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3223         struct rps_dev_flow_table *flow_table;
3224         struct rps_dev_flow *rflow;
3225         bool expire = true;
3226         int cpu;
3227
3228         rcu_read_lock();
3229         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3230         if (flow_table && flow_id <= flow_table->mask) {
3231                 rflow = &flow_table->flows[flow_id];
3232                 cpu = ACCESS_ONCE(rflow->cpu);
3233                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3234                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3235                            rflow->last_qtail) <
3236                      (int)(10 * flow_table->mask)))
3237                         expire = false;
3238         }
3239         rcu_read_unlock();
3240         return expire;
3241 }
3242 EXPORT_SYMBOL(rps_may_expire_flow);
3243
3244 #endif /* CONFIG_RFS_ACCEL */
3245
3246 /* Called from hardirq (IPI) context */
3247 static void rps_trigger_softirq(void *data)
3248 {
3249         struct softnet_data *sd = data;
3250
3251         ____napi_schedule(sd, &sd->backlog);
3252         sd->received_rps++;
3253 }
3254
3255 #endif /* CONFIG_RPS */
3256
3257 /*
3258  * Check if this softnet_data structure is another cpu one
3259  * If yes, queue it to our IPI list and return 1
3260  * If no, return 0
3261  */
3262 static int rps_ipi_queued(struct softnet_data *sd)
3263 {
3264 #ifdef CONFIG_RPS
3265         struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3266
3267         if (sd != mysd) {
3268                 sd->rps_ipi_next = mysd->rps_ipi_list;
3269                 mysd->rps_ipi_list = sd;
3270
3271                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3272                 return 1;
3273         }
3274 #endif /* CONFIG_RPS */
3275         return 0;
3276 }
3277
3278 #ifdef CONFIG_NET_FLOW_LIMIT
3279 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3280 #endif
3281
3282 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3283 {
3284 #ifdef CONFIG_NET_FLOW_LIMIT
3285         struct sd_flow_limit *fl;
3286         struct softnet_data *sd;
3287         unsigned int old_flow, new_flow;
3288
3289         if (qlen < (netdev_max_backlog >> 1))
3290                 return false;
3291
3292         sd = this_cpu_ptr(&softnet_data);
3293
3294         rcu_read_lock();
3295         fl = rcu_dereference(sd->flow_limit);
3296         if (fl) {
3297                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3298                 old_flow = fl->history[fl->history_head];
3299                 fl->history[fl->history_head] = new_flow;
3300
3301                 fl->history_head++;
3302                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3303
3304                 if (likely(fl->buckets[old_flow]))
3305                         fl->buckets[old_flow]--;
3306
3307                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3308                         fl->count++;
3309                         rcu_read_unlock();
3310                         return true;
3311                 }
3312         }
3313         rcu_read_unlock();
3314 #endif
3315         return false;
3316 }
3317
3318 /*
3319  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3320  * queue (may be a remote CPU queue).
3321  */
3322 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3323                               unsigned int *qtail)
3324 {
3325         struct softnet_data *sd;
3326         unsigned long flags;
3327         unsigned int qlen;
3328
3329         sd = &per_cpu(softnet_data, cpu);
3330
3331         local_irq_save(flags);
3332
3333         rps_lock(sd);
3334         qlen = skb_queue_len(&sd->input_pkt_queue);
3335         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3336                 if (qlen) {
3337 enqueue:
3338                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3339                         input_queue_tail_incr_save(sd, qtail);
3340                         rps_unlock(sd);
3341                         local_irq_restore(flags);
3342                         return NET_RX_SUCCESS;
3343                 }
3344
3345                 /* Schedule NAPI for backlog device
3346                  * We can use non atomic operation since we own the queue lock
3347                  */
3348                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3349                         if (!rps_ipi_queued(sd))
3350                                 ____napi_schedule(sd, &sd->backlog);
3351                 }
3352                 goto enqueue;
3353         }
3354
3355         sd->dropped++;
3356         rps_unlock(sd);
3357
3358         local_irq_restore(flags);
3359
3360         atomic_long_inc(&skb->dev->rx_dropped);
3361         kfree_skb(skb);
3362         return NET_RX_DROP;
3363 }
3364
3365 static int netif_rx_internal(struct sk_buff *skb)
3366 {
3367         int ret;
3368
3369         net_timestamp_check(netdev_tstamp_prequeue, skb);
3370
3371         trace_netif_rx(skb);
3372 #ifdef CONFIG_RPS
3373         if (static_key_false(&rps_needed)) {
3374                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3375                 int cpu;
3376
3377                 preempt_disable();
3378                 rcu_read_lock();
3379
3380                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3381                 if (cpu < 0)
3382                         cpu = smp_processor_id();
3383
3384                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3385
3386                 rcu_read_unlock();
3387                 preempt_enable();
3388         } else
3389 #endif
3390         {
3391                 unsigned int qtail;
3392                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3393                 put_cpu();
3394         }
3395         return ret;
3396 }
3397
3398 /**
3399  *      netif_rx        -       post buffer to the network code
3400  *      @skb: buffer to post
3401  *
3402  *      This function receives a packet from a device driver and queues it for
3403  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3404  *      may be dropped during processing for congestion control or by the
3405  *      protocol layers.
3406  *
3407  *      return values:
3408  *      NET_RX_SUCCESS  (no congestion)
3409  *      NET_RX_DROP     (packet was dropped)
3410  *
3411  */
3412
3413 int netif_rx(struct sk_buff *skb)
3414 {
3415         trace_netif_rx_entry(skb);
3416
3417         return netif_rx_internal(skb);
3418 }
3419 EXPORT_SYMBOL(netif_rx);
3420
3421 int netif_rx_ni(struct sk_buff *skb)
3422 {
3423         int err;
3424
3425         trace_netif_rx_ni_entry(skb);
3426
3427         preempt_disable();
3428         err = netif_rx_internal(skb);
3429         if (local_softirq_pending())
3430                 do_softirq();
3431         preempt_enable();
3432
3433         return err;
3434 }
3435 EXPORT_SYMBOL(netif_rx_ni);
3436
3437 static void net_tx_action(struct softirq_action *h)
3438 {
3439         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3440
3441         if (sd->completion_queue) {
3442                 struct sk_buff *clist;
3443
3444                 local_irq_disable();
3445                 clist = sd->completion_queue;
3446                 sd->completion_queue = NULL;
3447                 local_irq_enable();
3448
3449                 while (clist) {
3450                         struct sk_buff *skb = clist;
3451                         clist = clist->next;
3452
3453                         WARN_ON(atomic_read(&skb->users));
3454                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3455                                 trace_consume_skb(skb);
3456                         else
3457                                 trace_kfree_skb(skb, net_tx_action);
3458                         __kfree_skb(skb);
3459                 }
3460         }
3461
3462         if (sd->output_queue) {
3463                 struct Qdisc *head;
3464
3465                 local_irq_disable();
3466                 head = sd->output_queue;
3467                 sd->output_queue = NULL;
3468                 sd->output_queue_tailp = &sd->output_queue;
3469                 local_irq_enable();
3470
3471                 while (head) {
3472                         struct Qdisc *q = head;
3473                         spinlock_t *root_lock;
3474
3475                         head = head->next_sched;
3476
3477                         root_lock = qdisc_lock(q);
3478                         if (spin_trylock(root_lock)) {
3479                                 smp_mb__before_atomic();
3480                                 clear_bit(__QDISC_STATE_SCHED,
3481                                           &q->state);
3482                                 qdisc_run(q);
3483                                 spin_unlock(root_lock);
3484                         } else {
3485                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3486                                               &q->state)) {
3487                                         __netif_reschedule(q);
3488                                 } else {
3489                                         smp_mb__before_atomic();
3490                                         clear_bit(__QDISC_STATE_SCHED,
3491                                                   &q->state);
3492                                 }
3493                         }
3494                 }
3495         }
3496 }
3497
3498 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3499     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3500 /* This hook is defined here for ATM LANE */
3501 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3502                              unsigned char *addr) __read_mostly;
3503 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3504 #endif
3505
3506 #ifdef CONFIG_NET_CLS_ACT
3507 /* TODO: Maybe we should just force sch_ingress to be compiled in
3508  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3509  * a compare and 2 stores extra right now if we dont have it on
3510  * but have CONFIG_NET_CLS_ACT
3511  * NOTE: This doesn't stop any functionality; if you dont have
3512  * the ingress scheduler, you just can't add policies on ingress.
3513  *
3514  */
3515 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3516 {
3517         struct net_device *dev = skb->dev;
3518         u32 ttl = G_TC_RTTL(skb->tc_verd);
3519         int result = TC_ACT_OK;
3520         struct Qdisc *q;
3521
3522         if (unlikely(MAX_RED_LOOP < ttl++)) {
3523                 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3524                                      skb->skb_iif, dev->ifindex);
3525                 return TC_ACT_SHOT;
3526         }
3527
3528         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3529         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3530
3531         q = rcu_dereference(rxq->qdisc);
3532         if (q != &noop_qdisc) {
3533                 spin_lock(qdisc_lock(q));
3534                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3535                         result = qdisc_enqueue_root(skb, q);
3536                 spin_unlock(qdisc_lock(q));
3537         }
3538
3539         return result;
3540 }
3541
3542 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3543                                          struct packet_type **pt_prev,
3544                                          int *ret, struct net_device *orig_dev)
3545 {
3546         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3547
3548         if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc)
3549                 goto out;
3550
3551         if (*pt_prev) {
3552                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3553                 *pt_prev = NULL;
3554         }
3555
3556         switch (ing_filter(skb, rxq)) {
3557         case TC_ACT_SHOT:
3558         case TC_ACT_STOLEN:
3559                 kfree_skb(skb);
3560                 return NULL;
3561         }
3562
3563 out:
3564         skb->tc_verd = 0;
3565         return skb;
3566 }
3567 #endif
3568
3569 /**
3570  *      netdev_rx_handler_register - register receive handler
3571  *      @dev: device to register a handler for
3572  *      @rx_handler: receive handler to register
3573  *      @rx_handler_data: data pointer that is used by rx handler
3574  *
3575  *      Register a receive handler for a device. This handler will then be
3576  *      called from __netif_receive_skb. A negative errno code is returned
3577  *      on a failure.
3578  *
3579  *      The caller must hold the rtnl_mutex.
3580  *
3581  *      For a general description of rx_handler, see enum rx_handler_result.
3582  */
3583 int netdev_rx_handler_register(struct net_device *dev,
3584                                rx_handler_func_t *rx_handler,
3585                                void *rx_handler_data)
3586 {
3587         ASSERT_RTNL();
3588
3589         if (dev->rx_handler)
3590                 return -EBUSY;
3591
3592         /* Note: rx_handler_data must be set before rx_handler */
3593         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3594         rcu_assign_pointer(dev->rx_handler, rx_handler);
3595
3596         return 0;
3597 }
3598 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3599
3600 /**
3601  *      netdev_rx_handler_unregister - unregister receive handler
3602  *      @dev: device to unregister a handler from
3603  *
3604  *      Unregister a receive handler from a device.
3605  *
3606  *      The caller must hold the rtnl_mutex.
3607  */
3608 void netdev_rx_handler_unregister(struct net_device *dev)
3609 {
3610
3611         ASSERT_RTNL();
3612         RCU_INIT_POINTER(dev->rx_handler, NULL);
3613         /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3614          * section has a guarantee to see a non NULL rx_handler_data
3615          * as well.
3616          */
3617         synchronize_net();
3618         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3619 }
3620 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3621
3622 /*
3623  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3624  * the special handling of PFMEMALLOC skbs.
3625  */
3626 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3627 {
3628         switch (skb->protocol) {
3629         case htons(ETH_P_ARP):
3630         case htons(ETH_P_IP):
3631         case htons(ETH_P_IPV6):
3632         case htons(ETH_P_8021Q):
3633         case htons(ETH_P_8021AD):
3634                 return true;
3635         default:
3636                 return false;
3637         }
3638 }
3639
3640 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3641 {
3642         struct packet_type *ptype, *pt_prev;
3643         rx_handler_func_t *rx_handler;
3644         struct net_device *orig_dev;
3645         bool deliver_exact = false;
3646         int ret = NET_RX_DROP;
3647         __be16 type;
3648
3649         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3650
3651         trace_netif_receive_skb(skb);
3652
3653         orig_dev = skb->dev;
3654
3655         skb_reset_network_header(skb);
3656         if (!skb_transport_header_was_set(skb))
3657                 skb_reset_transport_header(skb);
3658         skb_reset_mac_len(skb);
3659
3660         pt_prev = NULL;
3661
3662         rcu_read_lock();
3663
3664 another_round:
3665         skb->skb_iif = skb->dev->ifindex;
3666
3667         __this_cpu_inc(softnet_data.processed);
3668
3669         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3670             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3671                 skb = skb_vlan_untag(skb);
3672                 if (unlikely(!skb))
3673                         goto unlock;
3674         }
3675
3676 #ifdef CONFIG_NET_CLS_ACT
3677         if (skb->tc_verd & TC_NCLS) {
3678                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3679                 goto ncls;
3680         }
3681 #endif
3682
3683         if (pfmemalloc)
3684                 goto skip_taps;
3685
3686         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3687                 if (pt_prev)
3688                         ret = deliver_skb(skb, pt_prev, orig_dev);
3689                 pt_prev = ptype;
3690         }
3691
3692         list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
3693                 if (pt_prev)
3694                         ret = deliver_skb(skb, pt_prev, orig_dev);
3695                 pt_prev = ptype;
3696         }
3697
3698 skip_taps:
3699 #ifdef CONFIG_NET_CLS_ACT
3700         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3701         if (!skb)
3702                 goto unlock;
3703 ncls:
3704 #endif
3705
3706         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3707                 goto drop;
3708
3709         if (skb_vlan_tag_present(skb)) {
3710                 if (pt_prev) {
3711                         ret = deliver_skb(skb, pt_prev, orig_dev);
3712                         pt_prev = NULL;
3713                 }
3714                 if (vlan_do_receive(&skb))
3715                         goto another_round;
3716                 else if (unlikely(!skb))
3717                         goto unlock;
3718         }
3719
3720         rx_handler = rcu_dereference(skb->dev->rx_handler);
3721         if (rx_handler) {
3722                 if (pt_prev) {
3723                         ret = deliver_skb(skb, pt_prev, orig_dev);
3724                         pt_prev = NULL;
3725                 }
3726                 switch (rx_handler(&skb)) {
3727                 case RX_HANDLER_CONSUMED:
3728                         ret = NET_RX_SUCCESS;
3729                         goto unlock;
3730                 case RX_HANDLER_ANOTHER:
3731                         goto another_round;
3732                 case RX_HANDLER_EXACT:
3733                         deliver_exact = true;
3734                 case RX_HANDLER_PASS:
3735                         break;
3736                 default:
3737                         BUG();
3738                 }
3739         }
3740
3741         if (unlikely(skb_vlan_tag_present(skb))) {
3742                 if (skb_vlan_tag_get_id(skb))
3743                         skb->pkt_type = PACKET_OTHERHOST;
3744                 /* Note: we might in the future use prio bits
3745                  * and set skb->priority like in vlan_do_receive()
3746                  * For the time being, just ignore Priority Code Point
3747                  */
3748                 skb->vlan_tci = 0;
3749         }
3750
3751         type = skb->protocol;
3752
3753         /* deliver only exact match when indicated */
3754         if (likely(!deliver_exact)) {
3755                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3756                                        &ptype_base[ntohs(type) &
3757                                                    PTYPE_HASH_MASK]);
3758         }
3759
3760         deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3761                                &orig_dev->ptype_specific);
3762
3763         if (unlikely(skb->dev != orig_dev)) {
3764                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3765                                        &skb->dev->ptype_specific);
3766         }
3767
3768         if (pt_prev) {
3769                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3770                         goto drop;
3771                 else
3772                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3773         } else {
3774 drop:
3775                 atomic_long_inc(&skb->dev->rx_dropped);
3776                 kfree_skb(skb);
3777                 /* Jamal, now you will not able to escape explaining
3778                  * me how you were going to use this. :-)
3779                  */
3780                 ret = NET_RX_DROP;
3781         }
3782
3783 unlock:
3784         rcu_read_unlock();
3785         return ret;
3786 }
3787
3788 static int __netif_receive_skb(struct sk_buff *skb)
3789 {
3790         int ret;
3791
3792         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3793                 unsigned long pflags = current->flags;
3794
3795                 /*
3796                  * PFMEMALLOC skbs are special, they should
3797                  * - be delivered to SOCK_MEMALLOC sockets only
3798                  * - stay away from userspace
3799                  * - have bounded memory usage
3800                  *
3801                  * Use PF_MEMALLOC as this saves us from propagating the allocation
3802                  * context down to all allocation sites.
3803                  */
3804                 current->flags |= PF_MEMALLOC;
3805                 ret = __netif_receive_skb_core(skb, true);
3806                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3807         } else
3808                 ret = __netif_receive_skb_core(skb, false);
3809
3810         return ret;
3811 }
3812
3813 static int netif_receive_skb_internal(struct sk_buff *skb)
3814 {
3815         net_timestamp_check(netdev_tstamp_prequeue, skb);
3816
3817         if (skb_defer_rx_timestamp(skb))
3818                 return NET_RX_SUCCESS;
3819
3820 #ifdef CONFIG_RPS
3821         if (static_key_false(&rps_needed)) {
3822                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3823                 int cpu, ret;
3824
3825                 rcu_read_lock();
3826
3827                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3828
3829                 if (cpu >= 0) {
3830                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3831                         rcu_read_unlock();
3832                         return ret;
3833                 }
3834                 rcu_read_unlock();
3835         }
3836 #endif
3837         return __netif_receive_skb(skb);
3838 }
3839
3840 /**
3841  *      netif_receive_skb - process receive buffer from network
3842  *      @skb: buffer to process
3843  *
3844  *      netif_receive_skb() is the main receive data processing function.
3845  *      It always succeeds. The buffer may be dropped during processing
3846  *      for congestion control or by the protocol layers.
3847  *
3848  *      This function may only be called from softirq context and interrupts
3849  *      should be enabled.
3850  *
3851  *      Return values (usually ignored):
3852  *      NET_RX_SUCCESS: no congestion
3853  *      NET_RX_DROP: packet was dropped
3854  */
3855 int netif_receive_skb(struct sk_buff *skb)
3856 {
3857         trace_netif_receive_skb_entry(skb);
3858
3859         return netif_receive_skb_internal(skb);
3860 }
3861 EXPORT_SYMBOL(netif_receive_skb);
3862
3863 /* Network device is going away, flush any packets still pending
3864  * Called with irqs disabled.
3865  */
3866 static void flush_backlog(void *arg)
3867 {
3868         struct net_device *dev = arg;
3869         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3870         struct sk_buff *skb, *tmp;
3871
3872         rps_lock(sd);
3873         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3874                 if (skb->dev == dev) {
3875                         __skb_unlink(skb, &sd->input_pkt_queue);
3876                         kfree_skb(skb);
3877                         input_queue_head_incr(sd);
3878                 }
3879         }
3880         rps_unlock(sd);
3881
3882         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3883                 if (skb->dev == dev) {
3884                         __skb_unlink(skb, &sd->process_queue);
3885                         kfree_skb(skb);
3886                         input_queue_head_incr(sd);
3887                 }
3888         }
3889 }
3890
3891 static int napi_gro_complete(struct sk_buff *skb)
3892 {
3893         struct packet_offload *ptype;
3894         __be16 type = skb->protocol;
3895         struct list_head *head = &offload_base;
3896         int err = -ENOENT;
3897
3898         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3899
3900         if (NAPI_GRO_CB(skb)->count == 1) {
3901                 skb_shinfo(skb)->gso_size = 0;
3902                 goto out;
3903         }
3904
3905         rcu_read_lock();
3906         list_for_each_entry_rcu(ptype, head, list) {
3907                 if (ptype->type != type || !ptype->callbacks.gro_complete)
3908                         continue;
3909
3910                 err = ptype->callbacks.gro_complete(skb, 0);
3911                 break;
3912         }
3913         rcu_read_unlock();
3914
3915         if (err) {
3916                 WARN_ON(&ptype->list == head);
3917                 kfree_skb(skb);
3918                 return NET_RX_SUCCESS;
3919         }
3920
3921 out:
3922         return netif_receive_skb_internal(skb);
3923 }
3924
3925 /* napi->gro_list contains packets ordered by age.
3926  * youngest packets at the head of it.
3927  * Complete skbs in reverse order to reduce latencies.
3928  */
3929 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3930 {
3931         struct sk_buff *skb, *prev = NULL;
3932
3933         /* scan list and build reverse chain */
3934         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3935                 skb->prev = prev;
3936                 prev = skb;
3937         }
3938
3939         for (skb = prev; skb; skb = prev) {
3940                 skb->next = NULL;
3941
3942                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3943                         return;
3944
3945                 prev = skb->prev;
3946                 napi_gro_complete(skb);
3947                 napi->gro_count--;
3948         }
3949
3950         napi->gro_list = NULL;
3951 }
3952 EXPORT_SYMBOL(napi_gro_flush);
3953
3954 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3955 {
3956         struct sk_buff *p;
3957         unsigned int maclen = skb->dev->hard_header_len;
3958         u32 hash = skb_get_hash_raw(skb);
3959
3960         for (p = napi->gro_list; p; p = p->next) {
3961                 unsigned long diffs;
3962
3963                 NAPI_GRO_CB(p)->flush = 0;
3964
3965                 if (hash != skb_get_hash_raw(p)) {
3966                         NAPI_GRO_CB(p)->same_flow = 0;
3967                         continue;
3968                 }
3969
3970                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3971                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3972                 if (maclen == ETH_HLEN)
3973                         diffs |= compare_ether_header(skb_mac_header(p),
3974                                                       skb_mac_header(skb));
3975                 else if (!diffs)
3976                         diffs = memcmp(skb_mac_header(p),
3977                                        skb_mac_header(skb),
3978                                        maclen);
3979                 NAPI_GRO_CB(p)->same_flow = !diffs;
3980         }
3981 }
3982
3983 static void skb_gro_reset_offset(struct sk_buff *skb)
3984 {
3985         const struct skb_shared_info *pinfo = skb_shinfo(skb);
3986         const skb_frag_t *frag0 = &pinfo->frags[0];
3987
3988         NAPI_GRO_CB(skb)->data_offset = 0;
3989         NAPI_GRO_CB(skb)->frag0 = NULL;
3990         NAPI_GRO_CB(skb)->frag0_len = 0;
3991
3992         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3993             pinfo->nr_frags &&
3994             !PageHighMem(skb_frag_page(frag0))) {
3995                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3996                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3997         }
3998 }
3999
4000 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4001 {
4002         struct skb_shared_info *pinfo = skb_shinfo(skb);
4003
4004         BUG_ON(skb->end - skb->tail < grow);
4005
4006         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4007
4008         skb->data_len -= grow;
4009         skb->tail += grow;
4010
4011         pinfo->frags[0].page_offset += grow;
4012         skb_frag_size_sub(&pinfo->frags[0], grow);
4013
4014         if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4015                 skb_frag_unref(skb, 0);
4016                 memmove(pinfo->frags, pinfo->frags + 1,
4017                         --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4018         }
4019 }
4020
4021 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4022 {
4023         struct sk_buff **pp = NULL;
4024         struct packet_offload *ptype;
4025         __be16 type = skb->protocol;
4026         struct list_head *head = &offload_base;
4027         int same_flow;
4028         enum gro_result ret;
4029         int grow;
4030
4031         if (!(skb->dev->features & NETIF_F_GRO))
4032                 goto normal;
4033
4034         if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4035                 goto normal;
4036
4037         gro_list_prepare(napi, skb);
4038
4039         rcu_read_lock();
4040         list_for_each_entry_rcu(ptype, head, list) {
4041                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4042                         continue;
4043
4044                 skb_set_network_header(skb, skb_gro_offset(skb));
4045                 skb_reset_mac_len(skb);
4046                 NAPI_GRO_CB(skb)->same_flow = 0;
4047                 NAPI_GRO_CB(skb)->flush = 0;
4048                 NAPI_GRO_CB(skb)->free = 0;
4049                 NAPI_GRO_CB(skb)->udp_mark = 0;
4050
4051                 /* Setup for GRO checksum validation */
4052                 switch (skb->ip_summed) {
4053                 case CHECKSUM_COMPLETE:
4054                         NAPI_GRO_CB(skb)->csum = skb->csum;
4055                         NAPI_GRO_CB(skb)->csum_valid = 1;
4056                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4057                         break;
4058                 case CHECKSUM_UNNECESSARY:
4059                         NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4060                         NAPI_GRO_CB(skb)->csum_valid = 0;
4061                         break;
4062                 default:
4063                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4064                         NAPI_GRO_CB(skb)->csum_valid = 0;
4065                 }
4066
4067                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4068                 break;
4069         }
4070         rcu_read_unlock();
4071
4072         if (&ptype->list == head)
4073                 goto normal;
4074
4075         same_flow = NAPI_GRO_CB(skb)->same_flow;
4076         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4077
4078         if (pp) {
4079                 struct sk_buff *nskb = *pp;
4080
4081                 *pp = nskb->next;
4082                 nskb->next = NULL;
4083                 napi_gro_complete(nskb);
4084                 napi->gro_count--;
4085         }
4086
4087         if (same_flow)
4088                 goto ok;
4089
4090         if (NAPI_GRO_CB(skb)->flush)
4091                 goto normal;
4092
4093         if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4094                 struct sk_buff *nskb = napi->gro_list;
4095
4096                 /* locate the end of the list to select the 'oldest' flow */
4097                 while (nskb->next) {
4098                         pp = &nskb->next;
4099                         nskb = *pp;
4100                 }
4101                 *pp = NULL;
4102                 nskb->next = NULL;
4103                 napi_gro_complete(nskb);
4104         } else {
4105                 napi->gro_count++;
4106         }
4107         NAPI_GRO_CB(skb)->count = 1;
4108         NAPI_GRO_CB(skb)->age = jiffies;
4109         NAPI_GRO_CB(skb)->last = skb;
4110         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4111         skb->next = napi->gro_list;
4112         napi->gro_list = skb;
4113         ret = GRO_HELD;
4114
4115 pull:
4116         grow = skb_gro_offset(skb) - skb_headlen(skb);
4117         if (grow > 0)
4118                 gro_pull_from_frag0(skb, grow);
4119 ok:
4120         return ret;
4121
4122 normal:
4123         ret = GRO_NORMAL;
4124         goto pull;
4125 }
4126
4127 struct packet_offload *gro_find_receive_by_type(__be16 type)
4128 {
4129         struct list_head *offload_head = &offload_base;
4130         struct packet_offload *ptype;
4131
4132         list_for_each_entry_rcu(ptype, offload_head, list) {
4133                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4134                         continue;
4135                 return ptype;
4136         }
4137         return NULL;
4138 }
4139 EXPORT_SYMBOL(gro_find_receive_by_type);
4140
4141 struct packet_offload *gro_find_complete_by_type(__be16 type)
4142 {
4143         struct list_head *offload_head = &offload_base;
4144         struct packet_offload *ptype;
4145
4146         list_for_each_entry_rcu(ptype, offload_head, list) {
4147                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4148                         continue;
4149                 return ptype;
4150         }
4151         return NULL;
4152 }
4153 EXPORT_SYMBOL(gro_find_complete_by_type);
4154
4155 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4156 {
4157         switch (ret) {
4158         case GRO_NORMAL:
4159                 if (netif_receive_skb_internal(skb))
4160                         ret = GRO_DROP;
4161                 break;
4162
4163         case GRO_DROP:
4164                 kfree_skb(skb);
4165                 break;
4166
4167         case GRO_MERGED_FREE:
4168                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4169                         kmem_cache_free(skbuff_head_cache, skb);
4170                 else
4171                         __kfree_skb(skb);
4172                 break;
4173
4174         case GRO_HELD:
4175         case GRO_MERGED:
4176                 break;
4177         }
4178
4179         return ret;
4180 }
4181
4182 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4183 {
4184         trace_napi_gro_receive_entry(skb);
4185
4186         skb_gro_reset_offset(skb);
4187
4188         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4189 }
4190 EXPORT_SYMBOL(napi_gro_receive);
4191
4192 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4193 {
4194         if (unlikely(skb->pfmemalloc)) {
4195                 consume_skb(skb);
4196                 return;
4197         }
4198         __skb_pull(skb, skb_headlen(skb));
4199         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4200         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4201         skb->vlan_tci = 0;
4202         skb->dev = napi->dev;
4203         skb->skb_iif = 0;
4204         skb->encapsulation = 0;
4205         skb_shinfo(skb)->gso_type = 0;
4206         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4207
4208         napi->skb = skb;
4209 }
4210
4211 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4212 {
4213         struct sk_buff *skb = napi->skb;
4214
4215         if (!skb) {
4216                 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4217                 napi->skb = skb;
4218         }
4219         return skb;
4220 }
4221 EXPORT_SYMBOL(napi_get_frags);
4222
4223 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4224                                       struct sk_buff *skb,
4225                                       gro_result_t ret)
4226 {
4227         switch (ret) {
4228         case GRO_NORMAL:
4229         case GRO_HELD:
4230                 __skb_push(skb, ETH_HLEN);
4231                 skb->protocol = eth_type_trans(skb, skb->dev);
4232                 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4233                         ret = GRO_DROP;
4234                 break;
4235
4236         case GRO_DROP:
4237         case GRO_MERGED_FREE:
4238                 napi_reuse_skb(napi, skb);
4239                 break;
4240
4241         case GRO_MERGED:
4242                 break;
4243         }
4244
4245         return ret;
4246 }
4247
4248 /* Upper GRO stack assumes network header starts at gro_offset=0
4249  * Drivers could call both napi_gro_frags() and napi_gro_receive()
4250  * We copy ethernet header into skb->data to have a common layout.
4251  */
4252 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4253 {
4254         struct sk_buff *skb = napi->skb;
4255         const struct ethhdr *eth;
4256         unsigned int hlen = sizeof(*eth);
4257
4258         napi->skb = NULL;
4259
4260         skb_reset_mac_header(skb);
4261         skb_gro_reset_offset(skb);
4262
4263         eth = skb_gro_header_fast(skb, 0);
4264         if (unlikely(skb_gro_header_hard(skb, hlen))) {
4265                 eth = skb_gro_header_slow(skb, hlen, 0);
4266                 if (unlikely(!eth)) {
4267                         napi_reuse_skb(napi, skb);
4268                         return NULL;
4269                 }
4270         } else {
4271                 gro_pull_from_frag0(skb, hlen);
4272                 NAPI_GRO_CB(skb)->frag0 += hlen;
4273                 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4274         }
4275         __skb_pull(skb, hlen);
4276
4277         /*
4278          * This works because the only protocols we care about don't require
4279          * special handling.
4280          * We'll fix it up properly in napi_frags_finish()
4281          */
4282         skb->protocol = eth->h_proto;
4283
4284         return skb;
4285 }
4286
4287 gro_result_t napi_gro_frags(struct napi_struct *napi)
4288 {
4289         struct sk_buff *skb = napi_frags_skb(napi);
4290
4291         if (!skb)
4292                 return GRO_DROP;
4293
4294         trace_napi_gro_frags_entry(skb);
4295
4296         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4297 }
4298 EXPORT_SYMBOL(napi_gro_frags);
4299
4300 /* Compute the checksum from gro_offset and return the folded value
4301  * after adding in any pseudo checksum.
4302  */
4303 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4304 {
4305         __wsum wsum;
4306         __sum16 sum;
4307
4308         wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4309
4310         /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4311         sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4312         if (likely(!sum)) {
4313                 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4314                     !skb->csum_complete_sw)
4315                         netdev_rx_csum_fault(skb->dev);
4316         }
4317
4318         NAPI_GRO_CB(skb)->csum = wsum;
4319         NAPI_GRO_CB(skb)->csum_valid = 1;
4320
4321         return sum;
4322 }
4323 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4324
4325 /*
4326  * net_rps_action_and_irq_enable sends any pending IPI's for rps.
4327  * Note: called with local irq disabled, but exits with local irq enabled.
4328  */
4329 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4330 {
4331 #ifdef CONFIG_RPS
4332         struct softnet_data *remsd = sd->rps_ipi_list;
4333
4334         if (remsd) {
4335                 sd->rps_ipi_list = NULL;
4336
4337                 local_irq_enable();
4338
4339                 /* Send pending IPI's to kick RPS processing on remote cpus. */
4340                 while (remsd) {
4341                         struct softnet_data *next = remsd->rps_ipi_next;
4342
4343                         if (cpu_online(remsd->cpu))
4344                                 smp_call_function_single_async(remsd->cpu,
4345                                                            &remsd->csd);
4346                         remsd = next;
4347                 }
4348         } else
4349 #endif
4350                 local_irq_enable();
4351 }
4352
4353 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4354 {
4355 #ifdef CONFIG_RPS
4356         return sd->rps_ipi_list != NULL;
4357 #else
4358         return false;
4359 #endif
4360 }
4361
4362 static int process_backlog(struct napi_struct *napi, int quota)
4363 {
4364         int work = 0;
4365         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4366
4367         /* Check if we have pending ipi, its better to send them now,
4368          * not waiting net_rx_action() end.
4369          */
4370         if (sd_has_rps_ipi_waiting(sd)) {
4371                 local_irq_disable();
4372                 net_rps_action_and_irq_enable(sd);
4373         }
4374
4375         napi->weight = weight_p;
4376         local_irq_disable();
4377         while (1) {
4378                 struct sk_buff *skb;
4379
4380                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4381                         local_irq_enable();
4382                         __netif_receive_skb(skb);
4383                         local_irq_disable();
4384                         input_queue_head_incr(sd);
4385                         if (++work >= quota) {
4386                                 local_irq_enable();
4387                                 return work;
4388                         }
4389                 }
4390
4391                 rps_lock(sd);
4392                 if (skb_queue_empty(&sd->input_pkt_queue)) {
4393                         /*
4394                          * Inline a custom version of __napi_complete().
4395                          * only current cpu owns and manipulates this napi,
4396                          * and NAPI_STATE_SCHED is the only possible flag set
4397                          * on backlog.
4398                          * We can use a plain write instead of clear_bit(),
4399                          * and we dont need an smp_mb() memory barrier.
4400                          */
4401                         napi->state = 0;
4402                         rps_unlock(sd);
4403
4404                         break;
4405                 }
4406
4407                 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4408                                            &sd->process_queue);
4409                 rps_unlock(sd);
4410         }
4411         local_irq_enable();
4412
4413         return work;
4414 }
4415
4416 /**
4417  * __napi_schedule - schedule for receive
4418  * @n: entry to schedule
4419  *
4420  * The entry's receive function will be scheduled to run.
4421  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4422  */
4423 void __napi_schedule(struct napi_struct *n)
4424 {
4425         unsigned long flags;
4426
4427         local_irq_save(flags);
4428         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4429         local_irq_restore(flags);
4430 }
4431 EXPORT_SYMBOL(__napi_schedule);
4432
4433 /**
4434  * __napi_schedule_irqoff - schedule for receive
4435  * @n: entry to schedule
4436  *
4437  * Variant of __napi_schedule() assuming hard irqs are masked
4438  */
4439 void __napi_schedule_irqoff(struct napi_struct *n)
4440 {
4441         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4442 }
4443 EXPORT_SYMBOL(__napi_schedule_irqoff);
4444
4445 void __napi_complete(struct napi_struct *n)
4446 {
4447         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4448
4449         list_del_init(&n->poll_list);
4450         smp_mb__before_atomic();
4451         clear_bit(NAPI_STATE_SCHED, &n->state);
4452 }
4453 EXPORT_SYMBOL(__napi_complete);
4454
4455 void napi_complete_done(struct napi_struct *n, int work_done)
4456 {
4457         unsigned long flags;
4458
4459         /*
4460          * don't let napi dequeue from the cpu poll list
4461          * just in case its running on a different cpu
4462          */
4463         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4464                 return;
4465
4466         if (n->gro_list) {
4467                 unsigned long timeout = 0;
4468
4469                 if (work_done)
4470                         timeout = n->dev->gro_flush_timeout;
4471
4472                 if (timeout)
4473                         hrtimer_start(&n->timer, ns_to_ktime(timeout),
4474                                       HRTIMER_MODE_REL_PINNED);
4475                 else
4476                         napi_gro_flush(n, false);
4477         }
4478         if (likely(list_empty(&n->poll_list))) {
4479                 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4480         } else {
4481                 /* If n->poll_list is not empty, we need to mask irqs */
4482                 local_irq_save(flags);
4483                 __napi_complete(n);
4484                 local_irq_restore(flags);
4485         }
4486 }
4487 EXPORT_SYMBOL(napi_complete_done);
4488
4489 /* must be called under rcu_read_lock(), as we dont take a reference */
4490 struct napi_struct *napi_by_id(unsigned int napi_id)
4491 {
4492         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4493         struct napi_struct *napi;
4494
4495         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4496                 if (napi->napi_id == napi_id)
4497                         return napi;
4498
4499         return NULL;
4500 }
4501 EXPORT_SYMBOL_GPL(napi_by_id);
4502
4503 void napi_hash_add(struct napi_struct *napi)
4504 {
4505         if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4506
4507                 spin_lock(&napi_hash_lock);
4508
4509                 /* 0 is not a valid id, we also skip an id that is taken
4510                  * we expect both events to be extremely rare
4511                  */
4512                 napi->napi_id = 0;
4513                 while (!napi->napi_id) {
4514                         napi->napi_id = ++napi_gen_id;
4515                         if (napi_by_id(napi->napi_id))
4516                                 napi->napi_id = 0;
4517                 }
4518
4519                 hlist_add_head_rcu(&napi->napi_hash_node,
4520                         &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4521
4522                 spin_unlock(&napi_hash_lock);
4523         }
4524 }
4525 EXPORT_SYMBOL_GPL(napi_hash_add);
4526
4527 /* Warning : caller is responsible to make sure rcu grace period
4528  * is respected before freeing memory containing @napi
4529  */
4530 void napi_hash_del(struct napi_struct *napi)
4531 {
4532         spin_lock(&napi_hash_lock);
4533
4534         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4535                 hlist_del_rcu(&napi->napi_hash_node);
4536
4537         spin_unlock(&napi_hash_lock);
4538 }
4539 EXPORT_SYMBOL_GPL(napi_hash_del);
4540
4541 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
4542 {
4543         struct napi_struct *napi;
4544
4545         napi = container_of(timer, struct napi_struct, timer);
4546         if (napi->gro_list)
4547                 napi_schedule(napi);
4548
4549         return HRTIMER_NORESTART;
4550 }
4551
4552 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4553                     int (*poll)(struct napi_struct *, int), int weight)
4554 {
4555         INIT_LIST_HEAD(&napi->poll_list);
4556         hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
4557         napi->timer.function = napi_watchdog;
4558         napi->gro_count = 0;
4559         napi->gro_list = NULL;
4560         napi->skb = NULL;
4561         napi->poll = poll;
4562         if (weight > NAPI_POLL_WEIGHT)
4563                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4564                             weight, dev->name);
4565         napi->weight = weight;
4566         list_add(&napi->dev_list, &dev->napi_list);
4567         napi->dev = dev;
4568 #ifdef CONFIG_NETPOLL
4569         spin_lock_init(&napi->poll_lock);
4570         napi->poll_owner = -1;
4571 #endif
4572         set_bit(NAPI_STATE_SCHED, &napi->state);
4573 }
4574 EXPORT_SYMBOL(netif_napi_add);
4575
4576 void napi_disable(struct napi_struct *n)
4577 {
4578         might_sleep();
4579         set_bit(NAPI_STATE_DISABLE, &n->state);
4580
4581         while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
4582                 msleep(1);
4583
4584         hrtimer_cancel(&n->timer);
4585
4586         clear_bit(NAPI_STATE_DISABLE, &n->state);
4587 }
4588 EXPORT_SYMBOL(napi_disable);
4589
4590 void netif_napi_del(struct napi_struct *napi)
4591 {
4592         list_del_init(&napi->dev_list);
4593         napi_free_frags(napi);
4594
4595         kfree_skb_list(napi->gro_list);
4596         napi->gro_list = NULL;
4597         napi->gro_count = 0;
4598 }
4599 EXPORT_SYMBOL(netif_napi_del);
4600
4601 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
4602 {
4603         void *have;
4604         int work, weight;
4605
4606         list_del_init(&n->poll_list);
4607
4608         have = netpoll_poll_lock(n);
4609
4610         weight = n->weight;
4611
4612         /* This NAPI_STATE_SCHED test is for avoiding a race
4613          * with netpoll's poll_napi().  Only the entity which
4614          * obtains the lock and sees NAPI_STATE_SCHED set will
4615          * actually make the ->poll() call.  Therefore we avoid
4616          * accidentally calling ->poll() when NAPI is not scheduled.
4617          */
4618         work = 0;
4619         if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4620                 work = n->poll(n, weight);
4621                 trace_napi_poll(n);
4622         }
4623
4624         WARN_ON_ONCE(work > weight);
4625
4626         if (likely(work < weight))
4627                 goto out_unlock;
4628
4629         /* Drivers must not modify the NAPI state if they
4630          * consume the entire weight.  In such cases this code
4631          * still "owns" the NAPI instance and therefore can
4632          * move the instance around on the list at-will.
4633          */
4634         if (unlikely(napi_disable_pending(n))) {
4635                 napi_complete(n);
4636                 goto out_unlock;
4637         }
4638
4639         if (n->gro_list) {
4640                 /* flush too old packets
4641                  * If HZ < 1000, flush all packets.
4642                  */
4643                 napi_gro_flush(n, HZ >= 1000);
4644         }
4645
4646         /* Some drivers may have called napi_schedule
4647          * prior to exhausting their budget.
4648          */
4649         if (unlikely(!list_empty(&n->poll_list))) {
4650                 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
4651                              n->dev ? n->dev->name : "backlog");
4652                 goto out_unlock;
4653         }
4654
4655         list_add_tail(&n->poll_list, repoll);
4656
4657 out_unlock:
4658         netpoll_poll_unlock(have);
4659
4660         return work;
4661 }
4662
4663 static void net_rx_action(struct softirq_action *h)
4664 {
4665         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4666         unsigned long time_limit = jiffies + 2;
4667         int budget = netdev_budget;
4668         LIST_HEAD(list);
4669         LIST_HEAD(repoll);
4670
4671         local_irq_disable();
4672         list_splice_init(&sd->poll_list, &list);
4673         local_irq_enable();
4674
4675         for (;;) {
4676                 struct napi_struct *n;
4677
4678                 if (list_empty(&list)) {
4679                         if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
4680                                 return;
4681                         break;
4682                 }
4683
4684                 n = list_first_entry(&list, struct napi_struct, poll_list);
4685                 budget -= napi_poll(n, &repoll);
4686
4687                 /* If softirq window is exhausted then punt.
4688                  * Allow this to run for 2 jiffies since which will allow
4689                  * an average latency of 1.5/HZ.
4690                  */
4691                 if (unlikely(budget <= 0 ||
4692                              time_after_eq(jiffies, time_limit))) {
4693                         sd->time_squeeze++;
4694                         break;
4695                 }
4696         }
4697
4698         local_irq_disable();
4699
4700         list_splice_tail_init(&sd->poll_list, &list);
4701         list_splice_tail(&repoll, &list);
4702         list_splice(&list, &sd->poll_list);
4703         if (!list_empty(&sd->poll_list))
4704                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4705
4706         net_rps_action_and_irq_enable(sd);
4707 }
4708
4709 struct netdev_adjacent {
4710         struct net_device *dev;
4711
4712         /* upper master flag, there can only be one master device per list */
4713         bool master;
4714
4715         /* counter for the number of times this device was added to us */
4716         u16 ref_nr;
4717
4718         /* private field for the users */
4719         void *private;
4720
4721         struct list_head list;
4722         struct rcu_head rcu;
4723 };
4724
4725 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4726                                                  struct net_device *adj_dev,
4727                                                  struct list_head *adj_list)
4728 {
4729         struct netdev_adjacent *adj;
4730
4731         list_for_each_entry(adj, adj_list, list) {
4732                 if (adj->dev == adj_dev)
4733                         return adj;
4734         }
4735         return NULL;
4736 }
4737
4738 /**
4739  * netdev_has_upper_dev - Check if device is linked to an upper device
4740  * @dev: device
4741  * @upper_dev: upper device to check
4742  *
4743  * Find out if a device is linked to specified upper device and return true
4744  * in case it is. Note that this checks only immediate upper device,
4745  * not through a complete stack of devices. The caller must hold the RTNL lock.
4746  */
4747 bool netdev_has_upper_dev(struct net_device *dev,
4748                           struct net_device *upper_dev)
4749 {
4750         ASSERT_RTNL();
4751
4752         return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4753 }
4754 EXPORT_SYMBOL(netdev_has_upper_dev);
4755
4756 /**
4757  * netdev_has_any_upper_dev - Check if device is linked to some device
4758  * @dev: device
4759  *
4760  * Find out if a device is linked to an upper device and return true in case
4761  * it is. The caller must hold the RTNL lock.
4762  */
4763 static bool netdev_has_any_upper_dev(struct net_device *dev)
4764 {
4765         ASSERT_RTNL();
4766
4767         return !list_empty(&dev->all_adj_list.upper);
4768 }
4769
4770 /**
4771  * netdev_master_upper_dev_get - Get master upper device
4772  * @dev: device
4773  *
4774  * Find a master upper device and return pointer to it or NULL in case
4775  * it's not there. The caller must hold the RTNL lock.
4776  */
4777 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4778 {
4779         struct netdev_adjacent *upper;
4780
4781         ASSERT_RTNL();
4782
4783         if (list_empty(&dev->adj_list.upper))
4784                 return NULL;
4785
4786         upper = list_first_entry(&dev->adj_list.upper,
4787                                  struct netdev_adjacent, list);
4788         if (likely(upper->master))
4789                 return upper->dev;
4790         return NULL;
4791 }
4792 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4793
4794 void *netdev_adjacent_get_private(struct list_head *adj_list)
4795 {
4796         struct netdev_adjacent *adj;
4797
4798         adj = list_entry(adj_list, struct netdev_adjacent, list);
4799
4800         return adj->private;
4801 }
4802 EXPORT_SYMBOL(netdev_adjacent_get_private);
4803
4804 /**
4805  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4806  * @dev: device
4807  * @iter: list_head ** of the current position
4808  *
4809  * Gets the next device from the dev's upper list, starting from iter
4810  * position. The caller must hold RCU read lock.
4811  */
4812 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4813                                                  struct list_head **iter)
4814 {
4815         struct netdev_adjacent *upper;
4816
4817         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4818
4819         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4820
4821         if (&upper->list == &dev->adj_list.upper)
4822                 return NULL;
4823
4824         *iter = &upper->list;
4825
4826         return upper->dev;
4827 }
4828 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
4829
4830 /**
4831  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4832  * @dev: device
4833  * @iter: list_head ** of the current position
4834  *
4835  * Gets the next device from the dev's upper list, starting from iter
4836  * position. The caller must hold RCU read lock.
4837  */
4838 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4839                                                      struct list_head **iter)
4840 {
4841         struct netdev_adjacent *upper;
4842
4843         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4844
4845         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4846
4847         if (&upper->list == &dev->all_adj_list.upper)
4848                 return NULL;
4849
4850         *iter = &upper->list;
4851
4852         return upper->dev;
4853 }
4854 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
4855
4856 /**
4857  * netdev_lower_get_next_private - Get the next ->private from the
4858  *                                 lower neighbour list
4859  * @dev: device
4860  * @iter: list_head ** of the current position
4861  *
4862  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4863  * list, starting from iter position. The caller must hold either hold the
4864  * RTNL lock or its own locking that guarantees that the neighbour lower
4865  * list will remain unchainged.
4866  */
4867 void *netdev_lower_get_next_private(struct net_device *dev,
4868                                     struct list_head **iter)
4869 {
4870         struct netdev_adjacent *lower;
4871
4872         lower = list_entry(*iter, struct netdev_adjacent, list);
4873
4874         if (&lower->list == &dev->adj_list.lower)
4875                 return NULL;
4876
4877         *iter = lower->list.next;
4878
4879         return lower->private;
4880 }
4881 EXPORT_SYMBOL(netdev_lower_get_next_private);
4882
4883 /**
4884  * netdev_lower_get_next_private_rcu - Get the next ->private from the
4885  *                                     lower neighbour list, RCU
4886  *                                     variant
4887  * @dev: device
4888  * @iter: list_head ** of the current position
4889  *
4890  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4891  * list, starting from iter position. The caller must hold RCU read lock.
4892  */
4893 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4894                                         struct list_head **iter)
4895 {
4896         struct netdev_adjacent *lower;
4897
4898         WARN_ON_ONCE(!rcu_read_lock_held());
4899
4900         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4901
4902         if (&lower->list == &dev->adj_list.lower)
4903                 return NULL;
4904
4905         *iter = &lower->list;
4906
4907         return lower->private;
4908 }
4909 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4910
4911 /**
4912  * netdev_lower_get_next - Get the next device from the lower neighbour
4913  *                         list
4914  * @dev: device
4915  * @iter: list_head ** of the current position
4916  *
4917  * Gets the next netdev_adjacent from the dev's lower neighbour
4918  * list, starting from iter position. The caller must hold RTNL lock or
4919  * its own locking that guarantees that the neighbour lower
4920  * list will remain unchainged.
4921  */
4922 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
4923 {
4924         struct netdev_adjacent *lower;
4925
4926         lower = list_entry((*iter)->next, struct netdev_adjacent, list);
4927
4928         if (&lower->list == &dev->adj_list.lower)
4929                 return NULL;
4930
4931         *iter = &lower->list;
4932
4933         return lower->dev;
4934 }
4935 EXPORT_SYMBOL(netdev_lower_get_next);
4936
4937 /**
4938  * netdev_lower_get_first_private_rcu - Get the first ->private from the
4939  *                                     lower neighbour list, RCU
4940  *                                     variant
4941  * @dev: device
4942  *
4943  * Gets the first netdev_adjacent->private from the dev's lower neighbour
4944  * list. The caller must hold RCU read lock.
4945  */
4946 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
4947 {
4948         struct netdev_adjacent *lower;
4949
4950         lower = list_first_or_null_rcu(&dev->adj_list.lower,
4951                         struct netdev_adjacent, list);
4952         if (lower)
4953                 return lower->private;
4954         return NULL;
4955 }
4956 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
4957
4958 /**
4959  * netdev_master_upper_dev_get_rcu - Get master upper device
4960  * @dev: device
4961  *
4962  * Find a master upper device and return pointer to it or NULL in case
4963  * it's not there. The caller must hold the RCU read lock.
4964  */
4965 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4966 {
4967         struct netdev_adjacent *upper;
4968
4969         upper = list_first_or_null_rcu(&dev->adj_list.upper,
4970                                        struct netdev_adjacent, list);
4971         if (upper && likely(upper->master))
4972                 return upper->dev;
4973         return NULL;
4974 }
4975 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4976
4977 static int netdev_adjacent_sysfs_add(struct net_device *dev,
4978                               struct net_device *adj_dev,
4979                               struct list_head *dev_list)
4980 {
4981         char linkname[IFNAMSIZ+7];
4982         sprintf(linkname, dev_list == &dev->adj_list.upper ?
4983                 "upper_%s" : "lower_%s", adj_dev->name);
4984         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
4985                                  linkname);
4986 }
4987 static void netdev_adjacent_sysfs_del(struct net_device *dev,
4988                                char *name,
4989                                struct list_head *dev_list)
4990 {
4991         char linkname[IFNAMSIZ+7];
4992         sprintf(linkname, dev_list == &dev->adj_list.upper ?
4993                 "upper_%s" : "lower_%s", name);
4994         sysfs_remove_link(&(dev->dev.kobj), linkname);
4995 }
4996
4997 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
4998                                                  struct net_device *adj_dev,
4999                                                  struct list_head *dev_list)
5000 {
5001         return (dev_list == &dev->adj_list.upper ||
5002                 dev_list == &dev->adj_list.lower) &&
5003                 net_eq(dev_net(dev), dev_net(adj_dev));
5004 }
5005
5006 static int __netdev_adjacent_dev_insert(struct net_device *dev,
5007                                         struct net_device *adj_dev,
5008                                         struct list_head *dev_list,
5009                                         void *private, bool master)
5010 {
5011         struct netdev_adjacent *adj;
5012         int ret;
5013
5014         adj = __netdev_find_adj(dev, adj_dev, dev_list);
5015
5016         if (adj) {
5017                 adj->ref_nr++;
5018                 return 0;
5019         }
5020
5021         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5022         if (!adj)
5023                 return -ENOMEM;
5024
5025         adj->dev = adj_dev;
5026         adj->master = master;
5027         adj->ref_nr = 1;
5028         adj->private = private;
5029         dev_hold(adj_dev);
5030
5031         pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5032                  adj_dev->name, dev->name, adj_dev->name);
5033
5034         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5035                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5036                 if (ret)
5037                         goto free_adj;
5038         }
5039
5040         /* Ensure that master link is always the first item in list. */
5041         if (master) {
5042                 ret = sysfs_create_link(&(dev->dev.kobj),
5043                                         &(adj_dev->dev.kobj), "master");
5044                 if (ret)
5045                         goto remove_symlinks;
5046
5047                 list_add_rcu(&adj->list, dev_list);
5048         } else {
5049                 list_add_tail_rcu(&adj->list, dev_list);
5050         }
5051
5052         return 0;
5053
5054 remove_symlinks:
5055         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5056                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5057 free_adj:
5058         kfree(adj);
5059         dev_put(adj_dev);
5060
5061         return ret;
5062 }
5063
5064 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5065                                          struct net_device *adj_dev,
5066                                          struct list_head *dev_list)
5067 {
5068         struct netdev_adjacent *adj;
5069
5070         adj = __netdev_find_adj(dev, adj_dev, dev_list);
5071
5072         if (!adj) {
5073                 pr_err("tried to remove device %s from %s\n",
5074                        dev->name, adj_dev->name);
5075                 BUG();
5076         }
5077
5078         if (adj->ref_nr > 1) {
5079                 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
5080                          adj->ref_nr-1);
5081                 adj->ref_nr--;
5082                 return;
5083         }
5084
5085         if (adj->master)
5086                 sysfs_remove_link(&(dev->dev.kobj), "master");
5087
5088         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5089                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5090
5091         list_del_rcu(&adj->list);
5092         pr_debug("dev_put for %s, because link removed from %s to %s\n",
5093                  adj_dev->name, dev->name, adj_dev->name);
5094         dev_put(adj_dev);
5095         kfree_rcu(adj, rcu);
5096 }
5097
5098 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5099                                             struct net_device *upper_dev,
5100                                             struct list_head *up_list,
5101                                             struct list_head *down_list,
5102                                             void *private, bool master)
5103 {
5104         int ret;
5105
5106         ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5107                                            master);
5108         if (ret)
5109                 return ret;
5110
5111         ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5112                                            false);
5113         if (ret) {
5114                 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5115                 return ret;
5116         }
5117
5118         return 0;
5119 }
5120
5121 static int __netdev_adjacent_dev_link(struct net_device *dev,
5122                                       struct net_device *upper_dev)
5123 {
5124         return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5125                                                 &dev->all_adj_list.upper,
5126                                                 &upper_dev->all_adj_list.lower,
5127                                                 NULL, false);
5128 }
5129
5130 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5131                                                struct net_device *upper_dev,
5132                                                struct list_head *up_list,
5133                                                struct list_head *down_list)
5134 {
5135         __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5136         __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
5137 }
5138
5139 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5140                                          struct net_device *upper_dev)
5141 {
5142         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5143                                            &dev->all_adj_list.upper,
5144                                            &upper_dev->all_adj_list.lower);
5145 }
5146
5147 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5148                                                 struct net_device *upper_dev,
5149                                                 void *private, bool master)
5150 {
5151         int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5152
5153         if (ret)
5154                 return ret;
5155
5156         ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5157                                                &dev->adj_list.upper,
5158                                                &upper_dev->adj_list.lower,
5159                                                private, master);
5160         if (ret) {
5161                 __netdev_adjacent_dev_unlink(dev, upper_dev);
5162                 return ret;
5163         }
5164
5165         return 0;
5166 }
5167
5168 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5169                                                    struct net_device *upper_dev)
5170 {
5171         __netdev_adjacent_dev_unlink(dev, upper_dev);
5172         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5173                                            &dev->adj_list.upper,
5174                                            &upper_dev->adj_list.lower);
5175 }
5176
5177 static int __netdev_upper_dev_link(struct net_device *dev,
5178                                    struct net_device *upper_dev, bool master,
5179                                    void *private)
5180 {
5181         struct netdev_adjacent *i, *j, *to_i, *to_j;
5182         int ret = 0;
5183
5184         ASSERT_RTNL();
5185
5186         if (dev == upper_dev)
5187                 return -EBUSY;
5188
5189         /* To prevent loops, check if dev is not upper device to upper_dev. */
5190         if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
5191                 return -EBUSY;
5192
5193         if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
5194                 return -EEXIST;
5195
5196         if (master && netdev_master_upper_dev_get(dev))
5197                 return -EBUSY;
5198
5199         ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5200                                                    master);
5201         if (ret)
5202                 return ret;
5203
5204         /* Now that we linked these devs, make all the upper_dev's
5205          * all_adj_list.upper visible to every dev's all_adj_list.lower an
5206          * versa, and don't forget the devices itself. All of these
5207          * links are non-neighbours.
5208          */
5209         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5210                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5211                         pr_debug("Interlinking %s with %s, non-neighbour\n",
5212                                  i->dev->name, j->dev->name);
5213                         ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5214                         if (ret)
5215                                 goto rollback_mesh;
5216                 }
5217         }
5218
5219         /* add dev to every upper_dev's upper device */
5220         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5221                 pr_debug("linking %s's upper device %s with %s\n",
5222                          upper_dev->name, i->dev->name, dev->name);
5223                 ret = __netdev_adjacent_dev_link(dev, i->dev);
5224                 if (ret)
5225                         goto rollback_upper_mesh;
5226         }
5227
5228         /* add upper_dev to every dev's lower device */
5229         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5230                 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5231                          i->dev->name, upper_dev->name);
5232                 ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5233                 if (ret)
5234                         goto rollback_lower_mesh;
5235         }
5236
5237         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5238         return 0;
5239
5240 rollback_lower_mesh:
5241         to_i = i;
5242         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5243                 if (i == to_i)
5244                         break;
5245                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5246         }
5247
5248         i = NULL;
5249
5250 rollback_upper_mesh:
5251         to_i = i;
5252         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5253                 if (i == to_i)
5254                         break;
5255                 __netdev_adjacent_dev_unlink(dev, i->dev);
5256         }
5257
5258         i = j = NULL;
5259
5260 rollback_mesh:
5261         to_i = i;
5262         to_j = j;
5263         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5264                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5265                         if (i == to_i && j == to_j)
5266                                 break;
5267                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5268                 }
5269                 if (i == to_i)
5270                         break;
5271         }
5272
5273         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5274
5275         return ret;
5276 }
5277
5278 /**
5279  * netdev_upper_dev_link - Add a link to the upper device
5280  * @dev: device
5281  * @upper_dev: new upper device
5282  *
5283  * Adds a link to device which is upper to this one. The caller must hold
5284  * the RTNL lock. On a failure a negative errno code is returned.
5285  * On success the reference counts are adjusted and the function
5286  * returns zero.
5287  */
5288 int netdev_upper_dev_link(struct net_device *dev,
5289                           struct net_device *upper_dev)
5290 {
5291         return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5292 }
5293 EXPORT_SYMBOL(netdev_upper_dev_link);
5294
5295 /**
5296  * netdev_master_upper_dev_link - Add a master link to the upper device
5297  * @dev: device
5298  * @upper_dev: new upper device
5299  *
5300  * Adds a link to device which is upper to this one. In this case, only
5301  * one master upper device can be linked, although other non-master devices
5302  * might be linked as well. The caller must hold the RTNL lock.
5303  * On a failure a negative errno code is returned. On success the reference
5304  * counts are adjusted and the function returns zero.
5305  */
5306 int netdev_master_upper_dev_link(struct net_device *dev,
5307                                  struct net_device *upper_dev)
5308 {
5309         return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5310 }
5311 EXPORT_SYMBOL(netdev_master_upper_dev_link);
5312
5313 int netdev_master_upper_dev_link_private(struct net_device *dev,
5314                                          struct net_device *upper_dev,
5315                                          void *private)
5316 {
5317         return __netdev_upper_dev_link(dev, upper_dev, true, private);
5318 }
5319 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5320
5321 /**
5322  * netdev_upper_dev_unlink - Removes a link to upper device
5323  * @dev: device
5324  * @upper_dev: new upper device
5325  *
5326  * Removes a link to device which is upper to this one. The caller must hold
5327  * the RTNL lock.
5328  */
5329 void netdev_upper_dev_unlink(struct net_device *dev,
5330                              struct net_device *upper_dev)
5331 {
5332         struct netdev_adjacent *i, *j;
5333         ASSERT_RTNL();
5334
5335         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5336
5337         /* Here is the tricky part. We must remove all dev's lower
5338          * devices from all upper_dev's upper devices and vice
5339          * versa, to maintain the graph relationship.
5340          */
5341         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5342                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5343                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5344
5345         /* remove also the devices itself from lower/upper device
5346          * list
5347          */
5348         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5349                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5350
5351         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5352                 __netdev_adjacent_dev_unlink(dev, i->dev);
5353
5354         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5355 }
5356 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5357
5358 /**
5359  * netdev_bonding_info_change - Dispatch event about slave change
5360  * @dev: device
5361  * @netdev_bonding_info: info to dispatch
5362  *
5363  * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5364  * The caller must hold the RTNL lock.
5365  */
5366 void netdev_bonding_info_change(struct net_device *dev,
5367                                 struct netdev_bonding_info *bonding_info)
5368 {
5369         struct netdev_notifier_bonding_info     info;
5370
5371         memcpy(&info.bonding_info, bonding_info,
5372                sizeof(struct netdev_bonding_info));
5373         call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5374                                       &info.info);
5375 }
5376 EXPORT_SYMBOL(netdev_bonding_info_change);
5377
5378 void netdev_adjacent_add_links(struct net_device *dev)
5379 {
5380         struct netdev_adjacent *iter;
5381
5382         struct net *net = dev_net(dev);
5383
5384         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5385                 if (!net_eq(net,dev_net(iter->dev)))
5386                         continue;
5387                 netdev_adjacent_sysfs_add(iter->dev, dev,
5388                                           &iter->dev->adj_list.lower);
5389                 netdev_adjacent_sysfs_add(dev, iter->dev,
5390                                           &dev->adj_list.upper);
5391         }
5392
5393         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5394                 if (!net_eq(net,dev_net(iter->dev)))
5395                         continue;
5396                 netdev_adjacent_sysfs_add(iter->dev, dev,
5397                                           &iter->dev->adj_list.upper);
5398                 netdev_adjacent_sysfs_add(dev, iter->dev,
5399                                           &dev->adj_list.lower);
5400         }
5401 }
5402
5403 void netdev_adjacent_del_links(struct net_device *dev)
5404 {
5405         struct netdev_adjacent *iter;
5406
5407         struct net *net = dev_net(dev);
5408
5409         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5410                 if (!net_eq(net,dev_net(iter->dev)))
5411                         continue;
5412                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5413                                           &iter->dev->adj_list.lower);
5414                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5415                                           &dev->adj_list.upper);
5416         }
5417
5418         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5419                 if (!net_eq(net,dev_net(iter->dev)))
5420                         continue;
5421                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5422                                           &iter->dev->adj_list.upper);
5423                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5424                                           &dev->adj_list.lower);
5425         }
5426 }
5427
5428 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5429 {
5430         struct netdev_adjacent *iter;
5431
5432         struct net *net = dev_net(dev);
5433
5434         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5435                 if (!net_eq(net,dev_net(iter->dev)))
5436                         continue;
5437                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5438                                           &iter->dev->adj_list.lower);
5439                 netdev_adjacent_sysfs_add(iter->dev, dev,
5440                                           &iter->dev->adj_list.lower);
5441         }
5442
5443         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5444                 if (!net_eq(net,dev_net(iter->dev)))
5445                         continue;
5446                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5447                                           &iter->dev->adj_list.upper);
5448                 netdev_adjacent_sysfs_add(iter->dev, dev,
5449                                           &iter->dev->adj_list.upper);
5450         }
5451 }
5452
5453 void *netdev_lower_dev_get_private(struct net_device *dev,
5454                                    struct net_device *lower_dev)
5455 {
5456         struct netdev_adjacent *lower;
5457
5458         if (!lower_dev)
5459                 return NULL;
5460         lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5461         if (!lower)
5462                 return NULL;
5463
5464         return lower->private;
5465 }
5466 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5467
5468
5469 int dev_get_nest_level(struct net_device *dev,
5470                        bool (*type_check)(struct net_device *dev))
5471 {
5472         struct net_device *lower = NULL;
5473         struct list_head *iter;
5474         int max_nest = -1;
5475         int nest;
5476
5477         ASSERT_RTNL();
5478
5479         netdev_for_each_lower_dev(dev, lower, iter) {
5480                 nest = dev_get_nest_level(lower, type_check);
5481                 if (max_nest < nest)
5482                         max_nest = nest;
5483         }
5484
5485         if (type_check(dev))
5486                 max_nest++;
5487
5488         return max_nest;
5489 }
5490 EXPORT_SYMBOL(dev_get_nest_level);
5491
5492 static void dev_change_rx_flags(struct net_device *dev, int flags)
5493 {
5494         const struct net_device_ops *ops = dev->netdev_ops;
5495
5496         if (ops->ndo_change_rx_flags)
5497                 ops->ndo_change_rx_flags(dev, flags);
5498 }
5499
5500 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5501 {
5502         unsigned int old_flags = dev->flags;
5503         kuid_t uid;
5504         kgid_t gid;
5505
5506         ASSERT_RTNL();
5507
5508         dev->flags |= IFF_PROMISC;
5509         dev->promiscuity += inc;
5510         if (dev->promiscuity == 0) {
5511                 /*
5512                  * Avoid overflow.
5513                  * If inc causes overflow, untouch promisc and return error.
5514                  */
5515                 if (inc < 0)
5516                         dev->flags &= ~IFF_PROMISC;
5517                 else {
5518                         dev->promiscuity -= inc;
5519                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5520                                 dev->name);
5521                         return -EOVERFLOW;
5522                 }
5523         }
5524         if (dev->flags != old_flags) {
5525                 pr_info("device %s %s promiscuous mode\n",
5526                         dev->name,
5527                         dev->flags & IFF_PROMISC ? "entered" : "left");
5528                 if (audit_enabled) {
5529                         current_uid_gid(&uid, &gid);
5530                         audit_log(current->audit_context, GFP_ATOMIC,
5531                                 AUDIT_ANOM_PROMISCUOUS,
5532                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5533                                 dev->name, (dev->flags & IFF_PROMISC),
5534                                 (old_flags & IFF_PROMISC),
5535                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
5536                                 from_kuid(&init_user_ns, uid),
5537                                 from_kgid(&init_user_ns, gid),
5538                                 audit_get_sessionid(current));
5539                 }
5540
5541                 dev_change_rx_flags(dev, IFF_PROMISC);
5542         }
5543         if (notify)
5544                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
5545         return 0;
5546 }
5547
5548 /**
5549  *      dev_set_promiscuity     - update promiscuity count on a device
5550  *      @dev: device
5551  *      @inc: modifier
5552  *
5553  *      Add or remove promiscuity from a device. While the count in the device
5554  *      remains above zero the interface remains promiscuous. Once it hits zero
5555  *      the device reverts back to normal filtering operation. A negative inc
5556  *      value is used to drop promiscuity on the device.
5557  *      Return 0 if successful or a negative errno code on error.
5558  */
5559 int dev_set_promiscuity(struct net_device *dev, int inc)
5560 {
5561         unsigned int old_flags = dev->flags;
5562         int err;
5563
5564         err = __dev_set_promiscuity(dev, inc, true);
5565         if (err < 0)
5566                 return err;
5567         if (dev->flags != old_flags)
5568                 dev_set_rx_mode(dev);
5569         return err;
5570 }
5571 EXPORT_SYMBOL(dev_set_promiscuity);
5572
5573 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5574 {
5575         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5576
5577         ASSERT_RTNL();
5578
5579         dev->flags |= IFF_ALLMULTI;
5580         dev->allmulti += inc;
5581         if (dev->allmulti == 0) {
5582                 /*
5583                  * Avoid overflow.
5584                  * If inc causes overflow, untouch allmulti and return error.
5585                  */
5586                 if (inc < 0)
5587                         dev->flags &= ~IFF_ALLMULTI;
5588                 else {
5589                         dev->allmulti -= inc;
5590                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5591                                 dev->name);
5592                         return -EOVERFLOW;
5593                 }
5594         }
5595         if (dev->flags ^ old_flags) {
5596                 dev_change_rx_flags(dev, IFF_ALLMULTI);
5597                 dev_set_rx_mode(dev);
5598                 if (notify)
5599                         __dev_notify_flags(dev, old_flags,
5600                                            dev->gflags ^ old_gflags);
5601         }
5602         return 0;
5603 }
5604
5605 /**
5606  *      dev_set_allmulti        - update allmulti count on a device
5607  *      @dev: device
5608  *      @inc: modifier
5609  *
5610  *      Add or remove reception of all multicast frames to a device. While the
5611  *      count in the device remains above zero the interface remains listening
5612  *      to all interfaces. Once it hits zero the device reverts back to normal
5613  *      filtering operation. A negative @inc value is used to drop the counter
5614  *      when releasing a resource needing all multicasts.
5615  *      Return 0 if successful or a negative errno code on error.
5616  */
5617
5618 int dev_set_allmulti(struct net_device *dev, int inc)
5619 {
5620         return __dev_set_allmulti(dev, inc, true);
5621 }
5622 EXPORT_SYMBOL(dev_set_allmulti);
5623
5624 /*
5625  *      Upload unicast and multicast address lists to device and
5626  *      configure RX filtering. When the device doesn't support unicast
5627  *      filtering it is put in promiscuous mode while unicast addresses
5628  *      are present.
5629  */
5630 void __dev_set_rx_mode(struct net_device *dev)
5631 {
5632         const struct net_device_ops *ops = dev->netdev_ops;
5633
5634         /* dev_open will call this function so the list will stay sane. */
5635         if (!(dev->flags&IFF_UP))
5636                 return;
5637
5638         if (!netif_device_present(dev))
5639                 return;
5640
5641         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5642                 /* Unicast addresses changes may only happen under the rtnl,
5643                  * therefore calling __dev_set_promiscuity here is safe.
5644                  */
5645                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5646                         __dev_set_promiscuity(dev, 1, false);
5647                         dev->uc_promisc = true;
5648                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5649                         __dev_set_promiscuity(dev, -1, false);
5650                         dev->uc_promisc = false;
5651                 }
5652         }
5653
5654         if (ops->ndo_set_rx_mode)
5655                 ops->ndo_set_rx_mode(dev);
5656 }
5657
5658 void dev_set_rx_mode(struct net_device *dev)
5659 {
5660         netif_addr_lock_bh(dev);
5661         __dev_set_rx_mode(dev);
5662         netif_addr_unlock_bh(dev);
5663 }
5664
5665 /**
5666  *      dev_get_flags - get flags reported to userspace
5667  *      @dev: device
5668  *
5669  *      Get the combination of flag bits exported through APIs to userspace.
5670  */
5671 unsigned int dev_get_flags(const struct net_device *dev)
5672 {
5673         unsigned int flags;
5674
5675         flags = (dev->flags & ~(IFF_PROMISC |
5676                                 IFF_ALLMULTI |
5677                                 IFF_RUNNING |
5678                                 IFF_LOWER_UP |
5679                                 IFF_DORMANT)) |
5680                 (dev->gflags & (IFF_PROMISC |
5681                                 IFF_ALLMULTI));
5682
5683         if (netif_running(dev)) {
5684                 if (netif_oper_up(dev))
5685                         flags |= IFF_RUNNING;
5686                 if (netif_carrier_ok(dev))
5687                         flags |= IFF_LOWER_UP;
5688                 if (netif_dormant(dev))
5689                         flags |= IFF_DORMANT;
5690         }
5691
5692         return flags;
5693 }
5694 EXPORT_SYMBOL(dev_get_flags);
5695
5696 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5697 {
5698         unsigned int old_flags = dev->flags;
5699         int ret;
5700
5701         ASSERT_RTNL();
5702
5703         /*
5704          *      Set the flags on our device.
5705          */
5706
5707         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5708                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5709                                IFF_AUTOMEDIA)) |
5710                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5711                                     IFF_ALLMULTI));
5712
5713         /*
5714          *      Load in the correct multicast list now the flags have changed.
5715          */
5716
5717         if ((old_flags ^ flags) & IFF_MULTICAST)
5718                 dev_change_rx_flags(dev, IFF_MULTICAST);
5719
5720         dev_set_rx_mode(dev);
5721
5722         /*
5723          *      Have we downed the interface. We handle IFF_UP ourselves
5724          *      according to user attempts to set it, rather than blindly
5725          *      setting it.
5726          */
5727
5728         ret = 0;
5729         if ((old_flags ^ flags) & IFF_UP)
5730                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5731
5732         if ((flags ^ dev->gflags) & IFF_PROMISC) {
5733                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
5734                 unsigned int old_flags = dev->flags;
5735
5736                 dev->gflags ^= IFF_PROMISC;
5737
5738                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
5739                         if (dev->flags != old_flags)
5740                                 dev_set_rx_mode(dev);
5741         }
5742
5743         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5744            is important. Some (broken) drivers set IFF_PROMISC, when
5745            IFF_ALLMULTI is requested not asking us and not reporting.
5746          */
5747         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5748                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5749
5750                 dev->gflags ^= IFF_ALLMULTI;
5751                 __dev_set_allmulti(dev, inc, false);
5752         }
5753
5754         return ret;
5755 }
5756
5757 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5758                         unsigned int gchanges)
5759 {
5760         unsigned int changes = dev->flags ^ old_flags;
5761
5762         if (gchanges)
5763                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5764
5765         if (changes & IFF_UP) {
5766                 if (dev->flags & IFF_UP)
5767                         call_netdevice_notifiers(NETDEV_UP, dev);
5768                 else
5769                         call_netdevice_notifiers(NETDEV_DOWN, dev);
5770         }
5771
5772         if (dev->flags & IFF_UP &&
5773             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5774                 struct netdev_notifier_change_info change_info;
5775
5776                 change_info.flags_changed = changes;
5777                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5778                                               &change_info.info);
5779         }
5780 }
5781
5782 /**
5783  *      dev_change_flags - change device settings
5784  *      @dev: device
5785  *      @flags: device state flags
5786  *
5787  *      Change settings on device based state flags. The flags are
5788  *      in the userspace exported format.
5789  */
5790 int dev_change_flags(struct net_device *dev, unsigned int flags)
5791 {
5792         int ret;
5793         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5794
5795         ret = __dev_change_flags(dev, flags);
5796         if (ret < 0)
5797                 return ret;
5798
5799         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5800         __dev_notify_flags(dev, old_flags, changes);
5801         return ret;
5802 }
5803 EXPORT_SYMBOL(dev_change_flags);
5804
5805 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5806 {
5807         const struct net_device_ops *ops = dev->netdev_ops;
5808
5809         if (ops->ndo_change_mtu)
5810                 return ops->ndo_change_mtu(dev, new_mtu);
5811
5812         dev->mtu = new_mtu;
5813         return 0;
5814 }
5815
5816 /**
5817  *      dev_set_mtu - Change maximum transfer unit
5818  *      @dev: device
5819  *      @new_mtu: new transfer unit
5820  *
5821  *      Change the maximum transfer size of the network device.
5822  */
5823 int dev_set_mtu(struct net_device *dev, int new_mtu)
5824 {
5825         int err, orig_mtu;
5826
5827         if (new_mtu == dev->mtu)
5828                 return 0;
5829
5830         /*      MTU must be positive.    */
5831         if (new_mtu < 0)
5832                 return -EINVAL;
5833
5834         if (!netif_device_present(dev))
5835                 return -ENODEV;
5836
5837         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5838         err = notifier_to_errno(err);
5839         if (err)
5840                 return err;
5841
5842         orig_mtu = dev->mtu;
5843         err = __dev_set_mtu(dev, new_mtu);
5844
5845         if (!err) {
5846                 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5847                 err = notifier_to_errno(err);
5848                 if (err) {
5849                         /* setting mtu back and notifying everyone again,
5850                          * so that they have a chance to revert changes.
5851                          */
5852                         __dev_set_mtu(dev, orig_mtu);
5853                         call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5854                 }
5855         }
5856         return err;
5857 }
5858 EXPORT_SYMBOL(dev_set_mtu);
5859
5860 /**
5861  *      dev_set_group - Change group this device belongs to
5862  *      @dev: device
5863  *      @new_group: group this device should belong to
5864  */
5865 void dev_set_group(struct net_device *dev, int new_group)
5866 {
5867         dev->group = new_group;
5868 }
5869 EXPORT_SYMBOL(dev_set_group);
5870
5871 /**
5872  *      dev_set_mac_address - Change Media Access Control Address
5873  *      @dev: device
5874  *      @sa: new address
5875  *
5876  *      Change the hardware (MAC) address of the device
5877  */
5878 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5879 {
5880         const struct net_device_ops *ops = dev->netdev_ops;
5881         int err;
5882
5883         if (!ops->ndo_set_mac_address)
5884                 return -EOPNOTSUPP;
5885         if (sa->sa_family != dev->type)
5886                 return -EINVAL;
5887         if (!netif_device_present(dev))
5888                 return -ENODEV;
5889         err = ops->ndo_set_mac_address(dev, sa);
5890         if (err)
5891                 return err;
5892         dev->addr_assign_type = NET_ADDR_SET;
5893         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5894         add_device_randomness(dev->dev_addr, dev->addr_len);
5895         return 0;
5896 }
5897 EXPORT_SYMBOL(dev_set_mac_address);
5898
5899 /**
5900  *      dev_change_carrier - Change device carrier
5901  *      @dev: device
5902  *      @new_carrier: new value
5903  *
5904  *      Change device carrier
5905  */
5906 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5907 {
5908         const struct net_device_ops *ops = dev->netdev_ops;
5909
5910         if (!ops->ndo_change_carrier)
5911                 return -EOPNOTSUPP;
5912         if (!netif_device_present(dev))
5913                 return -ENODEV;
5914         return ops->ndo_change_carrier(dev, new_carrier);
5915 }
5916 EXPORT_SYMBOL(dev_change_carrier);
5917
5918 /**
5919  *      dev_get_phys_port_id - Get device physical port ID
5920  *      @dev: device
5921  *      @ppid: port ID
5922  *
5923  *      Get device physical port ID
5924  */
5925 int dev_get_phys_port_id(struct net_device *dev,
5926                          struct netdev_phys_item_id *ppid)
5927 {
5928         const struct net_device_ops *ops = dev->netdev_ops;
5929
5930         if (!ops->ndo_get_phys_port_id)
5931                 return -EOPNOTSUPP;
5932         return ops->ndo_get_phys_port_id(dev, ppid);
5933 }
5934 EXPORT_SYMBOL(dev_get_phys_port_id);
5935
5936 /**
5937  *      dev_new_index   -       allocate an ifindex
5938  *      @net: the applicable net namespace
5939  *
5940  *      Returns a suitable unique value for a new device interface
5941  *      number.  The caller must hold the rtnl semaphore or the
5942  *      dev_base_lock to be sure it remains unique.
5943  */
5944 static int dev_new_index(struct net *net)
5945 {
5946         int ifindex = net->ifindex;
5947         for (;;) {
5948                 if (++ifindex <= 0)
5949                         ifindex = 1;
5950                 if (!__dev_get_by_index(net, ifindex))
5951                         return net->ifindex = ifindex;
5952         }
5953 }
5954
5955 /* Delayed registration/unregisteration */
5956 static LIST_HEAD(net_todo_list);
5957 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
5958
5959 static void net_set_todo(struct net_device *dev)
5960 {
5961         list_add_tail(&dev->todo_list, &net_todo_list);
5962         dev_net(dev)->dev_unreg_count++;
5963 }
5964
5965 static void rollback_registered_many(struct list_head *head)
5966 {
5967         struct net_device *dev, *tmp;
5968         LIST_HEAD(close_head);
5969
5970         BUG_ON(dev_boot_phase);
5971         ASSERT_RTNL();
5972
5973         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5974                 /* Some devices call without registering
5975                  * for initialization unwind. Remove those
5976                  * devices and proceed with the remaining.
5977                  */
5978                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5979                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5980                                  dev->name, dev);
5981
5982                         WARN_ON(1);
5983                         list_del(&dev->unreg_list);
5984                         continue;
5985                 }
5986                 dev->dismantle = true;
5987                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5988         }
5989
5990         /* If device is running, close it first. */
5991         list_for_each_entry(dev, head, unreg_list)
5992                 list_add_tail(&dev->close_list, &close_head);
5993         dev_close_many(&close_head);
5994
5995         list_for_each_entry(dev, head, unreg_list) {
5996                 /* And unlink it from device chain. */
5997                 unlist_netdevice(dev);
5998
5999                 dev->reg_state = NETREG_UNREGISTERING;
6000         }
6001
6002         synchronize_net();
6003
6004         list_for_each_entry(dev, head, unreg_list) {
6005                 struct sk_buff *skb = NULL;
6006
6007                 /* Shutdown queueing discipline. */
6008                 dev_shutdown(dev);
6009
6010
6011                 /* Notify protocols, that we are about to destroy
6012                    this device. They should clean all the things.
6013                 */
6014                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6015
6016                 if (!dev->rtnl_link_ops ||
6017                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6018                         skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6019                                                      GFP_KERNEL);
6020
6021                 /*
6022                  *      Flush the unicast and multicast chains
6023                  */
6024                 dev_uc_flush(dev);
6025                 dev_mc_flush(dev);
6026
6027                 if (dev->netdev_ops->ndo_uninit)
6028                         dev->netdev_ops->ndo_uninit(dev);
6029
6030                 if (skb)
6031                         rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6032
6033                 /* Notifier chain MUST detach us all upper devices. */
6034                 WARN_ON(netdev_has_any_upper_dev(dev));
6035
6036                 /* Remove entries from kobject tree */
6037                 netdev_unregister_kobject(dev);
6038 #ifdef CONFIG_XPS
6039                 /* Remove XPS queueing entries */
6040                 netif_reset_xps_queues_gt(dev, 0);
6041 #endif
6042         }
6043
6044         synchronize_net();
6045
6046         list_for_each_entry(dev, head, unreg_list)
6047                 dev_put(dev);
6048 }
6049
6050 static void rollback_registered(struct net_device *dev)
6051 {
6052         LIST_HEAD(single);
6053
6054         list_add(&dev->unreg_list, &single);
6055         rollback_registered_many(&single);
6056         list_del(&single);
6057 }
6058
6059 static netdev_features_t netdev_fix_features(struct net_device *dev,
6060         netdev_features_t features)
6061 {
6062         /* Fix illegal checksum combinations */
6063         if ((features & NETIF_F_HW_CSUM) &&
6064             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6065                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6066                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6067         }
6068
6069         /* TSO requires that SG is present as well. */
6070         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6071                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6072                 features &= ~NETIF_F_ALL_TSO;
6073         }
6074
6075         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6076                                         !(features & NETIF_F_IP_CSUM)) {
6077                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6078                 features &= ~NETIF_F_TSO;
6079                 features &= ~NETIF_F_TSO_ECN;
6080         }
6081
6082         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6083                                          !(features & NETIF_F_IPV6_CSUM)) {
6084                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6085                 features &= ~NETIF_F_TSO6;
6086         }
6087
6088         /* TSO ECN requires that TSO is present as well. */
6089         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6090                 features &= ~NETIF_F_TSO_ECN;
6091
6092         /* Software GSO depends on SG. */
6093         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6094                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6095                 features &= ~NETIF_F_GSO;
6096         }
6097
6098         /* UFO needs SG and checksumming */
6099         if (features & NETIF_F_UFO) {
6100                 /* maybe split UFO into V4 and V6? */
6101                 if (!((features & NETIF_F_GEN_CSUM) ||
6102                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
6103                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6104                         netdev_dbg(dev,
6105                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
6106                         features &= ~NETIF_F_UFO;
6107                 }
6108
6109                 if (!(features & NETIF_F_SG)) {
6110                         netdev_dbg(dev,
6111                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6112                         features &= ~NETIF_F_UFO;
6113                 }
6114         }
6115
6116 #ifdef CONFIG_NET_RX_BUSY_POLL
6117         if (dev->netdev_ops->ndo_busy_poll)
6118                 features |= NETIF_F_BUSY_POLL;
6119         else
6120 #endif
6121                 features &= ~NETIF_F_BUSY_POLL;
6122
6123         return features;
6124 }
6125
6126 int __netdev_update_features(struct net_device *dev)
6127 {
6128         netdev_features_t features;
6129         int err = 0;
6130
6131         ASSERT_RTNL();
6132
6133         features = netdev_get_wanted_features(dev);
6134
6135         if (dev->netdev_ops->ndo_fix_features)
6136                 features = dev->netdev_ops->ndo_fix_features(dev, features);
6137
6138         /* driver might be less strict about feature dependencies */
6139         features = netdev_fix_features(dev, features);
6140
6141         if (dev->features == features)
6142                 return 0;
6143
6144         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6145                 &dev->features, &features);
6146
6147         if (dev->netdev_ops->ndo_set_features)
6148                 err = dev->netdev_ops->ndo_set_features(dev, features);
6149
6150         if (unlikely(err < 0)) {
6151                 netdev_err(dev,
6152                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
6153                         err, &features, &dev->features);
6154                 return -1;
6155         }
6156
6157         if (!err)
6158                 dev->features = features;
6159
6160         return 1;
6161 }
6162
6163 /**
6164  *      netdev_update_features - recalculate device features
6165  *      @dev: the device to check
6166  *
6167  *      Recalculate dev->features set and send notifications if it
6168  *      has changed. Should be called after driver or hardware dependent
6169  *      conditions might have changed that influence the features.
6170  */
6171 void netdev_update_features(struct net_device *dev)
6172 {
6173         if (__netdev_update_features(dev))
6174                 netdev_features_change(dev);
6175 }
6176 EXPORT_SYMBOL(netdev_update_features);
6177
6178 /**
6179  *      netdev_change_features - recalculate device features
6180  *      @dev: the device to check
6181  *
6182  *      Recalculate dev->features set and send notifications even
6183  *      if they have not changed. Should be called instead of
6184  *      netdev_update_features() if also dev->vlan_features might
6185  *      have changed to allow the changes to be propagated to stacked
6186  *      VLAN devices.
6187  */
6188 void netdev_change_features(struct net_device *dev)
6189 {
6190         __netdev_update_features(dev);
6191         netdev_features_change(dev);
6192 }
6193 EXPORT_SYMBOL(netdev_change_features);
6194
6195 /**
6196  *      netif_stacked_transfer_operstate -      transfer operstate
6197  *      @rootdev: the root or lower level device to transfer state from
6198  *      @dev: the device to transfer operstate to
6199  *
6200  *      Transfer operational state from root to device. This is normally
6201  *      called when a stacking relationship exists between the root
6202  *      device and the device(a leaf device).
6203  */
6204 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6205                                         struct net_device *dev)
6206 {
6207         if (rootdev->operstate == IF_OPER_DORMANT)
6208                 netif_dormant_on(dev);
6209         else
6210                 netif_dormant_off(dev);
6211
6212         if (netif_carrier_ok(rootdev)) {
6213                 if (!netif_carrier_ok(dev))
6214                         netif_carrier_on(dev);
6215         } else {
6216                 if (netif_carrier_ok(dev))
6217                         netif_carrier_off(dev);
6218         }
6219 }
6220 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6221
6222 #ifdef CONFIG_SYSFS
6223 static int netif_alloc_rx_queues(struct net_device *dev)
6224 {
6225         unsigned int i, count = dev->num_rx_queues;
6226         struct netdev_rx_queue *rx;
6227         size_t sz = count * sizeof(*rx);
6228
6229         BUG_ON(count < 1);
6230
6231         rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6232         if (!rx) {
6233                 rx = vzalloc(sz);
6234                 if (!rx)
6235                         return -ENOMEM;
6236         }
6237         dev->_rx = rx;
6238
6239         for (i = 0; i < count; i++)
6240                 rx[i].dev = dev;
6241         return 0;
6242 }
6243 #endif
6244
6245 static void netdev_init_one_queue(struct net_device *dev,
6246                                   struct netdev_queue *queue, void *_unused)
6247 {
6248         /* Initialize queue lock */
6249         spin_lock_init(&queue->_xmit_lock);
6250         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6251         queue->xmit_lock_owner = -1;
6252         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6253         queue->dev = dev;
6254 #ifdef CONFIG_BQL
6255         dql_init(&queue->dql, HZ);
6256 #endif
6257 }
6258
6259 static void netif_free_tx_queues(struct net_device *dev)
6260 {
6261         kvfree(dev->_tx);
6262 }
6263
6264 static int netif_alloc_netdev_queues(struct net_device *dev)
6265 {
6266         unsigned int count = dev->num_tx_queues;
6267         struct netdev_queue *tx;
6268         size_t sz = count * sizeof(*tx);
6269
6270         BUG_ON(count < 1 || count > 0xffff);
6271
6272         tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6273         if (!tx) {
6274                 tx = vzalloc(sz);
6275                 if (!tx)
6276                         return -ENOMEM;
6277         }
6278         dev->_tx = tx;
6279
6280         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6281         spin_lock_init(&dev->tx_global_lock);
6282
6283         return 0;
6284 }
6285
6286 /**
6287  *      register_netdevice      - register a network device
6288  *      @dev: device to register
6289  *
6290  *      Take a completed network device structure and add it to the kernel
6291  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6292  *      chain. 0 is returned on success. A negative errno code is returned
6293  *      on a failure to set up the device, or if the name is a duplicate.
6294  *
6295  *      Callers must hold the rtnl semaphore. You may want
6296  *      register_netdev() instead of this.
6297  *
6298  *      BUGS:
6299  *      The locking appears insufficient to guarantee two parallel registers
6300  *      will not get the same name.
6301  */
6302
6303 int register_netdevice(struct net_device *dev)
6304 {
6305         int ret;
6306         struct net *net = dev_net(dev);
6307
6308         BUG_ON(dev_boot_phase);
6309         ASSERT_RTNL();
6310
6311         might_sleep();
6312
6313         /* When net_device's are persistent, this will be fatal. */
6314         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6315         BUG_ON(!net);
6316
6317         spin_lock_init(&dev->addr_list_lock);
6318         netdev_set_addr_lockdep_class(dev);
6319
6320         dev->iflink = -1;
6321
6322         ret = dev_get_valid_name(net, dev, dev->name);
6323         if (ret < 0)
6324                 goto out;
6325
6326         /* Init, if this function is available */
6327         if (dev->netdev_ops->ndo_init) {
6328                 ret = dev->netdev_ops->ndo_init(dev);
6329                 if (ret) {
6330                         if (ret > 0)
6331                                 ret = -EIO;
6332                         goto out;
6333                 }
6334         }
6335
6336         if (((dev->hw_features | dev->features) &
6337              NETIF_F_HW_VLAN_CTAG_FILTER) &&
6338             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6339              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6340                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6341                 ret = -EINVAL;
6342                 goto err_uninit;
6343         }
6344
6345         ret = -EBUSY;
6346         if (!dev->ifindex)
6347                 dev->ifindex = dev_new_index(net);
6348         else if (__dev_get_by_index(net, dev->ifindex))
6349                 goto err_uninit;
6350
6351         if (dev->iflink == -1)
6352                 dev->iflink = dev->ifindex;
6353
6354         /* Transfer changeable features to wanted_features and enable
6355          * software offloads (GSO and GRO).
6356          */
6357         dev->hw_features |= NETIF_F_SOFT_FEATURES;
6358         dev->features |= NETIF_F_SOFT_FEATURES;
6359         dev->wanted_features = dev->features & dev->hw_features;
6360
6361         if (!(dev->flags & IFF_LOOPBACK)) {
6362                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
6363         }
6364
6365         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6366          */
6367         dev->vlan_features |= NETIF_F_HIGHDMA;
6368
6369         /* Make NETIF_F_SG inheritable to tunnel devices.
6370          */
6371         dev->hw_enc_features |= NETIF_F_SG;
6372
6373         /* Make NETIF_F_SG inheritable to MPLS.
6374          */
6375         dev->mpls_features |= NETIF_F_SG;
6376
6377         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6378         ret = notifier_to_errno(ret);
6379         if (ret)
6380                 goto err_uninit;
6381
6382         ret = netdev_register_kobject(dev);
6383         if (ret)
6384                 goto err_uninit;
6385         dev->reg_state = NETREG_REGISTERED;
6386
6387         __netdev_update_features(dev);
6388
6389         /*
6390          *      Default initial state at registry is that the
6391          *      device is present.
6392          */
6393
6394         set_bit(__LINK_STATE_PRESENT, &dev->state);
6395
6396         linkwatch_init_dev(dev);
6397
6398         dev_init_scheduler(dev);
6399         dev_hold(dev);
6400         list_netdevice(dev);
6401         add_device_randomness(dev->dev_addr, dev->addr_len);
6402
6403         /* If the device has permanent device address, driver should
6404          * set dev_addr and also addr_assign_type should be set to
6405          * NET_ADDR_PERM (default value).
6406          */
6407         if (dev->addr_assign_type == NET_ADDR_PERM)
6408                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6409
6410         /* Notify protocols, that a new device appeared. */
6411         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6412         ret = notifier_to_errno(ret);
6413         if (ret) {
6414                 rollback_registered(dev);
6415                 dev->reg_state = NETREG_UNREGISTERED;
6416         }
6417         /*
6418          *      Prevent userspace races by waiting until the network
6419          *      device is fully setup before sending notifications.
6420          */
6421         if (!dev->rtnl_link_ops ||
6422             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6423                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6424
6425 out:
6426         return ret;
6427
6428 err_uninit:
6429         if (dev->netdev_ops->ndo_uninit)
6430                 dev->netdev_ops->ndo_uninit(dev);
6431         goto out;
6432 }
6433 EXPORT_SYMBOL(register_netdevice);
6434
6435 /**
6436  *      init_dummy_netdev       - init a dummy network device for NAPI
6437  *      @dev: device to init
6438  *
6439  *      This takes a network device structure and initialize the minimum
6440  *      amount of fields so it can be used to schedule NAPI polls without
6441  *      registering a full blown interface. This is to be used by drivers
6442  *      that need to tie several hardware interfaces to a single NAPI
6443  *      poll scheduler due to HW limitations.
6444  */
6445 int init_dummy_netdev(struct net_device *dev)
6446 {
6447         /* Clear everything. Note we don't initialize spinlocks
6448          * are they aren't supposed to be taken by any of the
6449          * NAPI code and this dummy netdev is supposed to be
6450          * only ever used for NAPI polls
6451          */
6452         memset(dev, 0, sizeof(struct net_device));
6453
6454         /* make sure we BUG if trying to hit standard
6455          * register/unregister code path
6456          */
6457         dev->reg_state = NETREG_DUMMY;
6458
6459         /* NAPI wants this */
6460         INIT_LIST_HEAD(&dev->napi_list);
6461
6462         /* a dummy interface is started by default */
6463         set_bit(__LINK_STATE_PRESENT, &dev->state);
6464         set_bit(__LINK_STATE_START, &dev->state);
6465
6466         /* Note : We dont allocate pcpu_refcnt for dummy devices,
6467          * because users of this 'device' dont need to change
6468          * its refcount.
6469          */
6470
6471         return 0;
6472 }
6473 EXPORT_SYMBOL_GPL(init_dummy_netdev);
6474
6475
6476 /**
6477  *      register_netdev - register a network device
6478  *      @dev: device to register
6479  *
6480  *      Take a completed network device structure and add it to the kernel
6481  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6482  *      chain. 0 is returned on success. A negative errno code is returned
6483  *      on a failure to set up the device, or if the name is a duplicate.
6484  *
6485  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
6486  *      and expands the device name if you passed a format string to
6487  *      alloc_netdev.
6488  */
6489 int register_netdev(struct net_device *dev)
6490 {
6491         int err;
6492
6493         rtnl_lock();
6494         err = register_netdevice(dev);
6495         rtnl_unlock();
6496         return err;
6497 }
6498 EXPORT_SYMBOL(register_netdev);
6499
6500 int netdev_refcnt_read(const struct net_device *dev)
6501 {
6502         int i, refcnt = 0;
6503
6504         for_each_possible_cpu(i)
6505                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6506         return refcnt;
6507 }
6508 EXPORT_SYMBOL(netdev_refcnt_read);
6509
6510 /**
6511  * netdev_wait_allrefs - wait until all references are gone.
6512  * @dev: target net_device
6513  *
6514  * This is called when unregistering network devices.
6515  *
6516  * Any protocol or device that holds a reference should register
6517  * for netdevice notification, and cleanup and put back the
6518  * reference if they receive an UNREGISTER event.
6519  * We can get stuck here if buggy protocols don't correctly
6520  * call dev_put.
6521  */
6522 static void netdev_wait_allrefs(struct net_device *dev)
6523 {
6524         unsigned long rebroadcast_time, warning_time;
6525         int refcnt;
6526
6527         linkwatch_forget_dev(dev);
6528
6529         rebroadcast_time = warning_time = jiffies;
6530         refcnt = netdev_refcnt_read(dev);
6531
6532         while (refcnt != 0) {
6533                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6534                         rtnl_lock();
6535
6536                         /* Rebroadcast unregister notification */
6537                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6538
6539                         __rtnl_unlock();
6540                         rcu_barrier();
6541                         rtnl_lock();
6542
6543                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6544                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6545                                      &dev->state)) {
6546                                 /* We must not have linkwatch events
6547                                  * pending on unregister. If this
6548                                  * happens, we simply run the queue
6549                                  * unscheduled, resulting in a noop
6550                                  * for this device.
6551                                  */
6552                                 linkwatch_run_queue();
6553                         }
6554
6555                         __rtnl_unlock();
6556
6557                         rebroadcast_time = jiffies;
6558                 }
6559
6560                 msleep(250);
6561
6562                 refcnt = netdev_refcnt_read(dev);
6563
6564                 if (time_after(jiffies, warning_time + 10 * HZ)) {
6565                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6566                                  dev->name, refcnt);
6567                         warning_time = jiffies;
6568                 }
6569         }
6570 }
6571
6572 /* The sequence is:
6573  *
6574  *      rtnl_lock();
6575  *      ...
6576  *      register_netdevice(x1);
6577  *      register_netdevice(x2);
6578  *      ...
6579  *      unregister_netdevice(y1);
6580  *      unregister_netdevice(y2);
6581  *      ...
6582  *      rtnl_unlock();
6583  *      free_netdev(y1);
6584  *      free_netdev(y2);
6585  *
6586  * We are invoked by rtnl_unlock().
6587  * This allows us to deal with problems:
6588  * 1) We can delete sysfs objects which invoke hotplug
6589  *    without deadlocking with linkwatch via keventd.
6590  * 2) Since we run with the RTNL semaphore not held, we can sleep
6591  *    safely in order to wait for the netdev refcnt to drop to zero.
6592  *
6593  * We must not return until all unregister events added during
6594  * the interval the lock was held have been completed.
6595  */
6596 void netdev_run_todo(void)
6597 {
6598         struct list_head list;
6599
6600         /* Snapshot list, allow later requests */
6601         list_replace_init(&net_todo_list, &list);
6602
6603         __rtnl_unlock();
6604
6605
6606         /* Wait for rcu callbacks to finish before next phase */
6607         if (!list_empty(&list))
6608                 rcu_barrier();
6609
6610         while (!list_empty(&list)) {
6611                 struct net_device *dev
6612                         = list_first_entry(&list, struct net_device, todo_list);
6613                 list_del(&dev->todo_list);
6614
6615                 rtnl_lock();
6616                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6617                 __rtnl_unlock();
6618
6619                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6620                         pr_err("network todo '%s' but state %d\n",
6621                                dev->name, dev->reg_state);
6622                         dump_stack();
6623                         continue;
6624                 }
6625
6626                 dev->reg_state = NETREG_UNREGISTERED;
6627
6628                 on_each_cpu(flush_backlog, dev, 1);
6629
6630                 netdev_wait_allrefs(dev);
6631
6632                 /* paranoia */
6633                 BUG_ON(netdev_refcnt_read(dev));
6634                 BUG_ON(!list_empty(&dev->ptype_all));
6635                 BUG_ON(!list_empty(&dev->ptype_specific));
6636                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6637                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6638                 WARN_ON(dev->dn_ptr);
6639
6640                 if (dev->destructor)
6641                         dev->destructor(dev);
6642
6643                 /* Report a network device has been unregistered */
6644                 rtnl_lock();
6645                 dev_net(dev)->dev_unreg_count--;
6646                 __rtnl_unlock();
6647                 wake_up(&netdev_unregistering_wq);
6648
6649                 /* Free network device */
6650                 kobject_put(&dev->dev.kobj);
6651         }
6652 }
6653
6654 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6655  * fields in the same order, with only the type differing.
6656  */
6657 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6658                              const struct net_device_stats *netdev_stats)
6659 {
6660 #if BITS_PER_LONG == 64
6661         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6662         memcpy(stats64, netdev_stats, sizeof(*stats64));
6663 #else
6664         size_t i, n = sizeof(*stats64) / sizeof(u64);
6665         const unsigned long *src = (const unsigned long *)netdev_stats;
6666         u64 *dst = (u64 *)stats64;
6667
6668         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6669                      sizeof(*stats64) / sizeof(u64));
6670         for (i = 0; i < n; i++)
6671                 dst[i] = src[i];
6672 #endif
6673 }
6674 EXPORT_SYMBOL(netdev_stats_to_stats64);
6675
6676 /**
6677  *      dev_get_stats   - get network device statistics
6678  *      @dev: device to get statistics from
6679  *      @storage: place to store stats
6680  *
6681  *      Get network statistics from device. Return @storage.
6682  *      The device driver may provide its own method by setting
6683  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6684  *      otherwise the internal statistics structure is used.
6685  */
6686 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6687                                         struct rtnl_link_stats64 *storage)
6688 {
6689         const struct net_device_ops *ops = dev->netdev_ops;
6690
6691         if (ops->ndo_get_stats64) {
6692                 memset(storage, 0, sizeof(*storage));
6693                 ops->ndo_get_stats64(dev, storage);
6694         } else if (ops->ndo_get_stats) {
6695                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6696         } else {
6697                 netdev_stats_to_stats64(storage, &dev->stats);
6698         }
6699         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6700         storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
6701         return storage;
6702 }
6703 EXPORT_SYMBOL(dev_get_stats);
6704
6705 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
6706 {
6707         struct netdev_queue *queue = dev_ingress_queue(dev);
6708
6709 #ifdef CONFIG_NET_CLS_ACT
6710         if (queue)
6711                 return queue;
6712         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
6713         if (!queue)
6714                 return NULL;
6715         netdev_init_one_queue(dev, queue, NULL);
6716         queue->qdisc = &noop_qdisc;
6717         queue->qdisc_sleeping = &noop_qdisc;
6718         rcu_assign_pointer(dev->ingress_queue, queue);
6719 #endif
6720         return queue;
6721 }
6722
6723 static const struct ethtool_ops default_ethtool_ops;
6724
6725 void netdev_set_default_ethtool_ops(struct net_device *dev,
6726                                     const struct ethtool_ops *ops)
6727 {
6728         if (dev->ethtool_ops == &default_ethtool_ops)
6729                 dev->ethtool_ops = ops;
6730 }
6731 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6732
6733 void netdev_freemem(struct net_device *dev)
6734 {
6735         char *addr = (char *)dev - dev->padded;
6736
6737         kvfree(addr);
6738 }
6739
6740 /**
6741  *      alloc_netdev_mqs - allocate network device
6742  *      @sizeof_priv:           size of private data to allocate space for
6743  *      @name:                  device name format string
6744  *      @name_assign_type:      origin of device name
6745  *      @setup:                 callback to initialize device
6746  *      @txqs:                  the number of TX subqueues to allocate
6747  *      @rxqs:                  the number of RX subqueues to allocate
6748  *
6749  *      Allocates a struct net_device with private data area for driver use
6750  *      and performs basic initialization.  Also allocates subqueue structs
6751  *      for each queue on the device.
6752  */
6753 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6754                 unsigned char name_assign_type,
6755                 void (*setup)(struct net_device *),
6756                 unsigned int txqs, unsigned int rxqs)
6757 {
6758         struct net_device *dev;
6759         size_t alloc_size;
6760         struct net_device *p;
6761
6762         BUG_ON(strlen(name) >= sizeof(dev->name));
6763
6764         if (txqs < 1) {
6765                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6766                 return NULL;
6767         }
6768
6769 #ifdef CONFIG_SYSFS
6770         if (rxqs < 1) {
6771                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6772                 return NULL;
6773         }
6774 #endif
6775
6776         alloc_size = sizeof(struct net_device);
6777         if (sizeof_priv) {
6778                 /* ensure 32-byte alignment of private area */
6779                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6780                 alloc_size += sizeof_priv;
6781         }
6782         /* ensure 32-byte alignment of whole construct */
6783         alloc_size += NETDEV_ALIGN - 1;
6784
6785         p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6786         if (!p)
6787                 p = vzalloc(alloc_size);
6788         if (!p)
6789                 return NULL;
6790
6791         dev = PTR_ALIGN(p, NETDEV_ALIGN);
6792         dev->padded = (char *)dev - (char *)p;
6793
6794         dev->pcpu_refcnt = alloc_percpu(int);
6795         if (!dev->pcpu_refcnt)
6796                 goto free_dev;
6797
6798         if (dev_addr_init(dev))
6799                 goto free_pcpu;
6800
6801         dev_mc_init(dev);
6802         dev_uc_init(dev);
6803
6804         dev_net_set(dev, &init_net);
6805
6806         dev->gso_max_size = GSO_MAX_SIZE;
6807         dev->gso_max_segs = GSO_MAX_SEGS;
6808         dev->gso_min_segs = 0;
6809
6810         INIT_LIST_HEAD(&dev->napi_list);
6811         INIT_LIST_HEAD(&dev->unreg_list);
6812         INIT_LIST_HEAD(&dev->close_list);
6813         INIT_LIST_HEAD(&dev->link_watch_list);
6814         INIT_LIST_HEAD(&dev->adj_list.upper);
6815         INIT_LIST_HEAD(&dev->adj_list.lower);
6816         INIT_LIST_HEAD(&dev->all_adj_list.upper);
6817         INIT_LIST_HEAD(&dev->all_adj_list.lower);
6818         INIT_LIST_HEAD(&dev->ptype_all);
6819         INIT_LIST_HEAD(&dev->ptype_specific);
6820         dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
6821         setup(dev);
6822
6823         dev->num_tx_queues = txqs;
6824         dev->real_num_tx_queues = txqs;
6825         if (netif_alloc_netdev_queues(dev))
6826                 goto free_all;
6827
6828 #ifdef CONFIG_SYSFS
6829         dev->num_rx_queues = rxqs;
6830         dev->real_num_rx_queues = rxqs;
6831         if (netif_alloc_rx_queues(dev))
6832                 goto free_all;
6833 #endif
6834
6835         strcpy(dev->name, name);
6836         dev->name_assign_type = name_assign_type;
6837         dev->group = INIT_NETDEV_GROUP;
6838         if (!dev->ethtool_ops)
6839                 dev->ethtool_ops = &default_ethtool_ops;
6840         return dev;
6841
6842 free_all:
6843         free_netdev(dev);
6844         return NULL;
6845
6846 free_pcpu:
6847         free_percpu(dev->pcpu_refcnt);
6848 free_dev:
6849         netdev_freemem(dev);
6850         return NULL;
6851 }
6852 EXPORT_SYMBOL(alloc_netdev_mqs);
6853
6854 /**
6855  *      free_netdev - free network device
6856  *      @dev: device
6857  *
6858  *      This function does the last stage of destroying an allocated device
6859  *      interface. The reference to the device object is released.
6860  *      If this is the last reference then it will be freed.
6861  */
6862 void free_netdev(struct net_device *dev)
6863 {
6864         struct napi_struct *p, *n;
6865
6866         release_net(dev_net(dev));
6867
6868         netif_free_tx_queues(dev);
6869 #ifdef CONFIG_SYSFS
6870         kvfree(dev->_rx);
6871 #endif
6872
6873         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6874
6875         /* Flush device addresses */
6876         dev_addr_flush(dev);
6877
6878         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6879                 netif_napi_del(p);
6880
6881         free_percpu(dev->pcpu_refcnt);
6882         dev->pcpu_refcnt = NULL;
6883
6884         /*  Compatibility with error handling in drivers */
6885         if (dev->reg_state == NETREG_UNINITIALIZED) {
6886                 netdev_freemem(dev);
6887                 return;
6888         }
6889
6890         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6891         dev->reg_state = NETREG_RELEASED;
6892
6893         /* will free via device release */
6894         put_device(&dev->dev);
6895 }
6896 EXPORT_SYMBOL(free_netdev);
6897
6898 /**
6899  *      synchronize_net -  Synchronize with packet receive processing
6900  *
6901  *      Wait for packets currently being received to be done.
6902  *      Does not block later packets from starting.
6903  */
6904 void synchronize_net(void)
6905 {
6906         might_sleep();
6907         if (rtnl_is_locked())
6908                 synchronize_rcu_expedited();
6909         else
6910                 synchronize_rcu();
6911 }
6912 EXPORT_SYMBOL(synchronize_net);
6913
6914 /**
6915  *      unregister_netdevice_queue - remove device from the kernel
6916  *      @dev: device
6917  *      @head: list
6918  *
6919  *      This function shuts down a device interface and removes it
6920  *      from the kernel tables.
6921  *      If head not NULL, device is queued to be unregistered later.
6922  *
6923  *      Callers must hold the rtnl semaphore.  You may want
6924  *      unregister_netdev() instead of this.
6925  */
6926
6927 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6928 {
6929         ASSERT_RTNL();
6930
6931         if (head) {
6932                 list_move_tail(&dev->unreg_list, head);
6933         } else {
6934                 rollback_registered(dev);
6935                 /* Finish processing unregister after unlock */
6936                 net_set_todo(dev);
6937         }
6938 }
6939 EXPORT_SYMBOL(unregister_netdevice_queue);
6940
6941 /**
6942  *      unregister_netdevice_many - unregister many devices
6943  *      @head: list of devices
6944  *
6945  *  Note: As most callers use a stack allocated list_head,
6946  *  we force a list_del() to make sure stack wont be corrupted later.
6947  */
6948 void unregister_netdevice_many(struct list_head *head)
6949 {
6950         struct net_device *dev;
6951
6952         if (!list_empty(head)) {
6953                 rollback_registered_many(head);
6954                 list_for_each_entry(dev, head, unreg_list)
6955                         net_set_todo(dev);
6956                 list_del(head);
6957         }
6958 }
6959 EXPORT_SYMBOL(unregister_netdevice_many);
6960
6961 /**
6962  *      unregister_netdev - remove device from the kernel
6963  *      @dev: device
6964  *
6965  *      This function shuts down a device interface and removes it
6966  *      from the kernel tables.
6967  *
6968  *      This is just a wrapper for unregister_netdevice that takes
6969  *      the rtnl semaphore.  In general you want to use this and not
6970  *      unregister_netdevice.
6971  */
6972 void unregister_netdev(struct net_device *dev)
6973 {
6974         rtnl_lock();
6975         unregister_netdevice(dev);
6976         rtnl_unlock();
6977 }
6978 EXPORT_SYMBOL(unregister_netdev);
6979
6980 /**
6981  *      dev_change_net_namespace - move device to different nethost namespace
6982  *      @dev: device
6983  *      @net: network namespace
6984  *      @pat: If not NULL name pattern to try if the current device name
6985  *            is already taken in the destination network namespace.
6986  *
6987  *      This function shuts down a device interface and moves it
6988  *      to a new network namespace. On success 0 is returned, on
6989  *      a failure a netagive errno code is returned.
6990  *
6991  *      Callers must hold the rtnl semaphore.
6992  */
6993
6994 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6995 {
6996         int err;
6997
6998         ASSERT_RTNL();
6999
7000         /* Don't allow namespace local devices to be moved. */
7001         err = -EINVAL;
7002         if (dev->features & NETIF_F_NETNS_LOCAL)
7003                 goto out;
7004
7005         /* Ensure the device has been registrered */
7006         if (dev->reg_state != NETREG_REGISTERED)
7007                 goto out;
7008
7009         /* Get out if there is nothing todo */
7010         err = 0;
7011         if (net_eq(dev_net(dev), net))
7012                 goto out;
7013
7014         /* Pick the destination device name, and ensure
7015          * we can use it in the destination network namespace.
7016          */
7017         err = -EEXIST;
7018         if (__dev_get_by_name(net, dev->name)) {
7019                 /* We get here if we can't use the current device name */
7020                 if (!pat)
7021                         goto out;
7022                 if (dev_get_valid_name(net, dev, pat) < 0)
7023                         goto out;
7024         }
7025
7026         /*
7027          * And now a mini version of register_netdevice unregister_netdevice.
7028          */
7029
7030         /* If device is running close it first. */
7031         dev_close(dev);
7032
7033         /* And unlink it from device chain */
7034         err = -ENODEV;
7035         unlist_netdevice(dev);
7036
7037         synchronize_net();
7038
7039         /* Shutdown queueing discipline. */
7040         dev_shutdown(dev);
7041
7042         /* Notify protocols, that we are about to destroy
7043            this device. They should clean all the things.
7044
7045            Note that dev->reg_state stays at NETREG_REGISTERED.
7046            This is wanted because this way 8021q and macvlan know
7047            the device is just moving and can keep their slaves up.
7048         */
7049         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7050         rcu_barrier();
7051         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7052         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7053
7054         /*
7055          *      Flush the unicast and multicast chains
7056          */
7057         dev_uc_flush(dev);
7058         dev_mc_flush(dev);
7059
7060         /* Send a netdev-removed uevent to the old namespace */
7061         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7062         netdev_adjacent_del_links(dev);
7063
7064         /* Actually switch the network namespace */
7065         dev_net_set(dev, net);
7066
7067         /* If there is an ifindex conflict assign a new one */
7068         if (__dev_get_by_index(net, dev->ifindex)) {
7069                 int iflink = (dev->iflink == dev->ifindex);
7070                 dev->ifindex = dev_new_index(net);
7071                 if (iflink)
7072                         dev->iflink = dev->ifindex;
7073         }
7074
7075         /* Send a netdev-add uevent to the new namespace */
7076         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7077         netdev_adjacent_add_links(dev);
7078
7079         /* Fixup kobjects */
7080         err = device_rename(&dev->dev, dev->name);
7081         WARN_ON(err);
7082
7083         /* Add the device back in the hashes */
7084         list_netdevice(dev);
7085
7086         /* Notify protocols, that a new device appeared. */
7087         call_netdevice_notifiers(NETDEV_REGISTER, dev);
7088
7089         /*
7090          *      Prevent userspace races by waiting until the network
7091          *      device is fully setup before sending notifications.
7092          */
7093         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7094
7095         synchronize_net();
7096         err = 0;
7097 out:
7098         return err;
7099 }
7100 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
7101
7102 static int dev_cpu_callback(struct notifier_block *nfb,
7103                             unsigned long action,
7104                             void *ocpu)
7105 {
7106         struct sk_buff **list_skb;
7107         struct sk_buff *skb;
7108         unsigned int cpu, oldcpu = (unsigned long)ocpu;
7109         struct softnet_data *sd, *oldsd;
7110
7111         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7112                 return NOTIFY_OK;
7113
7114         local_irq_disable();
7115         cpu = smp_processor_id();
7116         sd = &per_cpu(softnet_data, cpu);
7117         oldsd = &per_cpu(softnet_data, oldcpu);
7118
7119         /* Find end of our completion_queue. */
7120         list_skb = &sd->completion_queue;
7121         while (*list_skb)
7122                 list_skb = &(*list_skb)->next;
7123         /* Append completion queue from offline CPU. */
7124         *list_skb = oldsd->completion_queue;
7125         oldsd->completion_queue = NULL;
7126
7127         /* Append output queue from offline CPU. */
7128         if (oldsd->output_queue) {
7129                 *sd->output_queue_tailp = oldsd->output_queue;
7130                 sd->output_queue_tailp = oldsd->output_queue_tailp;
7131                 oldsd->output_queue = NULL;
7132                 oldsd->output_queue_tailp = &oldsd->output_queue;
7133         }
7134         /* Append NAPI poll list from offline CPU, with one exception :
7135          * process_backlog() must be called by cpu owning percpu backlog.
7136          * We properly handle process_queue & input_pkt_queue later.
7137          */
7138         while (!list_empty(&oldsd->poll_list)) {
7139                 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7140                                                             struct napi_struct,
7141                                                             poll_list);
7142
7143                 list_del_init(&napi->poll_list);
7144                 if (napi->poll == process_backlog)
7145                         napi->state = 0;
7146                 else
7147                         ____napi_schedule(sd, napi);
7148         }
7149
7150         raise_softirq_irqoff(NET_TX_SOFTIRQ);
7151         local_irq_enable();
7152
7153         /* Process offline CPU's input_pkt_queue */
7154         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7155                 netif_rx_internal(skb);
7156                 input_queue_head_incr(oldsd);
7157         }
7158         while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
7159                 netif_rx_internal(skb);
7160                 input_queue_head_incr(oldsd);
7161         }
7162
7163         return NOTIFY_OK;
7164 }
7165
7166
7167 /**
7168  *      netdev_increment_features - increment feature set by one
7169  *      @all: current feature set
7170  *      @one: new feature set
7171  *      @mask: mask feature set
7172  *
7173  *      Computes a new feature set after adding a device with feature set
7174  *      @one to the master device with current feature set @all.  Will not
7175  *      enable anything that is off in @mask. Returns the new feature set.
7176  */
7177 netdev_features_t netdev_increment_features(netdev_features_t all,
7178         netdev_features_t one, netdev_features_t mask)
7179 {
7180         if (mask & NETIF_F_GEN_CSUM)
7181                 mask |= NETIF_F_ALL_CSUM;
7182         mask |= NETIF_F_VLAN_CHALLENGED;
7183
7184         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
7185         all &= one | ~NETIF_F_ALL_FOR_ALL;
7186
7187         /* If one device supports hw checksumming, set for all. */
7188         if (all & NETIF_F_GEN_CSUM)
7189                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7190
7191         return all;
7192 }
7193 EXPORT_SYMBOL(netdev_increment_features);
7194
7195 static struct hlist_head * __net_init netdev_create_hash(void)
7196 {
7197         int i;
7198         struct hlist_head *hash;
7199
7200         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7201         if (hash != NULL)
7202                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
7203                         INIT_HLIST_HEAD(&hash[i]);
7204
7205         return hash;
7206 }
7207
7208 /* Initialize per network namespace state */
7209 static int __net_init netdev_init(struct net *net)
7210 {
7211         if (net != &init_net)
7212                 INIT_LIST_HEAD(&net->dev_base_head);
7213
7214         net->dev_name_head = netdev_create_hash();
7215         if (net->dev_name_head == NULL)
7216                 goto err_name;
7217
7218         net->dev_index_head = netdev_create_hash();
7219         if (net->dev_index_head == NULL)
7220                 goto err_idx;
7221
7222         return 0;
7223
7224 err_idx:
7225         kfree(net->dev_name_head);
7226 err_name:
7227         return -ENOMEM;
7228 }
7229
7230 /**
7231  *      netdev_drivername - network driver for the device
7232  *      @dev: network device
7233  *
7234  *      Determine network driver for device.
7235  */
7236 const char *netdev_drivername(const struct net_device *dev)
7237 {
7238         const struct device_driver *driver;
7239         const struct device *parent;
7240         const char *empty = "";
7241
7242         parent = dev->dev.parent;
7243         if (!parent)
7244                 return empty;
7245
7246         driver = parent->driver;
7247         if (driver && driver->name)
7248                 return driver->name;
7249         return empty;
7250 }
7251
7252 static void __netdev_printk(const char *level, const struct net_device *dev,
7253                             struct va_format *vaf)
7254 {
7255         if (dev && dev->dev.parent) {
7256                 dev_printk_emit(level[1] - '0',
7257                                 dev->dev.parent,
7258                                 "%s %s %s%s: %pV",
7259                                 dev_driver_string(dev->dev.parent),
7260                                 dev_name(dev->dev.parent),
7261                                 netdev_name(dev), netdev_reg_state(dev),
7262                                 vaf);
7263         } else if (dev) {
7264                 printk("%s%s%s: %pV",
7265                        level, netdev_name(dev), netdev_reg_state(dev), vaf);
7266         } else {
7267                 printk("%s(NULL net_device): %pV", level, vaf);
7268         }
7269 }
7270
7271 void netdev_printk(const char *level, const struct net_device *dev,
7272                    const char *format, ...)
7273 {
7274         struct va_format vaf;
7275         va_list args;
7276
7277         va_start(args, format);
7278
7279         vaf.fmt = format;
7280         vaf.va = &args;
7281
7282         __netdev_printk(level, dev, &vaf);
7283
7284         va_end(args);
7285 }
7286 EXPORT_SYMBOL(netdev_printk);
7287
7288 #define define_netdev_printk_level(func, level)                 \
7289 void func(const struct net_device *dev, const char *fmt, ...)   \
7290 {                                                               \
7291         struct va_format vaf;                                   \
7292         va_list args;                                           \
7293                                                                 \
7294         va_start(args, fmt);                                    \
7295                                                                 \
7296         vaf.fmt = fmt;                                          \
7297         vaf.va = &args;                                         \
7298                                                                 \
7299         __netdev_printk(level, dev, &vaf);                      \
7300                                                                 \
7301         va_end(args);                                           \
7302 }                                                               \
7303 EXPORT_SYMBOL(func);
7304
7305 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7306 define_netdev_printk_level(netdev_alert, KERN_ALERT);
7307 define_netdev_printk_level(netdev_crit, KERN_CRIT);
7308 define_netdev_printk_level(netdev_err, KERN_ERR);
7309 define_netdev_printk_level(netdev_warn, KERN_WARNING);
7310 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7311 define_netdev_printk_level(netdev_info, KERN_INFO);
7312
7313 static void __net_exit netdev_exit(struct net *net)
7314 {
7315         kfree(net->dev_name_head);
7316         kfree(net->dev_index_head);
7317 }
7318
7319 static struct pernet_operations __net_initdata netdev_net_ops = {
7320         .init = netdev_init,
7321         .exit = netdev_exit,
7322 };
7323
7324 static void __net_exit default_device_exit(struct net *net)
7325 {
7326         struct net_device *dev, *aux;
7327         /*
7328          * Push all migratable network devices back to the
7329          * initial network namespace
7330          */
7331         rtnl_lock();
7332         for_each_netdev_safe(net, dev, aux) {
7333                 int err;
7334                 char fb_name[IFNAMSIZ];
7335
7336                 /* Ignore unmoveable devices (i.e. loopback) */
7337                 if (dev->features & NETIF_F_NETNS_LOCAL)
7338                         continue;
7339
7340                 /* Leave virtual devices for the generic cleanup */
7341                 if (dev->rtnl_link_ops)
7342                         continue;
7343
7344                 /* Push remaining network devices to init_net */
7345                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7346                 err = dev_change_net_namespace(dev, &init_net, fb_name);
7347                 if (err) {
7348                         pr_emerg("%s: failed to move %s to init_net: %d\n",
7349                                  __func__, dev->name, err);
7350                         BUG();
7351                 }
7352         }
7353         rtnl_unlock();
7354 }
7355
7356 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7357 {
7358         /* Return with the rtnl_lock held when there are no network
7359          * devices unregistering in any network namespace in net_list.
7360          */
7361         struct net *net;
7362         bool unregistering;
7363         DEFINE_WAIT_FUNC(wait, woken_wake_function);
7364
7365         add_wait_queue(&netdev_unregistering_wq, &wait);
7366         for (;;) {
7367                 unregistering = false;
7368                 rtnl_lock();
7369                 list_for_each_entry(net, net_list, exit_list) {
7370                         if (net->dev_unreg_count > 0) {
7371                                 unregistering = true;
7372                                 break;
7373                         }
7374                 }
7375                 if (!unregistering)
7376                         break;
7377                 __rtnl_unlock();
7378
7379                 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
7380         }
7381         remove_wait_queue(&netdev_unregistering_wq, &wait);
7382 }
7383
7384 static void __net_exit default_device_exit_batch(struct list_head *net_list)
7385 {
7386         /* At exit all network devices most be removed from a network
7387          * namespace.  Do this in the reverse order of registration.
7388          * Do this across as many network namespaces as possible to
7389          * improve batching efficiency.
7390          */
7391         struct net_device *dev;
7392         struct net *net;
7393         LIST_HEAD(dev_kill_list);
7394
7395         /* To prevent network device cleanup code from dereferencing
7396          * loopback devices or network devices that have been freed
7397          * wait here for all pending unregistrations to complete,
7398          * before unregistring the loopback device and allowing the
7399          * network namespace be freed.
7400          *
7401          * The netdev todo list containing all network devices
7402          * unregistrations that happen in default_device_exit_batch
7403          * will run in the rtnl_unlock() at the end of
7404          * default_device_exit_batch.
7405          */
7406         rtnl_lock_unregistering(net_list);
7407         list_for_each_entry(net, net_list, exit_list) {
7408                 for_each_netdev_reverse(net, dev) {
7409                         if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
7410                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7411                         else
7412                                 unregister_netdevice_queue(dev, &dev_kill_list);
7413                 }
7414         }
7415         unregister_netdevice_many(&dev_kill_list);
7416         rtnl_unlock();
7417 }
7418
7419 static struct pernet_operations __net_initdata default_device_ops = {
7420         .exit = default_device_exit,
7421         .exit_batch = default_device_exit_batch,
7422 };
7423
7424 /*
7425  *      Initialize the DEV module. At boot time this walks the device list and
7426  *      unhooks any devices that fail to initialise (normally hardware not
7427  *      present) and leaves us with a valid list of present and active devices.
7428  *
7429  */
7430
7431 /*
7432  *       This is called single threaded during boot, so no need
7433  *       to take the rtnl semaphore.
7434  */
7435 static int __init net_dev_init(void)
7436 {
7437         int i, rc = -ENOMEM;
7438
7439         BUG_ON(!dev_boot_phase);
7440
7441         if (dev_proc_init())
7442                 goto out;
7443
7444         if (netdev_kobject_init())
7445                 goto out;
7446
7447         INIT_LIST_HEAD(&ptype_all);
7448         for (i = 0; i < PTYPE_HASH_SIZE; i++)
7449                 INIT_LIST_HEAD(&ptype_base[i]);
7450
7451         INIT_LIST_HEAD(&offload_base);
7452
7453         if (register_pernet_subsys(&netdev_net_ops))
7454                 goto out;
7455
7456         /*
7457          *      Initialise the packet receive queues.
7458          */
7459
7460         for_each_possible_cpu(i) {
7461                 struct softnet_data *sd = &per_cpu(softnet_data, i);
7462
7463                 skb_queue_head_init(&sd->input_pkt_queue);
7464                 skb_queue_head_init(&sd->process_queue);
7465                 INIT_LIST_HEAD(&sd->poll_list);
7466                 sd->output_queue_tailp = &sd->output_queue;
7467 #ifdef CONFIG_RPS
7468                 sd->csd.func = rps_trigger_softirq;
7469                 sd->csd.info = sd;
7470                 sd->cpu = i;
7471 #endif
7472
7473                 sd->backlog.poll = process_backlog;
7474                 sd->backlog.weight = weight_p;
7475         }
7476
7477         dev_boot_phase = 0;
7478
7479         /* The loopback device is special if any other network devices
7480          * is present in a network namespace the loopback device must
7481          * be present. Since we now dynamically allocate and free the
7482          * loopback device ensure this invariant is maintained by
7483          * keeping the loopback device as the first device on the
7484          * list of network devices.  Ensuring the loopback devices
7485          * is the first device that appears and the last network device
7486          * that disappears.
7487          */
7488         if (register_pernet_device(&loopback_net_ops))
7489                 goto out;
7490
7491         if (register_pernet_device(&default_device_ops))
7492                 goto out;
7493
7494         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7495         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7496
7497         hotcpu_notifier(dev_cpu_callback, 0);
7498         dst_init();
7499         rc = 0;
7500 out:
7501         return rc;
7502 }
7503
7504 subsys_initcall(net_dev_init);