net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <linux/bitops.h>
  77 #include <linux/capability.h>
  78 #include <linux/cpu.h>
  79 #include <linux/types.h>
  80 #include <linux/kernel.h>
  81 #include <linux/hash.h>
  82 #include <linux/slab.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <net/net_namespace.h>
  98 #include <net/sock.h>
  99 #include <linux/rtnetlink.h>
 100 #include <linux/stat.h>
 101 #include <net/dst.h>
 102 #include <net/pkt_sched.h>
 103 #include <net/checksum.h>
 104 #include <net/xfrm.h>
 105 #include <linux/highmem.h>
 106 #include <linux/init.h>
 107 #include <linux/module.h>
 108 #include <linux/netpoll.h>
 109 #include <linux/rcupdate.h>
 110 #include <linux/delay.h>
 111 #include <net/iw_handler.h>
 112 #include <asm/current.h>
 113 #include <linux/audit.h>
 114 #include <linux/dmaengine.h>
 115 #include <linux/err.h>
 116 #include <linux/ctype.h>
 117 #include <linux/if_arp.h>
 118 #include <linux/if_vlan.h>
 119 #include <linux/ip.h>
 120 #include <net/ip.h>
 121 #include <net/mpls.h>
 122 #include <linux/ipv6.h>
 123 #include <linux/in.h>
 124 #include <linux/jhash.h>
 125 #include <linux/random.h>
 126 #include <trace/events/napi.h>
 127 #include <trace/events/net.h>
 128 #include <trace/events/skb.h>
 129 #include <linux/pci.h>
 130 #include <linux/inetdevice.h>
 131 #include <linux/cpu_rmap.h>
 132 #include <linux/static_key.h>
 133 #include <linux/hashtable.h>
 134 #include <linux/vmalloc.h>
 135 #include <linux/if_macvlan.h>
 136 #include <linux/errqueue.h>
 137 #include <linux/hrtimer.h>
 138 #include <linux/netfilter_ingress.h>
 139
 140 #include "net-sysfs.h"
 141
 142 /* Instead of increasing this, you should create a hash table. */
 143 #define MAX_GRO_SKBS 8
 144
 145 /* This should be increased if a protocol with a bigger head is added. */
 146 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 147
 148 static DEFINE_SPINLOCK(ptype_lock);
 149 static DEFINE_SPINLOCK(offload_lock);
 150 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 151 struct list_head ptype_all __read_mostly;       /* Taps */
 152 static struct list_head offload_base __read_mostly;
 153
 154 static int netif_rx_internal(struct sk_buff *skb);
 155 static int call_netdevice_notifiers_info(unsigned long val,
 156                                          struct net_device *dev,
 157                                          struct netdev_notifier_info *info);
 158
 159 /*
 160  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 161  * semaphore.
 162  *
 163  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 164  *
 165  * Writers must hold the rtnl semaphore while they loop through the
 166  * dev_base_head list, and hold dev_base_lock for writing when they do the
 167  * actual updates.  This allows pure readers to access the list even
 168  * while a writer is preparing to update it.
 169  *
 170  * To put it another way, dev_base_lock is held for writing only to
 171  * protect against pure readers; the rtnl semaphore provides the
 172  * protection against other writers.
 173  *
 174  * See, for example usages, register_netdevice() and
 175  * unregister_netdevice(), which must be called with the rtnl
 176  * semaphore held.
 177  */
 178 DEFINE_RWLOCK(dev_base_lock);
 179 EXPORT_SYMBOL(dev_base_lock);
 180
 181 /* protects napi_hash addition/deletion and napi_gen_id */
 182 static DEFINE_SPINLOCK(napi_hash_lock);
 183
 184 static unsigned int napi_gen_id;
 185 static DEFINE_HASHTABLE(napi_hash, 8);
 186
 187 static seqcount_t devnet_rename_seq;
 188
 189 static inline void dev_base_seq_inc(struct net *net)
 190 {
 191         while (++net->dev_base_seq == 0);
 192 }
 193
 194 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 195 {
 196         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 197
 198         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 199 }
 200
 201 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 202 {
 203         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 204 }
 205
 206 static inline void rps_lock(struct softnet_data *sd)
 207 {
 208 #ifdef CONFIG_RPS
 209         spin_lock(&sd->input_pkt_queue.lock);
 210 #endif
 211 }
 212
 213 static inline void rps_unlock(struct softnet_data *sd)
 214 {
 215 #ifdef CONFIG_RPS
 216         spin_unlock(&sd->input_pkt_queue.lock);
 217 #endif
 218 }
 219
 220 /* Device list insertion */
 221 static void list_netdevice(struct net_device *dev)
 222 {
 223         struct net *net = dev_net(dev);
 224
 225         ASSERT_RTNL();
 226
 227         write_lock_bh(&dev_base_lock);
 228         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 229         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 230         hlist_add_head_rcu(&dev->index_hlist,
 231                            dev_index_hash(net, dev->ifindex));
 232         write_unlock_bh(&dev_base_lock);
 233
 234         dev_base_seq_inc(net);
 235 }
 236
 237 /* Device list removal
 238  * caller must respect a RCU grace period before freeing/reusing dev
 239  */
 240 static void unlist_netdevice(struct net_device *dev)
 241 {
 242         ASSERT_RTNL();
 243
 244         /* Unlink dev from the device chain */
 245         write_lock_bh(&dev_base_lock);
 246         list_del_rcu(&dev->dev_list);
 247         hlist_del_rcu(&dev->name_hlist);
 248         hlist_del_rcu(&dev->index_hlist);
 249         write_unlock_bh(&dev_base_lock);
 250
 251         dev_base_seq_inc(dev_net(dev));
 252 }
 253
 254 /*
 255  *      Our notifier list
 256  */
 257
 258 static RAW_NOTIFIER_HEAD(netdev_chain);
 259
 260 /*
 261  *      Device drivers call our routines to queue packets here. We empty the
 262  *      queue in the local softnet handler.
 263  */
 264
 265 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 266 EXPORT_PER_CPU_SYMBOL(softnet_data);
 267
 268 #ifdef CONFIG_LOCKDEP
 269 /*
 270  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 271  * according to dev->type
 272  */
 273 static const unsigned short netdev_lock_type[] =
 274         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 275          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 276          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 277          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 278          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 279          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 280          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 281          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 282          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 283          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 284          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 285          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 286          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 287          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 288          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 289
 290 static const char *const netdev_lock_name[] =
 291         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 292          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 293          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 294          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 295          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 296          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 297          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 298          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 299          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 300          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 301          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 302          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 303          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 304          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 305          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 306
 307 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 308 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 309
 310 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 311 {
 312         int i;
 313
 314         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 315                 if (netdev_lock_type[i] == dev_type)
 316                         return i;
 317         /* the last key is used by default */
 318         return ARRAY_SIZE(netdev_lock_type) - 1;
 319 }
 320
 321 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 322                                                  unsigned short dev_type)
 323 {
 324         int i;
 325
 326         i = netdev_lock_pos(dev_type);
 327         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 328                                    netdev_lock_name[i]);
 329 }
 330
 331 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 332 {
 333         int i;
 334
 335         i = netdev_lock_pos(dev->type);
 336         lockdep_set_class_and_name(&dev->addr_list_lock,
 337                                    &netdev_addr_lock_key[i],
 338                                    netdev_lock_name[i]);
 339 }
 340 #else
 341 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 342                                                  unsigned short dev_type)
 343 {
 344 }
 345 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 346 {
 347 }
 348 #endif
 349
 350 /*******************************************************************************
 351
 352                 Protocol management and registration routines
 353
 354 *******************************************************************************/
 355
 356 /*
 357  *      Add a protocol ID to the list. Now that the input handler is
 358  *      smarter we can dispense with all the messy stuff that used to be
 359  *      here.
 360  *
 361  *      BEWARE!!! Protocol handlers, mangling input packets,
 362  *      MUST BE last in hash buckets and checking protocol handlers
 363  *      MUST start from promiscuous ptype_all chain in net_bh.
 364  *      It is true now, do not change it.
 365  *      Explanation follows: if protocol handler, mangling packet, will
 366  *      be the first on list, it is not able to sense, that packet
 367  *      is cloned and should be copied-on-write, so that it will
 368  *      change it and subsequent readers will get broken packet.
 369  *                                                      --ANK (980803)
 370  */
 371
 372 static inline struct list_head *ptype_head(const struct packet_type *pt)
 373 {
 374         if (pt->type == htons(ETH_P_ALL))
 375                 return pt->dev ? &pt->dev->ptype_all : &ptype_all;
 376         else
 377                 return pt->dev ? &pt->dev->ptype_specific :
 378                                  &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 379 }
 380
 381 /**
 382  *      dev_add_pack - add packet handler
 383  *      @pt: packet type declaration
 384  *
 385  *      Add a protocol handler to the networking stack. The passed &packet_type
 386  *      is linked into kernel lists and may not be freed until it has been
 387  *      removed from the kernel lists.
 388  *
 389  *      This call does not sleep therefore it can not
 390  *      guarantee all CPU's that are in middle of receiving packets
 391  *      will see the new packet type (until the next received packet).
 392  */
 393
 394 void dev_add_pack(struct packet_type *pt)
 395 {
 396         struct list_head *head = ptype_head(pt);
 397
 398         spin_lock(&ptype_lock);
 399         list_add_rcu(&pt->list, head);
 400         spin_unlock(&ptype_lock);
 401 }
 402 EXPORT_SYMBOL(dev_add_pack);
 403
 404 /**
 405  *      __dev_remove_pack        - remove packet handler
 406  *      @pt: packet type declaration
 407  *
 408  *      Remove a protocol handler that was previously added to the kernel
 409  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 410  *      from the kernel lists and can be freed or reused once this function
 411  *      returns.
 412  *
 413  *      The packet type might still be in use by receivers
 414  *      and must not be freed until after all the CPU's have gone
 415  *      through a quiescent state.
 416  */
 417 void __dev_remove_pack(struct packet_type *pt)
 418 {
 419         struct list_head *head = ptype_head(pt);
 420         struct packet_type *pt1;
 421
 422         spin_lock(&ptype_lock);
 423
 424         list_for_each_entry(pt1, head, list) {
 425                 if (pt == pt1) {
 426                         list_del_rcu(&pt->list);
 427                         goto out;
 428                 }
 429         }
 430
 431         pr_warn("dev_remove_pack: %p not found\n", pt);
 432 out:
 433         spin_unlock(&ptype_lock);
 434 }
 435 EXPORT_SYMBOL(__dev_remove_pack);
 436
 437 /**
 438  *      dev_remove_pack  - remove packet handler
 439  *      @pt: packet type declaration
 440  *
 441  *      Remove a protocol handler that was previously added to the kernel
 442  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 443  *      from the kernel lists and can be freed or reused once this function
 444  *      returns.
 445  *
 446  *      This call sleeps to guarantee that no CPU is looking at the packet
 447  *      type after return.
 448  */
 449 void dev_remove_pack(struct packet_type *pt)
 450 {
 451         __dev_remove_pack(pt);
 452
 453         synchronize_net();
 454 }
 455 EXPORT_SYMBOL(dev_remove_pack);
 456
 457
 458 /**
 459  *      dev_add_offload - register offload handlers
 460  *      @po: protocol offload declaration
 461  *
 462  *      Add protocol offload handlers to the networking stack. The passed
 463  *      &proto_offload is linked into kernel lists and may not be freed until
 464  *      it has been removed from the kernel lists.
 465  *
 466  *      This call does not sleep therefore it can not
 467  *      guarantee all CPU's that are in middle of receiving packets
 468  *      will see the new offload handlers (until the next received packet).
 469  */
 470 void dev_add_offload(struct packet_offload *po)
 471 {
 472         struct packet_offload *elem;
 473
 474         spin_lock(&offload_lock);
 475         list_for_each_entry(elem, &offload_base, list) {
 476                 if (po->priority < elem->priority)
 477                         break;
 478         }
 479         list_add_rcu(&po->list, elem->list.prev);
 480         spin_unlock(&offload_lock);
 481 }
 482 EXPORT_SYMBOL(dev_add_offload);
 483
 484 /**
 485  *      __dev_remove_offload     - remove offload handler
 486  *      @po: packet offload declaration
 487  *
 488  *      Remove a protocol offload handler that was previously added to the
 489  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 490  *      is removed from the kernel lists and can be freed or reused once this
 491  *      function returns.
 492  *
 493  *      The packet type might still be in use by receivers
 494  *      and must not be freed until after all the CPU's have gone
 495  *      through a quiescent state.
 496  */
 497 static void __dev_remove_offload(struct packet_offload *po)
 498 {
 499         struct list_head *head = &offload_base;
 500         struct packet_offload *po1;
 501
 502         spin_lock(&offload_lock);
 503
 504         list_for_each_entry(po1, head, list) {
 505                 if (po == po1) {
 506                         list_del_rcu(&po->list);
 507                         goto out;
 508                 }
 509         }
 510
 511         pr_warn("dev_remove_offload: %p not found\n", po);
 512 out:
 513         spin_unlock(&offload_lock);
 514 }
 515
 516 /**
 517  *      dev_remove_offload       - remove packet offload handler
 518  *      @po: packet offload declaration
 519  *
 520  *      Remove a packet offload handler that was previously added to the kernel
 521  *      offload handlers by dev_add_offload(). The passed &offload_type is
 522  *      removed from the kernel lists and can be freed or reused once this
 523  *      function returns.
 524  *
 525  *      This call sleeps to guarantee that no CPU is looking at the packet
 526  *      type after return.
 527  */
 528 void dev_remove_offload(struct packet_offload *po)
 529 {
 530         __dev_remove_offload(po);
 531
 532         synchronize_net();
 533 }
 534 EXPORT_SYMBOL(dev_remove_offload);
 535
 536 /******************************************************************************
 537
 538                       Device Boot-time Settings Routines
 539
 540 *******************************************************************************/
 541
 542 /* Boot time configuration table */
 543 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 544
 545 /**
 546  *      netdev_boot_setup_add   - add new setup entry
 547  *      @name: name of the device
 548  *      @map: configured settings for the device
 549  *
 550  *      Adds new setup entry to the dev_boot_setup list.  The function
 551  *      returns 0 on error and 1 on success.  This is a generic routine to
 552  *      all netdevices.
 553  */
 554 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 555 {
 556         struct netdev_boot_setup *s;
 557         int i;
 558
 559         s = dev_boot_setup;
 560         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 561                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 562                         memset(s[i].name, 0, sizeof(s[i].name));
 563                         strlcpy(s[i].name, name, IFNAMSIZ);
 564                         memcpy(&s[i].map, map, sizeof(s[i].map));
 565                         break;
 566                 }
 567         }
 568
 569         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 570 }
 571
 572 /**
 573  *      netdev_boot_setup_check - check boot time settings
 574  *      @dev: the netdevice
 575  *
 576  *      Check boot time settings for the device.
 577  *      The found settings are set for the device to be used
 578  *      later in the device probing.
 579  *      Returns 0 if no settings found, 1 if they are.
 580  */
 581 int netdev_boot_setup_check(struct net_device *dev)
 582 {
 583         struct netdev_boot_setup *s = dev_boot_setup;
 584         int i;
 585
 586         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 587                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 588                     !strcmp(dev->name, s[i].name)) {
 589                         dev->irq        = s[i].map.irq;
 590                         dev->base_addr  = s[i].map.base_addr;
 591                         dev->mem_start  = s[i].map.mem_start;
 592                         dev->mem_end    = s[i].map.mem_end;
 593                         return 1;
 594                 }
 595         }
 596         return 0;
 597 }
 598 EXPORT_SYMBOL(netdev_boot_setup_check);
 599
 600
 601 /**
 602  *      netdev_boot_base        - get address from boot time settings
 603  *      @prefix: prefix for network device
 604  *      @unit: id for network device
 605  *
 606  *      Check boot time settings for the base address of device.
 607  *      The found settings are set for the device to be used
 608  *      later in the device probing.
 609  *      Returns 0 if no settings found.
 610  */
 611 unsigned long netdev_boot_base(const char *prefix, int unit)
 612 {
 613         const struct netdev_boot_setup *s = dev_boot_setup;
 614         char name[IFNAMSIZ];
 615         int i;
 616
 617         sprintf(name, "%s%d", prefix, unit);
 618
 619         /*
 620          * If device already registered then return base of 1
 621          * to indicate not to probe for this interface
 622          */
 623         if (__dev_get_by_name(&init_net, name))
 624                 return 1;
 625
 626         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 627                 if (!strcmp(name, s[i].name))
 628                         return s[i].map.base_addr;
 629         return 0;
 630 }
 631
 632 /*
 633  * Saves at boot time configured settings for any netdevice.
 634  */
 635 int __init netdev_boot_setup(char *str)
 636 {
 637         int ints[5];
 638         struct ifmap map;
 639
 640         str = get_options(str, ARRAY_SIZE(ints), ints);
 641         if (!str || !*str)
 642                 return 0;
 643
 644         /* Save settings */
 645         memset(&map, 0, sizeof(map));
 646         if (ints[0] > 0)
 647                 map.irq = ints[1];
 648         if (ints[0] > 1)
 649                 map.base_addr = ints[2];
 650         if (ints[0] > 2)
 651                 map.mem_start = ints[3];
 652         if (ints[0] > 3)
 653                 map.mem_end = ints[4];
 654
 655         /* Add new entry to the list */
 656         return netdev_boot_setup_add(str, &map);
 657 }
 658
 659 __setup("netdev=", netdev_boot_setup);
 660
 661 /*******************************************************************************
 662
 663                             Device Interface Subroutines
 664
 665 *******************************************************************************/
 666
 667 /**
 668  *      dev_get_iflink  - get 'iflink' value of a interface
 669  *      @dev: targeted interface
 670  *
 671  *      Indicates the ifindex the interface is linked to.
 672  *      Physical interfaces have the same 'ifindex' and 'iflink' values.
 673  */
 674
 675 int dev_get_iflink(const struct net_device *dev)
 676 {
 677         if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
 678                 return dev->netdev_ops->ndo_get_iflink(dev);
 679
 680         return dev->ifindex;
 681 }
 682 EXPORT_SYMBOL(dev_get_iflink);
 683
 684 /**
 685  *      __dev_get_by_name       - find a device by its name
 686  *      @net: the applicable net namespace
 687  *      @name: name to find
 688  *
 689  *      Find an interface by name. Must be called under RTNL semaphore
 690  *      or @dev_base_lock. If the name is found a pointer to the device
 691  *      is returned. If the name is not found then %NULL is returned. The
 692  *      reference counters are not incremented so the caller must be
 693  *      careful with locks.
 694  */
 695
 696 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 697 {
 698         struct net_device *dev;
 699         struct hlist_head *head = dev_name_hash(net, name);
 700
 701         hlist_for_each_entry(dev, head, name_hlist)
 702                 if (!strncmp(dev->name, name, IFNAMSIZ))
 703                         return dev;
 704
 705         return NULL;
 706 }
 707 EXPORT_SYMBOL(__dev_get_by_name);
 708
 709 /**
 710  *      dev_get_by_name_rcu     - find a device by its name
 711  *      @net: the applicable net namespace
 712  *      @name: name to find
 713  *
 714  *      Find an interface by name.
 715  *      If the name is found a pointer to the device is returned.
 716  *      If the name is not found then %NULL is returned.
 717  *      The reference counters are not incremented so the caller must be
 718  *      careful with locks. The caller must hold RCU lock.
 719  */
 720
 721 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 722 {
 723         struct net_device *dev;
 724         struct hlist_head *head = dev_name_hash(net, name);
 725
 726         hlist_for_each_entry_rcu(dev, head, name_hlist)
 727                 if (!strncmp(dev->name, name, IFNAMSIZ))
 728                         return dev;
 729
 730         return NULL;
 731 }
 732 EXPORT_SYMBOL(dev_get_by_name_rcu);
 733
 734 /**
 735  *      dev_get_by_name         - find a device by its name
 736  *      @net: the applicable net namespace
 737  *      @name: name to find
 738  *
 739  *      Find an interface by name. This can be called from any
 740  *      context and does its own locking. The returned handle has
 741  *      the usage count incremented and the caller must use dev_put() to
 742  *      release it when it is no longer needed. %NULL is returned if no
 743  *      matching device is found.
 744  */
 745
 746 struct net_device *dev_get_by_name(struct net *net, const char *name)
 747 {
 748         struct net_device *dev;
 749
 750         rcu_read_lock();
 751         dev = dev_get_by_name_rcu(net, name);
 752         if (dev)
 753                 dev_hold(dev);
 754         rcu_read_unlock();
 755         return dev;
 756 }
 757 EXPORT_SYMBOL(dev_get_by_name);
 758
 759 /**
 760  *      __dev_get_by_index - find a device by its ifindex
 761  *      @net: the applicable net namespace
 762  *      @ifindex: index of device
 763  *
 764  *      Search for an interface by index. Returns %NULL if the device
 765  *      is not found or a pointer to the device. The device has not
 766  *      had its reference counter increased so the caller must be careful
 767  *      about locking. The caller must hold either the RTNL semaphore
 768  *      or @dev_base_lock.
 769  */
 770
 771 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 772 {
 773         struct net_device *dev;
 774         struct hlist_head *head = dev_index_hash(net, ifindex);
 775
 776         hlist_for_each_entry(dev, head, index_hlist)
 777                 if (dev->ifindex == ifindex)
 778                         return dev;
 779
 780         return NULL;
 781 }
 782 EXPORT_SYMBOL(__dev_get_by_index);
 783
 784 /**
 785  *      dev_get_by_index_rcu - find a device by its ifindex
 786  *      @net: the applicable net namespace
 787  *      @ifindex: index of device
 788  *
 789  *      Search for an interface by index. Returns %NULL if the device
 790  *      is not found or a pointer to the device. The device has not
 791  *      had its reference counter increased so the caller must be careful
 792  *      about locking. The caller must hold RCU lock.
 793  */
 794
 795 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 796 {
 797         struct net_device *dev;
 798         struct hlist_head *head = dev_index_hash(net, ifindex);
 799
 800         hlist_for_each_entry_rcu(dev, head, index_hlist)
 801                 if (dev->ifindex == ifindex)
 802                         return dev;
 803
 804         return NULL;
 805 }
 806 EXPORT_SYMBOL(dev_get_by_index_rcu);
 807
 808
 809 /**
 810  *      dev_get_by_index - find a device by its ifindex
 811  *      @net: the applicable net namespace
 812  *      @ifindex: index of device
 813  *
 814  *      Search for an interface by index. Returns NULL if the device
 815  *      is not found or a pointer to the device. The device returned has
 816  *      had a reference added and the pointer is safe until the user calls
 817  *      dev_put to indicate they have finished with it.
 818  */
 819
 820 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 821 {
 822         struct net_device *dev;
 823
 824         rcu_read_lock();
 825         dev = dev_get_by_index_rcu(net, ifindex);
 826         if (dev)
 827                 dev_hold(dev);
 828         rcu_read_unlock();
 829         return dev;
 830 }
 831 EXPORT_SYMBOL(dev_get_by_index);
 832
 833 /**
 834  *      netdev_get_name - get a netdevice name, knowing its ifindex.
 835  *      @net: network namespace
 836  *      @name: a pointer to the buffer where the name will be stored.
 837  *      @ifindex: the ifindex of the interface to get the name from.
 838  *
 839  *      The use of raw_seqcount_begin() and cond_resched() before
 840  *      retrying is required as we want to give the writers a chance
 841  *      to complete when CONFIG_PREEMPT is not set.
 842  */
 843 int netdev_get_name(struct net *net, char *name, int ifindex)
 844 {
 845         struct net_device *dev;
 846         unsigned int seq;
 847
 848 retry:
 849         seq = raw_seqcount_begin(&devnet_rename_seq);
 850         rcu_read_lock();
 851         dev = dev_get_by_index_rcu(net, ifindex);
 852         if (!dev) {
 853                 rcu_read_unlock();
 854                 return -ENODEV;
 855         }
 856
 857         strcpy(name, dev->name);
 858         rcu_read_unlock();
 859         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 860                 cond_resched();
 861                 goto retry;
 862         }
 863
 864         return 0;
 865 }
 866
 867 /**
 868  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 869  *      @net: the applicable net namespace
 870  *      @type: media type of device
 871  *      @ha: hardware address
 872  *
 873  *      Search for an interface by MAC address. Returns NULL if the device
 874  *      is not found or a pointer to the device.
 875  *      The caller must hold RCU or RTNL.
 876  *      The returned device has not had its ref count increased
 877  *      and the caller must therefore be careful about locking
 878  *
 879  */
 880
 881 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 882                                        const char *ha)
 883 {
 884         struct net_device *dev;
 885
 886         for_each_netdev_rcu(net, dev)
 887                 if (dev->type == type &&
 888                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 889                         return dev;
 890
 891         return NULL;
 892 }
 893 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 894
 895 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 896 {
 897         struct net_device *dev;
 898
 899         ASSERT_RTNL();
 900         for_each_netdev(net, dev)
 901                 if (dev->type == type)
 902                         return dev;
 903
 904         return NULL;
 905 }
 906 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 907
 908 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 909 {
 910         struct net_device *dev, *ret = NULL;
 911
 912         rcu_read_lock();
 913         for_each_netdev_rcu(net, dev)
 914                 if (dev->type == type) {
 915                         dev_hold(dev);
 916                         ret = dev;
 917                         break;
 918                 }
 919         rcu_read_unlock();
 920         return ret;
 921 }
 922 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 923
 924 /**
 925  *      __dev_get_by_flags - find any device with given flags
 926  *      @net: the applicable net namespace
 927  *      @if_flags: IFF_* values
 928  *      @mask: bitmask of bits in if_flags to check
 929  *
 930  *      Search for any interface with the given flags. Returns NULL if a device
 931  *      is not found or a pointer to the device. Must be called inside
 932  *      rtnl_lock(), and result refcount is unchanged.
 933  */
 934
 935 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 936                                       unsigned short mask)
 937 {
 938         struct net_device *dev, *ret;
 939
 940         ASSERT_RTNL();
 941
 942         ret = NULL;
 943         for_each_netdev(net, dev) {
 944                 if (((dev->flags ^ if_flags) & mask) == 0) {
 945                         ret = dev;
 946                         break;
 947                 }
 948         }
 949         return ret;
 950 }
 951 EXPORT_SYMBOL(__dev_get_by_flags);
 952
 953 /**
 954  *      dev_valid_name - check if name is okay for network device
 955  *      @name: name string
 956  *
 957  *      Network device names need to be valid file names to
 958  *      to allow sysfs to work.  We also disallow any kind of
 959  *      whitespace.
 960  */
 961 bool dev_valid_name(const char *name)
 962 {
 963         if (*name == '\0')
 964                 return false;
 965         if (strlen(name) >= IFNAMSIZ)
 966                 return false;
 967         if (!strcmp(name, ".") || !strcmp(name, ".."))
 968                 return false;
 969
 970         while (*name) {
 971                 if (*name == '/' || *name == ':' || isspace(*name))
 972                         return false;
 973                 name++;
 974         }
 975         return true;
 976 }
 977 EXPORT_SYMBOL(dev_valid_name);
 978
 979 /**
 980  *      __dev_alloc_name - allocate a name for a device
 981  *      @net: network namespace to allocate the device name in
 982  *      @name: name format string
 983  *      @buf:  scratch buffer and result name string
 984  *
 985  *      Passed a format string - eg "lt%d" it will try and find a suitable
 986  *      id. It scans list of devices to build up a free map, then chooses
 987  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 988  *      while allocating the name and adding the device in order to avoid
 989  *      duplicates.
 990  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 991  *      Returns the number of the unit assigned or a negative errno code.
 992  */
 993
 994 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 995 {
 996         int i = 0;
 997         const char *p;
 998         const int max_netdevices = 8*PAGE_SIZE;
 999         unsigned long *inuse;
1000         struct net_device *d;
1001
1002         p = strnchr(name, IFNAMSIZ-1, '%');
1003         if (p) {
1004                 /*
1005                  * Verify the string as this thing may have come from
1006                  * the user.  There must be either one "%d" and no other "%"
1007                  * characters.
1008                  */
1009                 if (p[1] != 'd' || strchr(p + 2, '%'))
1010                         return -EINVAL;
1011
1012                 /* Use one page as a bit array of possible slots */
1013                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1014                 if (!inuse)
1015                         return -ENOMEM;
1016
1017                 for_each_netdev(net, d) {
1018                         if (!sscanf(d->name, name, &i))
1019                                 continue;
1020                         if (i < 0 || i >= max_netdevices)
1021                                 continue;
1022
1023                         /*  avoid cases where sscanf is not exact inverse of printf */
1024                         snprintf(buf, IFNAMSIZ, name, i);
1025                         if (!strncmp(buf, d->name, IFNAMSIZ))
1026                                 set_bit(i, inuse);
1027                 }
1028
1029                 i = find_first_zero_bit(inuse, max_netdevices);
1030                 free_page((unsigned long) inuse);
1031         }
1032
1033         if (buf != name)
1034                 snprintf(buf, IFNAMSIZ, name, i);
1035         if (!__dev_get_by_name(net, buf))
1036                 return i;
1037
1038         /* It is possible to run out of possible slots
1039          * when the name is long and there isn't enough space left
1040          * for the digits, or if all bits are used.
1041          */
1042         return -ENFILE;
1043 }
1044
1045 /**
1046  *      dev_alloc_name - allocate a name for a device
1047  *      @dev: device
1048  *      @name: name format string
1049  *
1050  *      Passed a format string - eg "lt%d" it will try and find a suitable
1051  *      id. It scans list of devices to build up a free map, then chooses
1052  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1053  *      while allocating the name and adding the device in order to avoid
1054  *      duplicates.
1055  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1056  *      Returns the number of the unit assigned or a negative errno code.
1057  */
1058
1059 int dev_alloc_name(struct net_device *dev, const char *name)
1060 {
1061         char buf[IFNAMSIZ];
1062         struct net *net;
1063         int ret;
1064
1065         BUG_ON(!dev_net(dev));
1066         net = dev_net(dev);
1067         ret = __dev_alloc_name(net, name, buf);
1068         if (ret >= 0)
1069                 strlcpy(dev->name, buf, IFNAMSIZ);
1070         return ret;
1071 }
1072 EXPORT_SYMBOL(dev_alloc_name);
1073
1074 static int dev_alloc_name_ns(struct net *net,
1075                              struct net_device *dev,
1076                              const char *name)
1077 {
1078         char buf[IFNAMSIZ];
1079         int ret;
1080
1081         ret = __dev_alloc_name(net, name, buf);
1082         if (ret >= 0)
1083                 strlcpy(dev->name, buf, IFNAMSIZ);
1084         return ret;
1085 }
1086
1087 static int dev_get_valid_name(struct net *net,
1088                               struct net_device *dev,
1089                               const char *name)
1090 {
1091         BUG_ON(!net);
1092
1093         if (!dev_valid_name(name))
1094                 return -EINVAL;
1095
1096         if (strchr(name, '%'))
1097                 return dev_alloc_name_ns(net, dev, name);
1098         else if (__dev_get_by_name(net, name))
1099                 return -EEXIST;
1100         else if (dev->name != name)
1101                 strlcpy(dev->name, name, IFNAMSIZ);
1102
1103         return 0;
1104 }
1105
1106 /**
1107  *      dev_change_name - change name of a device
1108  *      @dev: device
1109  *      @newname: name (or format string) must be at least IFNAMSIZ
1110  *
1111  *      Change name of a device, can pass format strings "eth%d".
1112  *      for wildcarding.
1113  */
1114 int dev_change_name(struct net_device *dev, const char *newname)
1115 {
1116         unsigned char old_assign_type;
1117         char oldname[IFNAMSIZ];
1118         int err = 0;
1119         int ret;
1120         struct net *net;
1121
1122         ASSERT_RTNL();
1123         BUG_ON(!dev_net(dev));
1124
1125         net = dev_net(dev);
1126         if (dev->flags & IFF_UP)
1127                 return -EBUSY;
1128
1129         write_seqcount_begin(&devnet_rename_seq);
1130
1131         if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1132                 write_seqcount_end(&devnet_rename_seq);
1133                 return 0;
1134         }
1135
1136         memcpy(oldname, dev->name, IFNAMSIZ);
1137
1138         err = dev_get_valid_name(net, dev, newname);
1139         if (err < 0) {
1140                 write_seqcount_end(&devnet_rename_seq);
1141                 return err;
1142         }
1143
1144         if (oldname[0] && !strchr(oldname, '%'))
1145                 netdev_info(dev, "renamed from %s\n", oldname);
1146
1147         old_assign_type = dev->name_assign_type;
1148         dev->name_assign_type = NET_NAME_RENAMED;
1149
1150 rollback:
1151         ret = device_rename(&dev->dev, dev->name);
1152         if (ret) {
1153                 memcpy(dev->name, oldname, IFNAMSIZ);
1154                 dev->name_assign_type = old_assign_type;
1155                 write_seqcount_end(&devnet_rename_seq);
1156                 return ret;
1157         }
1158
1159         write_seqcount_end(&devnet_rename_seq);
1160
1161         netdev_adjacent_rename_links(dev, oldname);
1162
1163         write_lock_bh(&dev_base_lock);
1164         hlist_del_rcu(&dev->name_hlist);
1165         write_unlock_bh(&dev_base_lock);
1166
1167         synchronize_rcu();
1168
1169         write_lock_bh(&dev_base_lock);
1170         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1171         write_unlock_bh(&dev_base_lock);
1172
1173         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1174         ret = notifier_to_errno(ret);
1175
1176         if (ret) {
1177                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1178                 if (err >= 0) {
1179                         err = ret;
1180                         write_seqcount_begin(&devnet_rename_seq);
1181                         memcpy(dev->name, oldname, IFNAMSIZ);
1182                         memcpy(oldname, newname, IFNAMSIZ);
1183                         dev->name_assign_type = old_assign_type;
1184                         old_assign_type = NET_NAME_RENAMED;
1185                         goto rollback;
1186                 } else {
1187                         pr_err("%s: name change rollback failed: %d\n",
1188                                dev->name, ret);
1189                 }
1190         }
1191
1192         return err;
1193 }
1194
1195 /**
1196  *      dev_set_alias - change ifalias of a device
1197  *      @dev: device
1198  *      @alias: name up to IFALIASZ
1199  *      @len: limit of bytes to copy from info
1200  *
1201  *      Set ifalias for a device,
1202  */
1203 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1204 {
1205         char *new_ifalias;
1206
1207         ASSERT_RTNL();
1208
1209         if (len >= IFALIASZ)
1210                 return -EINVAL;
1211
1212         if (!len) {
1213                 kfree(dev->ifalias);
1214                 dev->ifalias = NULL;
1215                 return 0;
1216         }
1217
1218         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1219         if (!new_ifalias)
1220                 return -ENOMEM;
1221         dev->ifalias = new_ifalias;
1222
1223         strlcpy(dev->ifalias, alias, len+1);
1224         return len;
1225 }
1226
1227
1228 /**
1229  *      netdev_features_change - device changes features
1230  *      @dev: device to cause notification
1231  *
1232  *      Called to indicate a device has changed features.
1233  */
1234 void netdev_features_change(struct net_device *dev)
1235 {
1236         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1237 }
1238 EXPORT_SYMBOL(netdev_features_change);
1239
1240 /**
1241  *      netdev_state_change - device changes state
1242  *      @dev: device to cause notification
1243  *
1244  *      Called to indicate a device has changed state. This function calls
1245  *      the notifier chains for netdev_chain and sends a NEWLINK message
1246  *      to the routing socket.
1247  */
1248 void netdev_state_change(struct net_device *dev)
1249 {
1250         if (dev->flags & IFF_UP) {
1251                 struct netdev_notifier_change_info change_info;
1252
1253                 change_info.flags_changed = 0;
1254                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1255                                               &change_info.info);
1256                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1257         }
1258 }
1259 EXPORT_SYMBOL(netdev_state_change);
1260
1261 /**
1262  *      netdev_notify_peers - notify network peers about existence of @dev
1263  *      @dev: network device
1264  *
1265  * Generate traffic such that interested network peers are aware of
1266  * @dev, such as by generating a gratuitous ARP. This may be used when
1267  * a device wants to inform the rest of the network about some sort of
1268  * reconfiguration such as a failover event or virtual machine
1269  * migration.
1270  */
1271 void netdev_notify_peers(struct net_device *dev)
1272 {
1273         rtnl_lock();
1274         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1275         rtnl_unlock();
1276 }
1277 EXPORT_SYMBOL(netdev_notify_peers);
1278
1279 static int __dev_open(struct net_device *dev)
1280 {
1281         const struct net_device_ops *ops = dev->netdev_ops;
1282         int ret;
1283
1284         ASSERT_RTNL();
1285
1286         if (!netif_device_present(dev))
1287                 return -ENODEV;
1288
1289         /* Block netpoll from trying to do any rx path servicing.
1290          * If we don't do this there is a chance ndo_poll_controller
1291          * or ndo_poll may be running while we open the device
1292          */
1293         netpoll_poll_disable(dev);
1294
1295         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1296         ret = notifier_to_errno(ret);
1297         if (ret)
1298                 return ret;
1299
1300         set_bit(__LINK_STATE_START, &dev->state);
1301
1302         if (ops->ndo_validate_addr)
1303                 ret = ops->ndo_validate_addr(dev);
1304
1305         if (!ret && ops->ndo_open)
1306                 ret = ops->ndo_open(dev);
1307
1308         netpoll_poll_enable(dev);
1309
1310         if (ret)
1311                 clear_bit(__LINK_STATE_START, &dev->state);
1312         else {
1313                 dev->flags |= IFF_UP;
1314                 dev_set_rx_mode(dev);
1315                 dev_activate(dev);
1316                 add_device_randomness(dev->dev_addr, dev->addr_len);
1317         }
1318
1319         return ret;
1320 }
1321
1322 /**
1323  *      dev_open        - prepare an interface for use.
1324  *      @dev:   device to open
1325  *
1326  *      Takes a device from down to up state. The device's private open
1327  *      function is invoked and then the multicast lists are loaded. Finally
1328  *      the device is moved into the up state and a %NETDEV_UP message is
1329  *      sent to the netdev notifier chain.
1330  *
1331  *      Calling this function on an active interface is a nop. On a failure
1332  *      a negative errno code is returned.
1333  */
1334 int dev_open(struct net_device *dev)
1335 {
1336         int ret;
1337
1338         if (dev->flags & IFF_UP)
1339                 return 0;
1340
1341         ret = __dev_open(dev);
1342         if (ret < 0)
1343                 return ret;
1344
1345         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1346         call_netdevice_notifiers(NETDEV_UP, dev);
1347
1348         return ret;
1349 }
1350 EXPORT_SYMBOL(dev_open);
1351
1352 static int __dev_close_many(struct list_head *head)
1353 {
1354         struct net_device *dev;
1355
1356         ASSERT_RTNL();
1357         might_sleep();
1358
1359         list_for_each_entry(dev, head, close_list) {
1360                 /* Temporarily disable netpoll until the interface is down */
1361                 netpoll_poll_disable(dev);
1362
1363                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1364
1365                 clear_bit(__LINK_STATE_START, &dev->state);
1366
1367                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1368                  * can be even on different cpu. So just clear netif_running().
1369                  *
1370                  * dev->stop() will invoke napi_disable() on all of it's
1371                  * napi_struct instances on this device.
1372                  */
1373                 smp_mb__after_atomic(); /* Commit netif_running(). */
1374         }
1375
1376         dev_deactivate_many(head);
1377
1378         list_for_each_entry(dev, head, close_list) {
1379                 const struct net_device_ops *ops = dev->netdev_ops;
1380
1381                 /*
1382                  *      Call the device specific close. This cannot fail.
1383                  *      Only if device is UP
1384                  *
1385                  *      We allow it to be called even after a DETACH hot-plug
1386                  *      event.
1387                  */
1388                 if (ops->ndo_stop)
1389                         ops->ndo_stop(dev);
1390
1391                 dev->flags &= ~IFF_UP;
1392                 netpoll_poll_enable(dev);
1393         }
1394
1395         return 0;
1396 }
1397
1398 static int __dev_close(struct net_device *dev)
1399 {
1400         int retval;
1401         LIST_HEAD(single);
1402
1403         list_add(&dev->close_list, &single);
1404         retval = __dev_close_many(&single);
1405         list_del(&single);
1406
1407         return retval;
1408 }
1409
1410 int dev_close_many(struct list_head *head, bool unlink)
1411 {
1412         struct net_device *dev, *tmp;
1413
1414         /* Remove the devices that don't need to be closed */
1415         list_for_each_entry_safe(dev, tmp, head, close_list)
1416                 if (!(dev->flags & IFF_UP))
1417                         list_del_init(&dev->close_list);
1418
1419         __dev_close_many(head);
1420
1421         list_for_each_entry_safe(dev, tmp, head, close_list) {
1422                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1423                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1424                 if (unlink)
1425                         list_del_init(&dev->close_list);
1426         }
1427
1428         return 0;
1429 }
1430 EXPORT_SYMBOL(dev_close_many);
1431
1432 /**
1433  *      dev_close - shutdown an interface.
1434  *      @dev: device to shutdown
1435  *
1436  *      This function moves an active device into down state. A
1437  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1438  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1439  *      chain.
1440  */
1441 int dev_close(struct net_device *dev)
1442 {
1443         if (dev->flags & IFF_UP) {
1444                 LIST_HEAD(single);
1445
1446                 list_add(&dev->close_list, &single);
1447                 dev_close_many(&single, true);
1448                 list_del(&single);
1449         }
1450         return 0;
1451 }
1452 EXPORT_SYMBOL(dev_close);
1453
1454
1455 /**
1456  *      dev_disable_lro - disable Large Receive Offload on a device
1457  *      @dev: device
1458  *
1459  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1460  *      called under RTNL.  This is needed if received packets may be
1461  *      forwarded to another interface.
1462  */
1463 void dev_disable_lro(struct net_device *dev)
1464 {
1465         struct net_device *lower_dev;
1466         struct list_head *iter;
1467
1468         dev->wanted_features &= ~NETIF_F_LRO;
1469         netdev_update_features(dev);
1470
1471         if (unlikely(dev->features & NETIF_F_LRO))
1472                 netdev_WARN(dev, "failed to disable LRO!\n");
1473
1474         netdev_for_each_lower_dev(dev, lower_dev, iter)
1475                 dev_disable_lro(lower_dev);
1476 }
1477 EXPORT_SYMBOL(dev_disable_lro);
1478
1479 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1480                                    struct net_device *dev)
1481 {
1482         struct netdev_notifier_info info;
1483
1484         netdev_notifier_info_init(&info, dev);
1485         return nb->notifier_call(nb, val, &info);
1486 }
1487
1488 static int dev_boot_phase = 1;
1489
1490 /**
1491  *      register_netdevice_notifier - register a network notifier block
1492  *      @nb: notifier
1493  *
1494  *      Register a notifier to be called when network device events occur.
1495  *      The notifier passed is linked into the kernel structures and must
1496  *      not be reused until it has been unregistered. A negative errno code
1497  *      is returned on a failure.
1498  *
1499  *      When registered all registration and up events are replayed
1500  *      to the new notifier to allow device to have a race free
1501  *      view of the network device list.
1502  */
1503
1504 int register_netdevice_notifier(struct notifier_block *nb)
1505 {
1506         struct net_device *dev;
1507         struct net_device *last;
1508         struct net *net;
1509         int err;
1510
1511         rtnl_lock();
1512         err = raw_notifier_chain_register(&netdev_chain, nb);
1513         if (err)
1514                 goto unlock;
1515         if (dev_boot_phase)
1516                 goto unlock;
1517         for_each_net(net) {
1518                 for_each_netdev(net, dev) {
1519                         err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1520                         err = notifier_to_errno(err);
1521                         if (err)
1522                                 goto rollback;
1523
1524                         if (!(dev->flags & IFF_UP))
1525                                 continue;
1526
1527                         call_netdevice_notifier(nb, NETDEV_UP, dev);
1528                 }
1529         }
1530
1531 unlock:
1532         rtnl_unlock();
1533         return err;
1534
1535 rollback:
1536         last = dev;
1537         for_each_net(net) {
1538                 for_each_netdev(net, dev) {
1539                         if (dev == last)
1540                                 goto outroll;
1541
1542                         if (dev->flags & IFF_UP) {
1543                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1544                                                         dev);
1545                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1546                         }
1547                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1548                 }
1549         }
1550
1551 outroll:
1552         raw_notifier_chain_unregister(&netdev_chain, nb);
1553         goto unlock;
1554 }
1555 EXPORT_SYMBOL(register_netdevice_notifier);
1556
1557 /**
1558  *      unregister_netdevice_notifier - unregister a network notifier block
1559  *      @nb: notifier
1560  *
1561  *      Unregister a notifier previously registered by
1562  *      register_netdevice_notifier(). The notifier is unlinked into the
1563  *      kernel structures and may then be reused. A negative errno code
1564  *      is returned on a failure.
1565  *
1566  *      After unregistering unregister and down device events are synthesized
1567  *      for all devices on the device list to the removed notifier to remove
1568  *      the need for special case cleanup code.
1569  */
1570
1571 int unregister_netdevice_notifier(struct notifier_block *nb)
1572 {
1573         struct net_device *dev;
1574         struct net *net;
1575         int err;
1576
1577         rtnl_lock();
1578         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1579         if (err)
1580                 goto unlock;
1581
1582         for_each_net(net) {
1583                 for_each_netdev(net, dev) {
1584                         if (dev->flags & IFF_UP) {
1585                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1586                                                         dev);
1587                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1588                         }
1589                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1590                 }
1591         }
1592 unlock:
1593         rtnl_unlock();
1594         return err;
1595 }
1596 EXPORT_SYMBOL(unregister_netdevice_notifier);
1597
1598 /**
1599  *      call_netdevice_notifiers_info - call all network notifier blocks
1600  *      @val: value passed unmodified to notifier function
1601  *      @dev: net_device pointer passed unmodified to notifier function
1602  *      @info: notifier information data
1603  *
1604  *      Call all network notifier blocks.  Parameters and return value
1605  *      are as for raw_notifier_call_chain().
1606  */
1607
1608 static int call_netdevice_notifiers_info(unsigned long val,
1609                                          struct net_device *dev,
1610                                          struct netdev_notifier_info *info)
1611 {
1612         ASSERT_RTNL();
1613         netdev_notifier_info_init(info, dev);
1614         return raw_notifier_call_chain(&netdev_chain, val, info);
1615 }
1616
1617 /**
1618  *      call_netdevice_notifiers - call all network notifier blocks
1619  *      @val: value passed unmodified to notifier function
1620  *      @dev: net_device pointer passed unmodified to notifier function
1621  *
1622  *      Call all network notifier blocks.  Parameters and return value
1623  *      are as for raw_notifier_call_chain().
1624  */
1625
1626 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1627 {
1628         struct netdev_notifier_info info;
1629
1630         return call_netdevice_notifiers_info(val, dev, &info);
1631 }
1632 EXPORT_SYMBOL(call_netdevice_notifiers);
1633
1634 #ifdef CONFIG_NET_INGRESS
1635 static struct static_key ingress_needed __read_mostly;
1636
1637 void net_inc_ingress_queue(void)
1638 {
1639         static_key_slow_inc(&ingress_needed);
1640 }
1641 EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
1642
1643 void net_dec_ingress_queue(void)
1644 {
1645         static_key_slow_dec(&ingress_needed);
1646 }
1647 EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
1648 #endif
1649
1650 static struct static_key netstamp_needed __read_mostly;
1651 #ifdef HAVE_JUMP_LABEL
1652 /* We are not allowed to call static_key_slow_dec() from irq context
1653  * If net_disable_timestamp() is called from irq context, defer the
1654  * static_key_slow_dec() calls.
1655  */
1656 static atomic_t netstamp_needed_deferred;
1657 #endif
1658
1659 void net_enable_timestamp(void)
1660 {
1661 #ifdef HAVE_JUMP_LABEL
1662         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1663
1664         if (deferred) {
1665                 while (--deferred)
1666                         static_key_slow_dec(&netstamp_needed);
1667                 return;
1668         }
1669 #endif
1670         static_key_slow_inc(&netstamp_needed);
1671 }
1672 EXPORT_SYMBOL(net_enable_timestamp);
1673
1674 void net_disable_timestamp(void)
1675 {
1676 #ifdef HAVE_JUMP_LABEL
1677         if (in_interrupt()) {
1678                 atomic_inc(&netstamp_needed_deferred);
1679                 return;
1680         }
1681 #endif
1682         static_key_slow_dec(&netstamp_needed);
1683 }
1684 EXPORT_SYMBOL(net_disable_timestamp);
1685
1686 static inline void net_timestamp_set(struct sk_buff *skb)
1687 {
1688         skb->tstamp.tv64 = 0;
1689         if (static_key_false(&netstamp_needed))
1690                 __net_timestamp(skb);
1691 }
1692
1693 #define net_timestamp_check(COND, SKB)                  \
1694         if (static_key_false(&netstamp_needed)) {               \
1695                 if ((COND) && !(SKB)->tstamp.tv64)      \
1696                         __net_timestamp(SKB);           \
1697         }                                               \
1698
1699 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1700 {
1701         unsigned int len;
1702
1703         if (!(dev->flags & IFF_UP))
1704                 return false;
1705
1706         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1707         if (skb->len <= len)
1708                 return true;
1709
1710         /* if TSO is enabled, we don't care about the length as the packet
1711          * could be forwarded without being segmented before
1712          */
1713         if (skb_is_gso(skb))
1714                 return true;
1715
1716         return false;
1717 }
1718 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1719
1720 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1721 {
1722         if (skb_orphan_frags(skb, GFP_ATOMIC) ||
1723             unlikely(!is_skb_forwardable(dev, skb))) {
1724                 atomic_long_inc(&dev->rx_dropped);
1725                 kfree_skb(skb);
1726                 return NET_RX_DROP;
1727         }
1728
1729         skb_scrub_packet(skb, true);
1730         skb->priority = 0;
1731         skb->protocol = eth_type_trans(skb, dev);
1732         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1733
1734         return 0;
1735 }
1736 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1737
1738 /**
1739  * dev_forward_skb - loopback an skb to another netif
1740  *
1741  * @dev: destination network device
1742  * @skb: buffer to forward
1743  *
1744  * return values:
1745  *      NET_RX_SUCCESS  (no congestion)
1746  *      NET_RX_DROP     (packet was dropped, but freed)
1747  *
1748  * dev_forward_skb can be used for injecting an skb from the
1749  * start_xmit function of one device into the receive queue
1750  * of another device.
1751  *
1752  * The receiving device may be in another namespace, so
1753  * we have to clear all information in the skb that could
1754  * impact namespace isolation.
1755  */
1756 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1757 {
1758         return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1759 }
1760 EXPORT_SYMBOL_GPL(dev_forward_skb);
1761
1762 static inline int deliver_skb(struct sk_buff *skb,
1763                               struct packet_type *pt_prev,
1764                               struct net_device *orig_dev)
1765 {
1766         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1767                 return -ENOMEM;
1768         atomic_inc(&skb->users);
1769         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1770 }
1771
1772 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1773                                           struct packet_type **pt,
1774                                           struct net_device *orig_dev,
1775                                           __be16 type,
1776                                           struct list_head *ptype_list)
1777 {
1778         struct packet_type *ptype, *pt_prev = *pt;
1779
1780         list_for_each_entry_rcu(ptype, ptype_list, list) {
1781                 if (ptype->type != type)
1782                         continue;
1783                 if (pt_prev)
1784                         deliver_skb(skb, pt_prev, orig_dev);
1785                 pt_prev = ptype;
1786         }
1787         *pt = pt_prev;
1788 }
1789
1790 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1791 {
1792         if (!ptype->af_packet_priv || !skb->sk)
1793                 return false;
1794
1795         if (ptype->id_match)
1796                 return ptype->id_match(ptype, skb->sk);
1797         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1798                 return true;
1799
1800         return false;
1801 }
1802
1803 /*
1804  *      Support routine. Sends outgoing frames to any network
1805  *      taps currently in use.
1806  */
1807
1808 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1809 {
1810         struct packet_type *ptype;
1811         struct sk_buff *skb2 = NULL;
1812         struct packet_type *pt_prev = NULL;
1813         struct list_head *ptype_list = &ptype_all;
1814
1815         rcu_read_lock();
1816 again:
1817         list_for_each_entry_rcu(ptype, ptype_list, list) {
1818                 /* Never send packets back to the socket
1819                  * they originated from - MvS (miquels@drinkel.ow.org)
1820                  */
1821                 if (skb_loop_sk(ptype, skb))
1822                         continue;
1823
1824                 if (pt_prev) {
1825                         deliver_skb(skb2, pt_prev, skb->dev);
1826                         pt_prev = ptype;
1827                         continue;
1828                 }
1829
1830                 /* need to clone skb, done only once */
1831                 skb2 = skb_clone(skb, GFP_ATOMIC);
1832                 if (!skb2)
1833                         goto out_unlock;
1834
1835                 net_timestamp_set(skb2);
1836
1837                 /* skb->nh should be correctly
1838                  * set by sender, so that the second statement is
1839                  * just protection against buggy protocols.
1840                  */
1841                 skb_reset_mac_header(skb2);
1842
1843                 if (skb_network_header(skb2) < skb2->data ||
1844                     skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1845                         net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1846                                              ntohs(skb2->protocol),
1847                                              dev->name);
1848                         skb_reset_network_header(skb2);
1849                 }
1850
1851                 skb2->transport_header = skb2->network_header;
1852                 skb2->pkt_type = PACKET_OUTGOING;
1853                 pt_prev = ptype;
1854         }
1855
1856         if (ptype_list == &ptype_all) {
1857                 ptype_list = &dev->ptype_all;
1858                 goto again;
1859         }
1860 out_unlock:
1861         if (pt_prev)
1862                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1863         rcu_read_unlock();
1864 }
1865
1866 /**
1867  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1868  * @dev: Network device
1869  * @txq: number of queues available
1870  *
1871  * If real_num_tx_queues is changed the tc mappings may no longer be
1872  * valid. To resolve this verify the tc mapping remains valid and if
1873  * not NULL the mapping. With no priorities mapping to this
1874  * offset/count pair it will no longer be used. In the worst case TC0
1875  * is invalid nothing can be done so disable priority mappings. If is
1876  * expected that drivers will fix this mapping if they can before
1877  * calling netif_set_real_num_tx_queues.
1878  */
1879 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1880 {
1881         int i;
1882         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1883
1884         /* If TC0 is invalidated disable TC mapping */
1885         if (tc->offset + tc->count > txq) {
1886                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1887                 dev->num_tc = 0;
1888                 return;
1889         }
1890
1891         /* Invalidated prio to tc mappings set to TC0 */
1892         for (i = 1; i < TC_BITMASK + 1; i++) {
1893                 int q = netdev_get_prio_tc_map(dev, i);
1894
1895                 tc = &dev->tc_to_txq[q];
1896                 if (tc->offset + tc->count > txq) {
1897                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1898                                 i, q);
1899                         netdev_set_prio_tc_map(dev, i, 0);
1900                 }
1901         }
1902 }
1903
1904 #ifdef CONFIG_XPS
1905 static DEFINE_MUTEX(xps_map_mutex);
1906 #define xmap_dereference(P)             \
1907         rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1908
1909 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1910                                         int cpu, u16 index)
1911 {
1912         struct xps_map *map = NULL;
1913         int pos;
1914
1915         if (dev_maps)
1916                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1917
1918         for (pos = 0; map && pos < map->len; pos++) {
1919                 if (map->queues[pos] == index) {
1920                         if (map->len > 1) {
1921                                 map->queues[pos] = map->queues[--map->len];
1922                         } else {
1923                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1924                                 kfree_rcu(map, rcu);
1925                                 map = NULL;
1926                         }
1927                         break;
1928                 }
1929         }
1930
1931         return map;
1932 }
1933
1934 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1935 {
1936         struct xps_dev_maps *dev_maps;
1937         int cpu, i;
1938         bool active = false;
1939
1940         mutex_lock(&xps_map_mutex);
1941         dev_maps = xmap_dereference(dev->xps_maps);
1942
1943         if (!dev_maps)
1944                 goto out_no_maps;
1945
1946         for_each_possible_cpu(cpu) {
1947                 for (i = index; i < dev->num_tx_queues; i++) {
1948                         if (!remove_xps_queue(dev_maps, cpu, i))
1949                                 break;
1950                 }
1951                 if (i == dev->num_tx_queues)
1952                         active = true;
1953         }
1954
1955         if (!active) {
1956                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1957                 kfree_rcu(dev_maps, rcu);
1958         }
1959
1960         for (i = index; i < dev->num_tx_queues; i++)
1961                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1962                                              NUMA_NO_NODE);
1963
1964 out_no_maps:
1965         mutex_unlock(&xps_map_mutex);
1966 }
1967
1968 static struct xps_map *expand_xps_map(struct xps_map *map,
1969                                       int cpu, u16 index)
1970 {
1971         struct xps_map *new_map;
1972         int alloc_len = XPS_MIN_MAP_ALLOC;
1973         int i, pos;
1974
1975         for (pos = 0; map && pos < map->len; pos++) {
1976                 if (map->queues[pos] != index)
1977                         continue;
1978                 return map;
1979         }
1980
1981         /* Need to add queue to this CPU's existing map */
1982         if (map) {
1983                 if (pos < map->alloc_len)
1984                         return map;
1985
1986                 alloc_len = map->alloc_len * 2;
1987         }
1988
1989         /* Need to allocate new map to store queue on this CPU's map */
1990         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1991                                cpu_to_node(cpu));
1992         if (!new_map)
1993                 return NULL;
1994
1995         for (i = 0; i < pos; i++)
1996                 new_map->queues[i] = map->queues[i];
1997         new_map->alloc_len = alloc_len;
1998         new_map->len = pos;
1999
2000         return new_map;
2001 }
2002
2003 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2004                         u16 index)
2005 {
2006         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2007         struct xps_map *map, *new_map;
2008         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
2009         int cpu, numa_node_id = -2;
2010         bool active = false;
2011
2012         mutex_lock(&xps_map_mutex);
2013
2014         dev_maps = xmap_dereference(dev->xps_maps);
2015
2016         /* allocate memory for queue storage */
2017         for_each_online_cpu(cpu) {
2018                 if (!cpumask_test_cpu(cpu, mask))
2019                         continue;
2020
2021                 if (!new_dev_maps)
2022                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2023                 if (!new_dev_maps) {
2024                         mutex_unlock(&xps_map_mutex);
2025                         return -ENOMEM;
2026                 }
2027
2028                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2029                                  NULL;
2030
2031                 map = expand_xps_map(map, cpu, index);
2032                 if (!map)
2033                         goto error;
2034
2035                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2036         }
2037
2038         if (!new_dev_maps)
2039                 goto out_no_new_maps;
2040
2041         for_each_possible_cpu(cpu) {
2042                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2043                         /* add queue to CPU maps */
2044                         int pos = 0;
2045
2046                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2047                         while ((pos < map->len) && (map->queues[pos] != index))
2048                                 pos++;
2049
2050                         if (pos == map->len)
2051                                 map->queues[map->len++] = index;
2052 #ifdef CONFIG_NUMA
2053                         if (numa_node_id == -2)
2054                                 numa_node_id = cpu_to_node(cpu);
2055                         else if (numa_node_id != cpu_to_node(cpu))
2056                                 numa_node_id = -1;
2057 #endif
2058                 } else if (dev_maps) {
2059                         /* fill in the new device map from the old device map */
2060                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2061                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2062                 }
2063
2064         }
2065
2066         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2067
2068         /* Cleanup old maps */
2069         if (dev_maps) {
2070                 for_each_possible_cpu(cpu) {
2071                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2072                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2073                         if (map && map != new_map)
2074                                 kfree_rcu(map, rcu);
2075                 }
2076
2077                 kfree_rcu(dev_maps, rcu);
2078         }
2079
2080         dev_maps = new_dev_maps;
2081         active = true;
2082
2083 out_no_new_maps:
2084         /* update Tx queue numa node */
2085         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2086                                      (numa_node_id >= 0) ? numa_node_id :
2087                                      NUMA_NO_NODE);
2088
2089         if (!dev_maps)
2090                 goto out_no_maps;
2091
2092         /* removes queue from unused CPUs */
2093         for_each_possible_cpu(cpu) {
2094                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2095                         continue;
2096
2097                 if (remove_xps_queue(dev_maps, cpu, index))
2098                         active = true;
2099         }
2100
2101         /* free map if not active */
2102         if (!active) {
2103                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2104                 kfree_rcu(dev_maps, rcu);
2105         }
2106
2107 out_no_maps:
2108         mutex_unlock(&xps_map_mutex);
2109
2110         return 0;
2111 error:
2112         /* remove any maps that we added */
2113         for_each_possible_cpu(cpu) {
2114                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2115                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2116                                  NULL;
2117                 if (new_map && new_map != map)
2118                         kfree(new_map);
2119         }
2120
2121         mutex_unlock(&xps_map_mutex);
2122
2123         kfree(new_dev_maps);
2124         return -ENOMEM;
2125 }
2126 EXPORT_SYMBOL(netif_set_xps_queue);
2127
2128 #endif
2129 /*
2130  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2131  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2132  */
2133 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2134 {
2135         int rc;
2136
2137         if (txq < 1 || txq > dev->num_tx_queues)
2138                 return -EINVAL;
2139
2140         if (dev->reg_state == NETREG_REGISTERED ||
2141             dev->reg_state == NETREG_UNREGISTERING) {
2142                 ASSERT_RTNL();
2143
2144                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2145                                                   txq);
2146                 if (rc)
2147                         return rc;
2148
2149                 if (dev->num_tc)
2150                         netif_setup_tc(dev, txq);
2151
2152                 if (txq < dev->real_num_tx_queues) {
2153                         qdisc_reset_all_tx_gt(dev, txq);
2154 #ifdef CONFIG_XPS
2155                         netif_reset_xps_queues_gt(dev, txq);
2156 #endif
2157                 }
2158         }
2159
2160         dev->real_num_tx_queues = txq;
2161         return 0;
2162 }
2163 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2164
2165 #ifdef CONFIG_SYSFS
2166 /**
2167  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2168  *      @dev: Network device
2169  *      @rxq: Actual number of RX queues
2170  *
2171  *      This must be called either with the rtnl_lock held or before
2172  *      registration of the net device.  Returns 0 on success, or a
2173  *      negative error code.  If called before registration, it always
2174  *      succeeds.
2175  */
2176 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2177 {
2178         int rc;
2179
2180         if (rxq < 1 || rxq > dev->num_rx_queues)
2181                 return -EINVAL;
2182
2183         if (dev->reg_state == NETREG_REGISTERED) {
2184                 ASSERT_RTNL();
2185
2186                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2187                                                   rxq);
2188                 if (rc)
2189                         return rc;
2190         }
2191
2192         dev->real_num_rx_queues = rxq;
2193         return 0;
2194 }
2195 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2196 #endif
2197
2198 /**
2199  * netif_get_num_default_rss_queues - default number of RSS queues
2200  *
2201  * This routine should set an upper limit on the number of RSS queues
2202  * used by default by multiqueue devices.
2203  */
2204 int netif_get_num_default_rss_queues(void)
2205 {
2206         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2207 }
2208 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2209
2210 static inline void __netif_reschedule(struct Qdisc *q)
2211 {
2212         struct softnet_data *sd;
2213         unsigned long flags;
2214
2215         local_irq_save(flags);
2216         sd = this_cpu_ptr(&softnet_data);
2217         q->next_sched = NULL;
2218         *sd->output_queue_tailp = q;
2219         sd->output_queue_tailp = &q->next_sched;
2220         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2221         local_irq_restore(flags);
2222 }
2223
2224 void __netif_schedule(struct Qdisc *q)
2225 {
2226         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2227                 __netif_reschedule(q);
2228 }
2229 EXPORT_SYMBOL(__netif_schedule);
2230
2231 struct dev_kfree_skb_cb {
2232         enum skb_free_reason reason;
2233 };
2234
2235 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2236 {
2237         return (struct dev_kfree_skb_cb *)skb->cb;
2238 }
2239
2240 void netif_schedule_queue(struct netdev_queue *txq)
2241 {
2242         rcu_read_lock();
2243         if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2244                 struct Qdisc *q = rcu_dereference(txq->qdisc);
2245
2246                 __netif_schedule(q);
2247         }
2248         rcu_read_unlock();
2249 }
2250 EXPORT_SYMBOL(netif_schedule_queue);
2251
2252 /**
2253  *      netif_wake_subqueue - allow sending packets on subqueue
2254  *      @dev: network device
2255  *      @queue_index: sub queue index
2256  *
2257  * Resume individual transmit queue of a device with multiple transmit queues.
2258  */
2259 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2260 {
2261         struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2262
2263         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2264                 struct Qdisc *q;
2265
2266                 rcu_read_lock();
2267                 q = rcu_dereference(txq->qdisc);
2268                 __netif_schedule(q);
2269                 rcu_read_unlock();
2270         }
2271 }
2272 EXPORT_SYMBOL(netif_wake_subqueue);
2273
2274 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2275 {
2276         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2277                 struct Qdisc *q;
2278
2279                 rcu_read_lock();
2280                 q = rcu_dereference(dev_queue->qdisc);
2281                 __netif_schedule(q);
2282                 rcu_read_unlock();
2283         }
2284 }
2285 EXPORT_SYMBOL(netif_tx_wake_queue);
2286
2287 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2288 {
2289         unsigned long flags;
2290
2291         if (likely(atomic_read(&skb->users) == 1)) {
2292                 smp_rmb();
2293                 atomic_set(&skb->users, 0);
2294         } else if (likely(!atomic_dec_and_test(&skb->users))) {
2295                 return;
2296         }
2297         get_kfree_skb_cb(skb)->reason = reason;
2298         local_irq_save(flags);
2299         skb->next = __this_cpu_read(softnet_data.completion_queue);
2300         __this_cpu_write(softnet_data.completion_queue, skb);
2301         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2302         local_irq_restore(flags);
2303 }
2304 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2305
2306 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2307 {
2308         if (in_irq() || irqs_disabled())
2309                 __dev_kfree_skb_irq(skb, reason);
2310         else
2311                 dev_kfree_skb(skb);
2312 }
2313 EXPORT_SYMBOL(__dev_kfree_skb_any);
2314
2315
2316 /**
2317  * netif_device_detach - mark device as removed
2318  * @dev: network device
2319  *
2320  * Mark device as removed from system and therefore no longer available.
2321  */
2322 void netif_device_detach(struct net_device *dev)
2323 {
2324         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2325             netif_running(dev)) {
2326                 netif_tx_stop_all_queues(dev);
2327         }
2328 }
2329 EXPORT_SYMBOL(netif_device_detach);
2330
2331 /**
2332  * netif_device_attach - mark device as attached
2333  * @dev: network device
2334  *
2335  * Mark device as attached from system and restart if needed.
2336  */
2337 void netif_device_attach(struct net_device *dev)
2338 {
2339         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2340             netif_running(dev)) {
2341                 netif_tx_wake_all_queues(dev);
2342                 __netdev_watchdog_up(dev);
2343         }
2344 }
2345 EXPORT_SYMBOL(netif_device_attach);
2346
2347 /*
2348  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2349  * to be used as a distribution range.
2350  */
2351 u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2352                   unsigned int num_tx_queues)
2353 {
2354         u32 hash;
2355         u16 qoffset = 0;
2356         u16 qcount = num_tx_queues;
2357
2358         if (skb_rx_queue_recorded(skb)) {
2359                 hash = skb_get_rx_queue(skb);
2360                 while (unlikely(hash >= num_tx_queues))
2361                         hash -= num_tx_queues;
2362                 return hash;
2363         }
2364
2365         if (dev->num_tc) {
2366                 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2367                 qoffset = dev->tc_to_txq[tc].offset;
2368                 qcount = dev->tc_to_txq[tc].count;
2369         }
2370
2371         return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2372 }
2373 EXPORT_SYMBOL(__skb_tx_hash);
2374
2375 static void skb_warn_bad_offload(const struct sk_buff *skb)
2376 {
2377         static const netdev_features_t null_features = 0;
2378         struct net_device *dev = skb->dev;
2379         const char *driver = "";
2380
2381         if (!net_ratelimit())
2382                 return;
2383
2384         if (dev && dev->dev.parent)
2385                 driver = dev_driver_string(dev->dev.parent);
2386
2387         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2388              "gso_type=%d ip_summed=%d\n",
2389              driver, dev ? &dev->features : &null_features,
2390              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2391              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2392              skb_shinfo(skb)->gso_type, skb->ip_summed);
2393 }
2394
2395 /*
2396  * Invalidate hardware checksum when packet is to be mangled, and
2397  * complete checksum manually on outgoing path.
2398  */
2399 int skb_checksum_help(struct sk_buff *skb)
2400 {
2401         __wsum csum;
2402         int ret = 0, offset;
2403
2404         if (skb->ip_summed == CHECKSUM_COMPLETE)
2405                 goto out_set_summed;
2406
2407         if (unlikely(skb_shinfo(skb)->gso_size)) {
2408                 skb_warn_bad_offload(skb);
2409                 return -EINVAL;
2410         }
2411
2412         /* Before computing a checksum, we should make sure no frag could
2413          * be modified by an external entity : checksum could be wrong.
2414          */
2415         if (skb_has_shared_frag(skb)) {
2416                 ret = __skb_linearize(skb);
2417                 if (ret)
2418                         goto out;
2419         }
2420
2421         offset = skb_checksum_start_offset(skb);
2422         BUG_ON(offset >= skb_headlen(skb));
2423         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2424
2425         offset += skb->csum_offset;
2426         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2427
2428         if (skb_cloned(skb) &&
2429             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2430                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2431                 if (ret)
2432                         goto out;
2433         }
2434
2435         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2436 out_set_summed:
2437         skb->ip_summed = CHECKSUM_NONE;
2438 out:
2439         return ret;
2440 }
2441 EXPORT_SYMBOL(skb_checksum_help);
2442
2443 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2444 {
2445         __be16 type = skb->protocol;
2446
2447         /* Tunnel gso handlers can set protocol to ethernet. */
2448         if (type == htons(ETH_P_TEB)) {
2449                 struct ethhdr *eth;
2450
2451                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2452                         return 0;
2453
2454                 eth = (struct ethhdr *)skb_mac_header(skb);
2455                 type = eth->h_proto;
2456         }
2457
2458         return __vlan_get_protocol(skb, type, depth);
2459 }
2460
2461 /**
2462  *      skb_mac_gso_segment - mac layer segmentation handler.
2463  *      @skb: buffer to segment
2464  *      @features: features for the output path (see dev->features)
2465  */
2466 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2467                                     netdev_features_t features)
2468 {
2469         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2470         struct packet_offload *ptype;
2471         int vlan_depth = skb->mac_len;
2472         __be16 type = skb_network_protocol(skb, &vlan_depth);
2473
2474         if (unlikely(!type))
2475                 return ERR_PTR(-EINVAL);
2476
2477         __skb_pull(skb, vlan_depth);
2478
2479         rcu_read_lock();
2480         list_for_each_entry_rcu(ptype, &offload_base, list) {
2481                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2482                         segs = ptype->callbacks.gso_segment(skb, features);
2483                         break;
2484                 }
2485         }
2486         rcu_read_unlock();
2487
2488         __skb_push(skb, skb->data - skb_mac_header(skb));
2489
2490         return segs;
2491 }
2492 EXPORT_SYMBOL(skb_mac_gso_segment);
2493
2494
2495 /* openvswitch calls this on rx path, so we need a different check.
2496  */
2497 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2498 {
2499         if (tx_path)
2500                 return skb->ip_summed != CHECKSUM_PARTIAL;
2501         else
2502                 return skb->ip_summed == CHECKSUM_NONE;
2503 }
2504
2505 /**
2506  *      __skb_gso_segment - Perform segmentation on skb.
2507  *      @skb: buffer to segment
2508  *      @features: features for the output path (see dev->features)
2509  *      @tx_path: whether it is called in TX path
2510  *
2511  *      This function segments the given skb and returns a list of segments.
2512  *
2513  *      It may return NULL if the skb requires no segmentation.  This is
2514  *      only possible when GSO is used for verifying header integrity.
2515  */
2516 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2517                                   netdev_features_t features, bool tx_path)
2518 {
2519         if (unlikely(skb_needs_check(skb, tx_path))) {
2520                 int err;
2521
2522                 skb_warn_bad_offload(skb);
2523
2524                 err = skb_cow_head(skb, 0);
2525                 if (err < 0)
2526                         return ERR_PTR(err);
2527         }
2528
2529         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2530         SKB_GSO_CB(skb)->encap_level = 0;
2531
2532         skb_reset_mac_header(skb);
2533         skb_reset_mac_len(skb);
2534
2535         return skb_mac_gso_segment(skb, features);
2536 }
2537 EXPORT_SYMBOL(__skb_gso_segment);
2538
2539 /* Take action when hardware reception checksum errors are detected. */
2540 #ifdef CONFIG_BUG
2541 void netdev_rx_csum_fault(struct net_device *dev)
2542 {
2543         if (net_ratelimit()) {
2544                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2545                 dump_stack();
2546         }
2547 }
2548 EXPORT_SYMBOL(netdev_rx_csum_fault);
2549 #endif
2550
2551 /* Actually, we should eliminate this check as soon as we know, that:
2552  * 1. IOMMU is present and allows to map all the memory.
2553  * 2. No high memory really exists on this machine.
2554  */
2555
2556 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2557 {
2558 #ifdef CONFIG_HIGHMEM
2559         int i;
2560         if (!(dev->features & NETIF_F_HIGHDMA)) {
2561                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2562                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2563                         if (PageHighMem(skb_frag_page(frag)))
2564                                 return 1;
2565                 }
2566         }
2567
2568         if (PCI_DMA_BUS_IS_PHYS) {
2569                 struct device *pdev = dev->dev.parent;
2570
2571                 if (!pdev)
2572                         return 0;
2573                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2574                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2575                         dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2576                         if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2577                                 return 1;
2578                 }
2579         }
2580 #endif
2581         return 0;
2582 }
2583
2584 /* If MPLS offload request, verify we are testing hardware MPLS features
2585  * instead of standard features for the netdev.
2586  */
2587 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2588 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2589                                            netdev_features_t features,
2590                                            __be16 type)
2591 {
2592         if (eth_p_mpls(type))
2593                 features &= skb->dev->mpls_features;
2594
2595         return features;
2596 }
2597 #else
2598 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2599                                            netdev_features_t features,
2600                                            __be16 type)
2601 {
2602         return features;
2603 }
2604 #endif
2605
2606 static netdev_features_t harmonize_features(struct sk_buff *skb,
2607         netdev_features_t features)
2608 {
2609         int tmp;
2610         __be16 type;
2611
2612         type = skb_network_protocol(skb, &tmp);
2613         features = net_mpls_features(skb, features, type);
2614
2615         if (skb->ip_summed != CHECKSUM_NONE &&
2616             !can_checksum_protocol(features, type)) {
2617                 features &= ~NETIF_F_ALL_CSUM;
2618         } else if (illegal_highdma(skb->dev, skb)) {
2619                 features &= ~NETIF_F_SG;
2620         }
2621
2622         return features;
2623 }
2624
2625 netdev_features_t passthru_features_check(struct sk_buff *skb,
2626                                           struct net_device *dev,
2627                                           netdev_features_t features)
2628 {
2629         return features;
2630 }
2631 EXPORT_SYMBOL(passthru_features_check);
2632
2633 static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2634                                              struct net_device *dev,
2635                                              netdev_features_t features)
2636 {
2637         return vlan_features_check(skb, features);
2638 }
2639
2640 netdev_features_t netif_skb_features(struct sk_buff *skb)
2641 {
2642         struct net_device *dev = skb->dev;
2643         netdev_features_t features = dev->features;
2644         u16 gso_segs = skb_shinfo(skb)->gso_segs;
2645
2646         if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
2647                 features &= ~NETIF_F_GSO_MASK;
2648
2649         /* If encapsulation offload request, verify we are testing
2650          * hardware encapsulation features instead of standard
2651          * features for the netdev
2652          */
2653         if (skb->encapsulation)
2654                 features &= dev->hw_enc_features;
2655
2656         if (skb_vlan_tagged(skb))
2657                 features = netdev_intersect_features(features,
2658                                                      dev->vlan_features |
2659                                                      NETIF_F_HW_VLAN_CTAG_TX |
2660                                                      NETIF_F_HW_VLAN_STAG_TX);
2661
2662         if (dev->netdev_ops->ndo_features_check)
2663                 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2664                                                                 features);
2665         else
2666                 features &= dflt_features_check(skb, dev, features);
2667
2668         return harmonize_features(skb, features);
2669 }
2670 EXPORT_SYMBOL(netif_skb_features);
2671
2672 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2673                     struct netdev_queue *txq, bool more)
2674 {
2675         unsigned int len;
2676         int rc;
2677
2678         if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2679                 dev_queue_xmit_nit(skb, dev);
2680
2681         len = skb->len;
2682         trace_net_dev_start_xmit(skb, dev);
2683         rc = netdev_start_xmit(skb, dev, txq, more);
2684         trace_net_dev_xmit(skb, rc, dev, len);
2685
2686         return rc;
2687 }
2688
2689 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2690                                     struct netdev_queue *txq, int *ret)
2691 {
2692         struct sk_buff *skb = first;
2693         int rc = NETDEV_TX_OK;
2694
2695         while (skb) {
2696                 struct sk_buff *next = skb->next;
2697
2698                 skb->next = NULL;
2699                 rc = xmit_one(skb, dev, txq, next != NULL);
2700                 if (unlikely(!dev_xmit_complete(rc))) {
2701                         skb->next = next;
2702                         goto out;
2703                 }
2704
2705                 skb = next;
2706                 if (netif_xmit_stopped(txq) && skb) {
2707                         rc = NETDEV_TX_BUSY;
2708                         break;
2709                 }
2710         }
2711
2712 out:
2713         *ret = rc;
2714         return skb;
2715 }
2716
2717 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2718                                           netdev_features_t features)
2719 {
2720         if (skb_vlan_tag_present(skb) &&
2721             !vlan_hw_offload_capable(features, skb->vlan_proto))
2722                 skb = __vlan_hwaccel_push_inside(skb);
2723         return skb;
2724 }
2725
2726 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2727 {
2728         netdev_features_t features;
2729
2730         if (skb->next)
2731                 return skb;
2732
2733         features = netif_skb_features(skb);
2734         skb = validate_xmit_vlan(skb, features);
2735         if (unlikely(!skb))
2736                 goto out_null;
2737
2738         if (netif_needs_gso(skb, features)) {
2739                 struct sk_buff *segs;
2740
2741                 segs = skb_gso_segment(skb, features);
2742                 if (IS_ERR(segs)) {
2743                         goto out_kfree_skb;
2744                 } else if (segs) {
2745                         consume_skb(skb);
2746                         skb = segs;
2747                 }
2748         } else {
2749                 if (skb_needs_linearize(skb, features) &&
2750                     __skb_linearize(skb))
2751                         goto out_kfree_skb;
2752
2753                 /* If packet is not checksummed and device does not
2754                  * support checksumming for this protocol, complete
2755                  * checksumming here.
2756                  */
2757                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2758                         if (skb->encapsulation)
2759                                 skb_set_inner_transport_header(skb,
2760                                                                skb_checksum_start_offset(skb));
2761                         else
2762                                 skb_set_transport_header(skb,
2763                                                          skb_checksum_start_offset(skb));
2764                         if (!(features & NETIF_F_ALL_CSUM) &&
2765                             skb_checksum_help(skb))
2766                                 goto out_kfree_skb;
2767                 }
2768         }
2769
2770         return skb;
2771
2772 out_kfree_skb:
2773         kfree_skb(skb);
2774 out_null:
2775         return NULL;
2776 }
2777
2778 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2779 {
2780         struct sk_buff *next, *head = NULL, *tail;
2781
2782         for (; skb != NULL; skb = next) {
2783                 next = skb->next;
2784                 skb->next = NULL;
2785
2786                 /* in case skb wont be segmented, point to itself */
2787                 skb->prev = skb;
2788
2789                 skb = validate_xmit_skb(skb, dev);
2790                 if (!skb)
2791                         continue;
2792
2793                 if (!head)
2794                         head = skb;
2795                 else
2796                         tail->next = skb;
2797                 /* If skb was segmented, skb->prev points to
2798                  * the last segment. If not, it still contains skb.
2799                  */
2800                 tail = skb->prev;
2801         }
2802         return head;
2803 }
2804
2805 static void qdisc_pkt_len_init(struct sk_buff *skb)
2806 {
2807         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2808
2809         qdisc_skb_cb(skb)->pkt_len = skb->len;
2810
2811         /* To get more precise estimation of bytes sent on wire,
2812          * we add to pkt_len the headers size of all segments
2813          */
2814         if (shinfo->gso_size)  {
2815                 unsigned int hdr_len;
2816                 u16 gso_segs = shinfo->gso_segs;
2817
2818                 /* mac layer + network layer */
2819                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2820
2821                 /* + transport layer */
2822                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2823                         hdr_len += tcp_hdrlen(skb);
2824                 else
2825                         hdr_len += sizeof(struct udphdr);
2826
2827                 if (shinfo->gso_type & SKB_GSO_DODGY)
2828                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2829                                                 shinfo->gso_size);
2830
2831                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2832         }
2833 }
2834
2835 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2836                                  struct net_device *dev,
2837                                  struct netdev_queue *txq)
2838 {
2839         spinlock_t *root_lock = qdisc_lock(q);
2840         bool contended;
2841         int rc;
2842
2843         qdisc_pkt_len_init(skb);
2844         qdisc_calculate_pkt_len(skb, q);
2845         /*
2846          * Heuristic to force contended enqueues to serialize on a
2847          * separate lock before trying to get qdisc main lock.
2848          * This permits __QDISC___STATE_RUNNING owner to get the lock more
2849          * often and dequeue packets faster.
2850          */
2851         contended = qdisc_is_running(q);
2852         if (unlikely(contended))
2853                 spin_lock(&q->busylock);
2854
2855         spin_lock(root_lock);
2856         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2857                 kfree_skb(skb);
2858                 rc = NET_XMIT_DROP;
2859         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2860                    qdisc_run_begin(q)) {
2861                 /*
2862                  * This is a work-conserving queue; there are no old skbs
2863                  * waiting to be sent out; and the qdisc is not running -
2864                  * xmit the skb directly.
2865                  */
2866
2867                 qdisc_bstats_update(q, skb);
2868
2869                 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
2870                         if (unlikely(contended)) {
2871                                 spin_unlock(&q->busylock);
2872                                 contended = false;
2873                         }
2874                         __qdisc_run(q);
2875                 } else
2876                         qdisc_run_end(q);
2877
2878                 rc = NET_XMIT_SUCCESS;
2879         } else {
2880                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2881                 if (qdisc_run_begin(q)) {
2882                         if (unlikely(contended)) {
2883                                 spin_unlock(&q->busylock);
2884                                 contended = false;
2885                         }
2886                         __qdisc_run(q);
2887                 }
2888         }
2889         spin_unlock(root_lock);
2890         if (unlikely(contended))
2891                 spin_unlock(&q->busylock);
2892         return rc;
2893 }
2894
2895 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2896 static void skb_update_prio(struct sk_buff *skb)
2897 {
2898         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2899
2900         if (!skb->priority && skb->sk && map) {
2901                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2902
2903                 if (prioidx < map->priomap_len)
2904                         skb->priority = map->priomap[prioidx];
2905         }
2906 }
2907 #else
2908 #define skb_update_prio(skb)
2909 #endif
2910
2911 DEFINE_PER_CPU(int, xmit_recursion);
2912 EXPORT_SYMBOL(xmit_recursion);
2913
2914 #define RECURSION_LIMIT 10
2915
2916 /**
2917  *      dev_loopback_xmit - loop back @skb
2918  *      @skb: buffer to transmit
2919  */
2920 int dev_loopback_xmit(struct sock *sk, struct sk_buff *skb)
2921 {
2922         skb_reset_mac_header(skb);
2923         __skb_pull(skb, skb_network_offset(skb));
2924         skb->pkt_type = PACKET_LOOPBACK;
2925         skb->ip_summed = CHECKSUM_UNNECESSARY;
2926         WARN_ON(!skb_dst(skb));
2927         skb_dst_force(skb);
2928         netif_rx_ni(skb);
2929         return 0;
2930 }
2931 EXPORT_SYMBOL(dev_loopback_xmit);
2932
2933 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2934 {
2935 #ifdef CONFIG_XPS
2936         struct xps_dev_maps *dev_maps;
2937         struct xps_map *map;
2938         int queue_index = -1;
2939
2940         rcu_read_lock();
2941         dev_maps = rcu_dereference(dev->xps_maps);
2942         if (dev_maps) {
2943                 map = rcu_dereference(
2944                     dev_maps->cpu_map[skb->sender_cpu - 1]);
2945                 if (map) {
2946                         if (map->len == 1)
2947                                 queue_index = map->queues[0];
2948                         else
2949                                 queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
2950                                                                            map->len)];
2951                         if (unlikely(queue_index >= dev->real_num_tx_queues))
2952                                 queue_index = -1;
2953                 }
2954         }
2955         rcu_read_unlock();
2956
2957         return queue_index;
2958 #else
2959         return -1;
2960 #endif
2961 }
2962
2963 static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
2964 {
2965         struct sock *sk = skb->sk;
2966         int queue_index = sk_tx_queue_get(sk);
2967
2968         if (queue_index < 0 || skb->ooo_okay ||
2969             queue_index >= dev->real_num_tx_queues) {
2970                 int new_index = get_xps_queue(dev, skb);
2971                 if (new_index < 0)
2972                         new_index = skb_tx_hash(dev, skb);
2973
2974                 if (queue_index != new_index && sk &&
2975                     rcu_access_pointer(sk->sk_dst_cache))
2976                         sk_tx_queue_set(sk, new_index);
2977
2978                 queue_index = new_index;
2979         }
2980
2981         return queue_index;
2982 }
2983
2984 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
2985                                     struct sk_buff *skb,
2986                                     void *accel_priv)
2987 {
2988         int queue_index = 0;
2989
2990 #ifdef CONFIG_XPS
2991         if (skb->sender_cpu == 0)
2992                 skb->sender_cpu = raw_smp_processor_id() + 1;
2993 #endif
2994
2995         if (dev->real_num_tx_queues != 1) {
2996                 const struct net_device_ops *ops = dev->netdev_ops;
2997                 if (ops->ndo_select_queue)
2998                         queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
2999                                                             __netdev_pick_tx);
3000                 else
3001                         queue_index = __netdev_pick_tx(dev, skb);
3002
3003                 if (!accel_priv)
3004                         queue_index = netdev_cap_txqueue(dev, queue_index);
3005         }
3006
3007         skb_set_queue_mapping(skb, queue_index);
3008         return netdev_get_tx_queue(dev, queue_index);
3009 }
3010
3011 /**
3012  *      __dev_queue_xmit - transmit a buffer
3013  *      @skb: buffer to transmit
3014  *      @accel_priv: private data used for L2 forwarding offload
3015  *
3016  *      Queue a buffer for transmission to a network device. The caller must
3017  *      have set the device and priority and built the buffer before calling
3018  *      this function. The function can be called from an interrupt.
3019  *
3020  *      A negative errno code is returned on a failure. A success does not
3021  *      guarantee the frame will be transmitted as it may be dropped due
3022  *      to congestion or traffic shaping.
3023  *
3024  * -----------------------------------------------------------------------------------
3025  *      I notice this method can also return errors from the queue disciplines,
3026  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
3027  *      be positive.
3028  *
3029  *      Regardless of the return value, the skb is consumed, so it is currently
3030  *      difficult to retry a send to this method.  (You can bump the ref count
3031  *      before sending to hold a reference for retry if you are careful.)
3032  *
3033  *      When calling this method, interrupts MUST be enabled.  This is because
3034  *      the BH enable code must have IRQs enabled so that it will not deadlock.
3035  *          --BLG
3036  */
3037 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3038 {
3039         struct net_device *dev = skb->dev;
3040         struct netdev_queue *txq;
3041         struct Qdisc *q;
3042         int rc = -ENOMEM;
3043
3044         skb_reset_mac_header(skb);
3045
3046         if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3047                 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3048
3049         /* Disable soft irqs for various locks below. Also
3050          * stops preemption for RCU.
3051          */
3052         rcu_read_lock_bh();
3053
3054         skb_update_prio(skb);
3055
3056         /* If device/qdisc don't need skb->dst, release it right now while
3057          * its hot in this cpu cache.
3058          */
3059         if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3060                 skb_dst_drop(skb);
3061         else
3062                 skb_dst_force(skb);
3063
3064         txq = netdev_pick_tx(dev, skb, accel_priv);
3065         q = rcu_dereference_bh(txq->qdisc);
3066
3067 #ifdef CONFIG_NET_CLS_ACT
3068         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
3069 #endif
3070         trace_net_dev_queue(skb);
3071         if (q->enqueue) {
3072                 rc = __dev_xmit_skb(skb, q, dev, txq);
3073                 goto out;
3074         }
3075
3076         /* The device has no queue. Common case for software devices:
3077            loopback, all the sorts of tunnels...
3078
3079            Really, it is unlikely that netif_tx_lock protection is necessary
3080            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
3081            counters.)
3082            However, it is possible, that they rely on protection
3083            made by us here.
3084
3085            Check this and shot the lock. It is not prone from deadlocks.
3086            Either shot noqueue qdisc, it is even simpler 8)
3087          */
3088         if (dev->flags & IFF_UP) {
3089                 int cpu = smp_processor_id(); /* ok because BHs are off */
3090
3091                 if (txq->xmit_lock_owner != cpu) {
3092
3093                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
3094                                 goto recursion_alert;
3095
3096                         skb = validate_xmit_skb(skb, dev);
3097                         if (!skb)
3098                                 goto drop;
3099
3100                         HARD_TX_LOCK(dev, txq, cpu);
3101
3102                         if (!netif_xmit_stopped(txq)) {
3103                                 __this_cpu_inc(xmit_recursion);
3104                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3105                                 __this_cpu_dec(xmit_recursion);
3106                                 if (dev_xmit_complete(rc)) {
3107                                         HARD_TX_UNLOCK(dev, txq);
3108                                         goto out;
3109                                 }
3110                         }
3111                         HARD_TX_UNLOCK(dev, txq);
3112                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3113                                              dev->name);
3114                 } else {
3115                         /* Recursion is detected! It is possible,
3116                          * unfortunately
3117                          */
3118 recursion_alert:
3119                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3120                                              dev->name);
3121                 }
3122         }
3123
3124         rc = -ENETDOWN;
3125 drop:
3126         rcu_read_unlock_bh();
3127
3128         atomic_long_inc(&dev->tx_dropped);
3129         kfree_skb_list(skb);
3130         return rc;
3131 out:
3132         rcu_read_unlock_bh();
3133         return rc;
3134 }
3135
3136 int dev_queue_xmit_sk(struct sock *sk, struct sk_buff *skb)
3137 {
3138         return __dev_queue_xmit(skb, NULL);
3139 }
3140 EXPORT_SYMBOL(dev_queue_xmit_sk);
3141
3142 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3143 {
3144         return __dev_queue_xmit(skb, accel_priv);
3145 }
3146 EXPORT_SYMBOL(dev_queue_xmit_accel);
3147
3148
3149 /*=======================================================================
3150                         Receiver routines
3151   =======================================================================*/
3152
3153 int netdev_max_backlog __read_mostly = 1000;
3154 EXPORT_SYMBOL(netdev_max_backlog);
3155
3156 int netdev_tstamp_prequeue __read_mostly = 1;
3157 int netdev_budget __read_mostly = 300;
3158 int weight_p __read_mostly = 64;            /* old backlog weight */
3159
3160 /* Called with irq disabled */
3161 static inline void ____napi_schedule(struct softnet_data *sd,
3162                                      struct napi_struct *napi)
3163 {
3164         list_add_tail(&napi->poll_list, &sd->poll_list);
3165         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3166 }
3167
3168 #ifdef CONFIG_RPS
3169
3170 /* One global table that all flow-based protocols share. */
3171 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3172 EXPORT_SYMBOL(rps_sock_flow_table);
3173 u32 rps_cpu_mask __read_mostly;
3174 EXPORT_SYMBOL(rps_cpu_mask);
3175
3176 struct static_key rps_needed __read_mostly;
3177
3178 static struct rps_dev_flow *
3179 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3180             struct rps_dev_flow *rflow, u16 next_cpu)
3181 {
3182         if (next_cpu < nr_cpu_ids) {
3183 #ifdef CONFIG_RFS_ACCEL
3184                 struct netdev_rx_queue *rxqueue;
3185                 struct rps_dev_flow_table *flow_table;
3186                 struct rps_dev_flow *old_rflow;
3187                 u32 flow_id;
3188                 u16 rxq_index;
3189                 int rc;
3190
3191                 /* Should we steer this flow to a different hardware queue? */
3192                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3193                     !(dev->features & NETIF_F_NTUPLE))
3194                         goto out;
3195                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3196                 if (rxq_index == skb_get_rx_queue(skb))
3197                         goto out;
3198
3199                 rxqueue = dev->_rx + rxq_index;
3200                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3201                 if (!flow_table)
3202                         goto out;
3203                 flow_id = skb_get_hash(skb) & flow_table->mask;
3204                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3205                                                         rxq_index, flow_id);
3206                 if (rc < 0)
3207                         goto out;
3208                 old_rflow = rflow;
3209                 rflow = &flow_table->flows[flow_id];
3210                 rflow->filter = rc;
3211                 if (old_rflow->filter == rflow->filter)
3212                         old_rflow->filter = RPS_NO_FILTER;
3213         out:
3214 #endif
3215                 rflow->last_qtail =
3216                         per_cpu(softnet_data, next_cpu).input_queue_head;
3217         }
3218
3219         rflow->cpu = next_cpu;
3220         return rflow;
3221 }
3222
3223 /*
3224  * get_rps_cpu is called from netif_receive_skb and returns the target
3225  * CPU from the RPS map of the receiving queue for a given skb.
3226  * rcu_read_lock must be held on entry.
3227  */
3228 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3229                        struct rps_dev_flow **rflowp)
3230 {
3231         const struct rps_sock_flow_table *sock_flow_table;
3232         struct netdev_rx_queue *rxqueue = dev->_rx;
3233         struct rps_dev_flow_table *flow_table;
3234         struct rps_map *map;
3235         int cpu = -1;
3236         u32 tcpu;
3237         u32 hash;
3238
3239         if (skb_rx_queue_recorded(skb)) {
3240                 u16 index = skb_get_rx_queue(skb);
3241
3242                 if (unlikely(index >= dev->real_num_rx_queues)) {
3243                         WARN_ONCE(dev->real_num_rx_queues > 1,
3244                                   "%s received packet on queue %u, but number "
3245                                   "of RX queues is %u\n",
3246                                   dev->name, index, dev->real_num_rx_queues);
3247                         goto done;
3248                 }
3249                 rxqueue += index;
3250         }
3251
3252         /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3253
3254         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3255         map = rcu_dereference(rxqueue->rps_map);
3256         if (!flow_table && !map)
3257                 goto done;
3258
3259         skb_reset_network_header(skb);
3260         hash = skb_get_hash(skb);
3261         if (!hash)
3262                 goto done;
3263
3264         sock_flow_table = rcu_dereference(rps_sock_flow_table);
3265         if (flow_table && sock_flow_table) {
3266                 struct rps_dev_flow *rflow;
3267                 u32 next_cpu;
3268                 u32 ident;
3269
3270                 /* First check into global flow table if there is a match */
3271                 ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3272                 if ((ident ^ hash) & ~rps_cpu_mask)
3273                         goto try_rps;
3274
3275                 next_cpu = ident & rps_cpu_mask;
3276
3277                 /* OK, now we know there is a match,
3278                  * we can look at the local (per receive queue) flow table
3279                  */
3280                 rflow = &flow_table->flows[hash & flow_table->mask];
3281                 tcpu = rflow->cpu;
3282
3283                 /*
3284                  * If the desired CPU (where last recvmsg was done) is
3285                  * different from current CPU (one in the rx-queue flow
3286                  * table entry), switch if one of the following holds:
3287                  *   - Current CPU is unset (>= nr_cpu_ids).
3288                  *   - Current CPU is offline.
3289                  *   - The current CPU's queue tail has advanced beyond the
3290                  *     last packet that was enqueued using this table entry.
3291                  *     This guarantees that all previous packets for the flow
3292                  *     have been dequeued, thus preserving in order delivery.
3293                  */
3294                 if (unlikely(tcpu != next_cpu) &&
3295                     (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3296                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3297                       rflow->last_qtail)) >= 0)) {
3298                         tcpu = next_cpu;
3299                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3300                 }
3301
3302                 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3303                         *rflowp = rflow;
3304                         cpu = tcpu;
3305                         goto done;
3306                 }
3307         }
3308
3309 try_rps:
3310
3311         if (map) {
3312                 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3313                 if (cpu_online(tcpu)) {
3314                         cpu = tcpu;
3315                         goto done;
3316                 }
3317         }
3318
3319 done:
3320         return cpu;
3321 }
3322
3323 #ifdef CONFIG_RFS_ACCEL
3324
3325 /**
3326  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3327  * @dev: Device on which the filter was set
3328  * @rxq_index: RX queue index
3329  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3330  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3331  *
3332  * Drivers that implement ndo_rx_flow_steer() should periodically call
3333  * this function for each installed filter and remove the filters for
3334  * which it returns %true.
3335  */
3336 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3337                          u32 flow_id, u16 filter_id)
3338 {
3339         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3340         struct rps_dev_flow_table *flow_table;
3341         struct rps_dev_flow *rflow;
3342         bool expire = true;
3343         unsigned int cpu;
3344
3345         rcu_read_lock();
3346         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3347         if (flow_table && flow_id <= flow_table->mask) {
3348                 rflow = &flow_table->flows[flow_id];
3349                 cpu = ACCESS_ONCE(rflow->cpu);
3350                 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3351                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3352                            rflow->last_qtail) <
3353                      (int)(10 * flow_table->mask)))
3354                         expire = false;
3355         }
3356         rcu_read_unlock();
3357         return expire;
3358 }
3359 EXPORT_SYMBOL(rps_may_expire_flow);
3360
3361 #endif /* CONFIG_RFS_ACCEL */
3362
3363 /* Called from hardirq (IPI) context */
3364 static void rps_trigger_softirq(void *data)
3365 {
3366         struct softnet_data *sd = data;
3367
3368         ____napi_schedule(sd, &sd->backlog);
3369         sd->received_rps++;
3370 }
3371
3372 #endif /* CONFIG_RPS */
3373
3374 /*
3375  * Check if this softnet_data structure is another cpu one
3376  * If yes, queue it to our IPI list and return 1
3377  * If no, return 0
3378  */
3379 static int rps_ipi_queued(struct softnet_data *sd)
3380 {
3381 #ifdef CONFIG_RPS
3382         struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3383
3384         if (sd != mysd) {
3385                 sd->rps_ipi_next = mysd->rps_ipi_list;
3386                 mysd->rps_ipi_list = sd;
3387
3388                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3389                 return 1;
3390         }
3391 #endif /* CONFIG_RPS */
3392         return 0;
3393 }
3394
3395 #ifdef CONFIG_NET_FLOW_LIMIT
3396 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3397 #endif
3398
3399 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3400 {
3401 #ifdef CONFIG_NET_FLOW_LIMIT
3402         struct sd_flow_limit *fl;
3403         struct softnet_data *sd;
3404         unsigned int old_flow, new_flow;
3405
3406         if (qlen < (netdev_max_backlog >> 1))
3407                 return false;
3408
3409         sd = this_cpu_ptr(&softnet_data);
3410
3411         rcu_read_lock();
3412         fl = rcu_dereference(sd->flow_limit);
3413         if (fl) {
3414                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3415                 old_flow = fl->history[fl->history_head];
3416                 fl->history[fl->history_head] = new_flow;
3417
3418                 fl->history_head++;
3419                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3420
3421                 if (likely(fl->buckets[old_flow]))
3422                         fl->buckets[old_flow]--;
3423
3424                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3425                         fl->count++;
3426                         rcu_read_unlock();
3427                         return true;
3428                 }
3429         }
3430         rcu_read_unlock();
3431 #endif
3432         return false;
3433 }
3434
3435 /*
3436  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3437  * queue (may be a remote CPU queue).
3438  */
3439 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3440                               unsigned int *qtail)
3441 {
3442         struct softnet_data *sd;
3443         unsigned long flags;
3444         unsigned int qlen;
3445
3446         sd = &per_cpu(softnet_data, cpu);
3447
3448         local_irq_save(flags);
3449
3450         rps_lock(sd);
3451         qlen = skb_queue_len(&sd->input_pkt_queue);
3452         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3453                 if (qlen) {
3454 enqueue:
3455                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3456                         input_queue_tail_incr_save(sd, qtail);
3457                         rps_unlock(sd);
3458                         local_irq_restore(flags);
3459                         return NET_RX_SUCCESS;
3460                 }
3461
3462                 /* Schedule NAPI for backlog device
3463                  * We can use non atomic operation since we own the queue lock
3464                  */
3465                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3466                         if (!rps_ipi_queued(sd))
3467                                 ____napi_schedule(sd, &sd->backlog);
3468                 }
3469                 goto enqueue;
3470         }
3471
3472         sd->dropped++;
3473         rps_unlock(sd);
3474
3475         local_irq_restore(flags);
3476
3477         atomic_long_inc(&skb->dev->rx_dropped);
3478         kfree_skb(skb);
3479         return NET_RX_DROP;
3480 }
3481
3482 static int netif_rx_internal(struct sk_buff *skb)
3483 {
3484         int ret;
3485
3486         net_timestamp_check(netdev_tstamp_prequeue, skb);
3487
3488         trace_netif_rx(skb);
3489 #ifdef CONFIG_RPS
3490         if (static_key_false(&rps_needed)) {
3491                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3492                 int cpu;
3493
3494                 preempt_disable();
3495                 rcu_read_lock();
3496
3497                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3498                 if (cpu < 0)
3499                         cpu = smp_processor_id();
3500
3501                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3502
3503                 rcu_read_unlock();
3504                 preempt_enable();
3505         } else
3506 #endif
3507         {
3508                 unsigned int qtail;
3509                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3510                 put_cpu();
3511         }
3512         return ret;
3513 }
3514
3515 /**
3516  *      netif_rx        -       post buffer to the network code
3517  *      @skb: buffer to post
3518  *
3519  *      This function receives a packet from a device driver and queues it for
3520  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3521  *      may be dropped during processing for congestion control or by the
3522  *      protocol layers.
3523  *
3524  *      return values:
3525  *      NET_RX_SUCCESS  (no congestion)
3526  *      NET_RX_DROP     (packet was dropped)
3527  *
3528  */
3529
3530 int netif_rx(struct sk_buff *skb)
3531 {
3532         trace_netif_rx_entry(skb);
3533
3534         return netif_rx_internal(skb);
3535 }
3536 EXPORT_SYMBOL(netif_rx);
3537
3538 int netif_rx_ni(struct sk_buff *skb)
3539 {
3540         int err;
3541
3542         trace_netif_rx_ni_entry(skb);
3543
3544         preempt_disable();
3545         err = netif_rx_internal(skb);
3546         if (local_softirq_pending())
3547                 do_softirq();
3548         preempt_enable();
3549
3550         return err;
3551 }
3552 EXPORT_SYMBOL(netif_rx_ni);
3553
3554 static void net_tx_action(struct softirq_action *h)
3555 {
3556         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3557
3558         if (sd->completion_queue) {
3559                 struct sk_buff *clist;
3560
3561                 local_irq_disable();
3562                 clist = sd->completion_queue;
3563                 sd->completion_queue = NULL;
3564                 local_irq_enable();
3565
3566                 while (clist) {
3567                         struct sk_buff *skb = clist;
3568                         clist = clist->next;
3569
3570                         WARN_ON(atomic_read(&skb->users));
3571                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3572                                 trace_consume_skb(skb);
3573                         else
3574                                 trace_kfree_skb(skb, net_tx_action);
3575                         __kfree_skb(skb);
3576                 }
3577         }
3578
3579         if (sd->output_queue) {
3580                 struct Qdisc *head;
3581
3582                 local_irq_disable();
3583                 head = sd->output_queue;
3584                 sd->output_queue = NULL;
3585                 sd->output_queue_tailp = &sd->output_queue;
3586                 local_irq_enable();
3587
3588                 while (head) {
3589                         struct Qdisc *q = head;
3590                         spinlock_t *root_lock;
3591
3592                         head = head->next_sched;
3593
3594                         root_lock = qdisc_lock(q);
3595                         if (spin_trylock(root_lock)) {
3596                                 smp_mb__before_atomic();
3597                                 clear_bit(__QDISC_STATE_SCHED,
3598                                           &q->state);
3599                                 qdisc_run(q);
3600                                 spin_unlock(root_lock);
3601                         } else {
3602                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3603                                               &q->state)) {
3604                                         __netif_reschedule(q);
3605                                 } else {
3606                                         smp_mb__before_atomic();
3607                                         clear_bit(__QDISC_STATE_SCHED,
3608                                                   &q->state);
3609                                 }
3610                         }
3611                 }
3612         }
3613 }
3614
3615 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3616     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3617 /* This hook is defined here for ATM LANE */
3618 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3619                              unsigned char *addr) __read_mostly;
3620 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3621 #endif
3622
3623 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3624                                          struct packet_type **pt_prev,
3625                                          int *ret, struct net_device *orig_dev)
3626 {
3627 #ifdef CONFIG_NET_CLS_ACT
3628         struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
3629         struct tcf_result cl_res;
3630
3631         /* If there's at least one ingress present somewhere (so
3632          * we get here via enabled static key), remaining devices
3633          * that are not configured with an ingress qdisc will bail
3634          * out here.
3635          */
3636         if (!cl)
3637                 return skb;
3638         if (*pt_prev) {
3639                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3640                 *pt_prev = NULL;
3641         }
3642
3643         qdisc_skb_cb(skb)->pkt_len = skb->len;
3644         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3645         qdisc_bstats_update_cpu(cl->q, skb);
3646
3647         switch (tc_classify(skb, cl, &cl_res)) {
3648         case TC_ACT_OK:
3649         case TC_ACT_RECLASSIFY:
3650                 skb->tc_index = TC_H_MIN(cl_res.classid);
3651                 break;
3652         case TC_ACT_SHOT:
3653                 qdisc_qstats_drop_cpu(cl->q);
3654         case TC_ACT_STOLEN:
3655         case TC_ACT_QUEUED:
3656                 kfree_skb(skb);
3657                 return NULL;
3658         default:
3659                 break;
3660         }
3661 #endif /* CONFIG_NET_CLS_ACT */
3662         return skb;
3663 }
3664
3665 /**
3666  *      netdev_rx_handler_register - register receive handler
3667  *      @dev: device to register a handler for
3668  *      @rx_handler: receive handler to register
3669  *      @rx_handler_data: data pointer that is used by rx handler
3670  *
3671  *      Register a receive handler for a device. This handler will then be
3672  *      called from __netif_receive_skb. A negative errno code is returned
3673  *      on a failure.
3674  *
3675  *      The caller must hold the rtnl_mutex.
3676  *
3677  *      For a general description of rx_handler, see enum rx_handler_result.
3678  */
3679 int netdev_rx_handler_register(struct net_device *dev,
3680                                rx_handler_func_t *rx_handler,
3681                                void *rx_handler_data)
3682 {
3683         ASSERT_RTNL();
3684
3685         if (dev->rx_handler)
3686                 return -EBUSY;
3687
3688         /* Note: rx_handler_data must be set before rx_handler */
3689         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3690         rcu_assign_pointer(dev->rx_handler, rx_handler);
3691
3692         return 0;
3693 }
3694 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3695
3696 /**
3697  *      netdev_rx_handler_unregister - unregister receive handler
3698  *      @dev: device to unregister a handler from
3699  *
3700  *      Unregister a receive handler from a device.
3701  *
3702  *      The caller must hold the rtnl_mutex.
3703  */
3704 void netdev_rx_handler_unregister(struct net_device *dev)
3705 {
3706
3707         ASSERT_RTNL();
3708         RCU_INIT_POINTER(dev->rx_handler, NULL);
3709         /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3710          * section has a guarantee to see a non NULL rx_handler_data
3711          * as well.
3712          */
3713         synchronize_net();
3714         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3715 }
3716 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3717
3718 /*
3719  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3720  * the special handling of PFMEMALLOC skbs.
3721  */
3722 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3723 {
3724         switch (skb->protocol) {
3725         case htons(ETH_P_ARP):
3726         case htons(ETH_P_IP):
3727         case htons(ETH_P_IPV6):
3728         case htons(ETH_P_8021Q):
3729         case htons(ETH_P_8021AD):
3730                 return true;
3731         default:
3732                 return false;
3733         }
3734 }
3735
3736 static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
3737                              int *ret, struct net_device *orig_dev)
3738 {
3739 #ifdef CONFIG_NETFILTER_INGRESS
3740         if (nf_hook_ingress_active(skb)) {
3741                 if (*pt_prev) {
3742                         *ret = deliver_skb(skb, *pt_prev, orig_dev);
3743                         *pt_prev = NULL;
3744                 }
3745
3746                 return nf_hook_ingress(skb);
3747         }
3748 #endif /* CONFIG_NETFILTER_INGRESS */
3749         return 0;
3750 }
3751
3752 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3753 {
3754         struct packet_type *ptype, *pt_prev;
3755         rx_handler_func_t *rx_handler;
3756         struct net_device *orig_dev;
3757         bool deliver_exact = false;
3758         int ret = NET_RX_DROP;
3759         __be16 type;
3760
3761         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3762
3763         trace_netif_receive_skb(skb);
3764
3765         orig_dev = skb->dev;
3766
3767         skb_reset_network_header(skb);
3768         if (!skb_transport_header_was_set(skb))
3769                 skb_reset_transport_header(skb);
3770         skb_reset_mac_len(skb);
3771
3772         pt_prev = NULL;
3773
3774         rcu_read_lock();
3775
3776 another_round:
3777         skb->skb_iif = skb->dev->ifindex;
3778
3779         __this_cpu_inc(softnet_data.processed);
3780
3781         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3782             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3783                 skb = skb_vlan_untag(skb);
3784                 if (unlikely(!skb))
3785                         goto unlock;
3786         }
3787
3788 #ifdef CONFIG_NET_CLS_ACT
3789         if (skb->tc_verd & TC_NCLS) {
3790                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3791                 goto ncls;
3792         }
3793 #endif
3794
3795         if (pfmemalloc)
3796                 goto skip_taps;
3797
3798         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3799                 if (pt_prev)
3800                         ret = deliver_skb(skb, pt_prev, orig_dev);
3801                 pt_prev = ptype;
3802         }
3803
3804         list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
3805                 if (pt_prev)
3806                         ret = deliver_skb(skb, pt_prev, orig_dev);
3807                 pt_prev = ptype;
3808         }
3809
3810 skip_taps:
3811 #ifdef CONFIG_NET_INGRESS
3812         if (static_key_false(&ingress_needed)) {
3813                 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3814                 if (!skb)
3815                         goto unlock;
3816
3817                 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
3818                         goto unlock;
3819         }
3820 #endif
3821 #ifdef CONFIG_NET_CLS_ACT
3822         skb->tc_verd = 0;
3823 ncls:
3824 #endif
3825         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3826                 goto drop;
3827
3828         if (skb_vlan_tag_present(skb)) {
3829                 if (pt_prev) {
3830                         ret = deliver_skb(skb, pt_prev, orig_dev);
3831                         pt_prev = NULL;
3832                 }
3833                 if (vlan_do_receive(&skb))
3834                         goto another_round;
3835                 else if (unlikely(!skb))
3836                         goto unlock;
3837         }
3838
3839         rx_handler = rcu_dereference(skb->dev->rx_handler);
3840         if (rx_handler) {
3841                 if (pt_prev) {
3842                         ret = deliver_skb(skb, pt_prev, orig_dev);
3843                         pt_prev = NULL;
3844                 }
3845                 switch (rx_handler(&skb)) {
3846                 case RX_HANDLER_CONSUMED:
3847                         ret = NET_RX_SUCCESS;
3848                         goto unlock;
3849                 case RX_HANDLER_ANOTHER:
3850                         goto another_round;
3851                 case RX_HANDLER_EXACT:
3852                         deliver_exact = true;
3853                 case RX_HANDLER_PASS:
3854                         break;
3855                 default:
3856                         BUG();
3857                 }
3858         }
3859
3860         if (unlikely(skb_vlan_tag_present(skb))) {
3861                 if (skb_vlan_tag_get_id(skb))
3862                         skb->pkt_type = PACKET_OTHERHOST;
3863                 /* Note: we might in the future use prio bits
3864                  * and set skb->priority like in vlan_do_receive()
3865                  * For the time being, just ignore Priority Code Point
3866                  */
3867                 skb->vlan_tci = 0;
3868         }
3869
3870         type = skb->protocol;
3871
3872         /* deliver only exact match when indicated */
3873         if (likely(!deliver_exact)) {
3874                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3875                                        &ptype_base[ntohs(type) &
3876                                                    PTYPE_HASH_MASK]);
3877         }
3878
3879         deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3880                                &orig_dev->ptype_specific);
3881
3882         if (unlikely(skb->dev != orig_dev)) {
3883                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3884                                        &skb->dev->ptype_specific);
3885         }
3886
3887         if (pt_prev) {
3888                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3889                         goto drop;
3890                 else
3891                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3892         } else {
3893 drop:
3894                 atomic_long_inc(&skb->dev->rx_dropped);
3895                 kfree_skb(skb);
3896                 /* Jamal, now you will not able to escape explaining
3897                  * me how you were going to use this. :-)
3898                  */
3899                 ret = NET_RX_DROP;
3900         }
3901
3902 unlock:
3903         rcu_read_unlock();
3904         return ret;
3905 }
3906
3907 static int __netif_receive_skb(struct sk_buff *skb)
3908 {
3909         int ret;
3910
3911         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3912                 unsigned long pflags = current->flags;
3913
3914                 /*
3915                  * PFMEMALLOC skbs are special, they should
3916                  * - be delivered to SOCK_MEMALLOC sockets only
3917                  * - stay away from userspace
3918                  * - have bounded memory usage
3919                  *
3920                  * Use PF_MEMALLOC as this saves us from propagating the allocation
3921                  * context down to all allocation sites.
3922                  */
3923                 current->flags |= PF_MEMALLOC;
3924                 ret = __netif_receive_skb_core(skb, true);
3925                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3926         } else
3927                 ret = __netif_receive_skb_core(skb, false);
3928
3929         return ret;
3930 }
3931
3932 static int netif_receive_skb_internal(struct sk_buff *skb)
3933 {
3934         net_timestamp_check(netdev_tstamp_prequeue, skb);
3935
3936         if (skb_defer_rx_timestamp(skb))
3937                 return NET_RX_SUCCESS;
3938
3939 #ifdef CONFIG_RPS
3940         if (static_key_false(&rps_needed)) {
3941                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3942                 int cpu, ret;
3943
3944                 rcu_read_lock();
3945
3946                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3947
3948                 if (cpu >= 0) {
3949                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3950                         rcu_read_unlock();
3951                         return ret;
3952                 }
3953                 rcu_read_unlock();
3954         }
3955 #endif
3956         return __netif_receive_skb(skb);
3957 }
3958
3959 /**
3960  *      netif_receive_skb - process receive buffer from network
3961  *      @skb: buffer to process
3962  *
3963  *      netif_receive_skb() is the main receive data processing function.
3964  *      It always succeeds. The buffer may be dropped during processing
3965  *      for congestion control or by the protocol layers.
3966  *
3967  *      This function may only be called from softirq context and interrupts
3968  *      should be enabled.
3969  *
3970  *      Return values (usually ignored):
3971  *      NET_RX_SUCCESS: no congestion
3972  *      NET_RX_DROP: packet was dropped
3973  */
3974 int netif_receive_skb_sk(struct sock *sk, struct sk_buff *skb)
3975 {
3976         trace_netif_receive_skb_entry(skb);
3977
3978         return netif_receive_skb_internal(skb);
3979 }
3980 EXPORT_SYMBOL(netif_receive_skb_sk);
3981
3982 /* Network device is going away, flush any packets still pending
3983  * Called with irqs disabled.
3984  */
3985 static void flush_backlog(void *arg)
3986 {
3987         struct net_device *dev = arg;
3988         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3989         struct sk_buff *skb, *tmp;
3990
3991         rps_lock(sd);
3992         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3993                 if (skb->dev == dev) {
3994                         __skb_unlink(skb, &sd->input_pkt_queue);
3995                         kfree_skb(skb);
3996                         input_queue_head_incr(sd);
3997                 }
3998         }
3999         rps_unlock(sd);
4000
4001         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4002                 if (skb->dev == dev) {
4003                         __skb_unlink(skb, &sd->process_queue);
4004                         kfree_skb(skb);
4005                         input_queue_head_incr(sd);
4006                 }
4007         }
4008 }
4009
4010 static int napi_gro_complete(struct sk_buff *skb)
4011 {
4012         struct packet_offload *ptype;
4013         __be16 type = skb->protocol;
4014         struct list_head *head = &offload_base;
4015         int err = -ENOENT;
4016
4017         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4018
4019         if (NAPI_GRO_CB(skb)->count == 1) {
4020                 skb_shinfo(skb)->gso_size = 0;
4021                 goto out;
4022         }
4023
4024         rcu_read_lock();
4025         list_for_each_entry_rcu(ptype, head, list) {
4026                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4027                         continue;
4028
4029                 err = ptype->callbacks.gro_complete(skb, 0);
4030                 break;
4031         }
4032         rcu_read_unlock();
4033
4034         if (err) {
4035                 WARN_ON(&ptype->list == head);
4036                 kfree_skb(skb);
4037                 return NET_RX_SUCCESS;
4038         }
4039
4040 out:
4041         return netif_receive_skb_internal(skb);
4042 }
4043
4044 /* napi->gro_list contains packets ordered by age.
4045  * youngest packets at the head of it.
4046  * Complete skbs in reverse order to reduce latencies.
4047  */
4048 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4049 {
4050         struct sk_buff *skb, *prev = NULL;
4051
4052         /* scan list and build reverse chain */
4053         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4054                 skb->prev = prev;
4055                 prev = skb;
4056         }
4057
4058         for (skb = prev; skb; skb = prev) {
4059                 skb->next = NULL;
4060
4061                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4062                         return;
4063
4064                 prev = skb->prev;
4065                 napi_gro_complete(skb);
4066                 napi->gro_count--;
4067         }
4068
4069         napi->gro_list = NULL;
4070 }
4071 EXPORT_SYMBOL(napi_gro_flush);
4072
4073 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4074 {
4075         struct sk_buff *p;
4076         unsigned int maclen = skb->dev->hard_header_len;
4077         u32 hash = skb_get_hash_raw(skb);
4078
4079         for (p = napi->gro_list; p; p = p->next) {
4080                 unsigned long diffs;
4081
4082                 NAPI_GRO_CB(p)->flush = 0;
4083
4084                 if (hash != skb_get_hash_raw(p)) {
4085                         NAPI_GRO_CB(p)->same_flow = 0;
4086                         continue;
4087                 }
4088
4089                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4090                 diffs |= p->vlan_tci ^ skb->vlan_tci;
4091                 if (maclen == ETH_HLEN)
4092                         diffs |= compare_ether_header(skb_mac_header(p),
4093                                                       skb_mac_header(skb));
4094                 else if (!diffs)
4095                         diffs = memcmp(skb_mac_header(p),
4096                                        skb_mac_header(skb),
4097                                        maclen);
4098                 NAPI_GRO_CB(p)->same_flow = !diffs;
4099         }
4100 }
4101
4102 static void skb_gro_reset_offset(struct sk_buff *skb)
4103 {
4104         const struct skb_shared_info *pinfo = skb_shinfo(skb);
4105         const skb_frag_t *frag0 = &pinfo->frags[0];
4106
4107         NAPI_GRO_CB(skb)->data_offset = 0;
4108         NAPI_GRO_CB(skb)->frag0 = NULL;
4109         NAPI_GRO_CB(skb)->frag0_len = 0;
4110
4111         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4112             pinfo->nr_frags &&
4113             !PageHighMem(skb_frag_page(frag0))) {
4114                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4115                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
4116         }
4117 }
4118
4119 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4120 {
4121         struct skb_shared_info *pinfo = skb_shinfo(skb);
4122
4123         BUG_ON(skb->end - skb->tail < grow);
4124
4125         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4126
4127         skb->data_len -= grow;
4128         skb->tail += grow;
4129
4130         pinfo->frags[0].page_offset += grow;
4131         skb_frag_size_sub(&pinfo->frags[0], grow);
4132
4133         if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4134                 skb_frag_unref(skb, 0);
4135                 memmove(pinfo->frags, pinfo->frags + 1,
4136                         --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4137         }
4138 }
4139
4140 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4141 {
4142         struct sk_buff **pp = NULL;
4143         struct packet_offload *ptype;
4144         __be16 type = skb->protocol;
4145         struct list_head *head = &offload_base;
4146         int same_flow;
4147         enum gro_result ret;
4148         int grow;
4149
4150         if (!(skb->dev->features & NETIF_F_GRO))
4151                 goto normal;
4152
4153         if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4154                 goto normal;
4155
4156         gro_list_prepare(napi, skb);
4157
4158         rcu_read_lock();
4159         list_for_each_entry_rcu(ptype, head, list) {
4160                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4161                         continue;
4162
4163                 skb_set_network_header(skb, skb_gro_offset(skb));
4164                 skb_reset_mac_len(skb);
4165                 NAPI_GRO_CB(skb)->same_flow = 0;
4166                 NAPI_GRO_CB(skb)->flush = 0;
4167                 NAPI_GRO_CB(skb)->free = 0;
4168                 NAPI_GRO_CB(skb)->udp_mark = 0;
4169                 NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4170
4171                 /* Setup for GRO checksum validation */
4172                 switch (skb->ip_summed) {
4173                 case CHECKSUM_COMPLETE:
4174                         NAPI_GRO_CB(skb)->csum = skb->csum;
4175                         NAPI_GRO_CB(skb)->csum_valid = 1;
4176                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4177                         break;
4178                 case CHECKSUM_UNNECESSARY:
4179                         NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4180                         NAPI_GRO_CB(skb)->csum_valid = 0;
4181                         break;
4182                 default:
4183                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4184                         NAPI_GRO_CB(skb)->csum_valid = 0;
4185                 }
4186
4187                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4188                 break;
4189         }
4190         rcu_read_unlock();
4191
4192         if (&ptype->list == head)
4193                 goto normal;
4194
4195         same_flow = NAPI_GRO_CB(skb)->same_flow;
4196         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4197
4198         if (pp) {
4199                 struct sk_buff *nskb = *pp;
4200
4201                 *pp = nskb->next;
4202                 nskb->next = NULL;
4203                 napi_gro_complete(nskb);
4204                 napi->gro_count--;
4205         }
4206
4207         if (same_flow)
4208                 goto ok;
4209
4210         if (NAPI_GRO_CB(skb)->flush)
4211                 goto normal;
4212
4213         if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4214                 struct sk_buff *nskb = napi->gro_list;
4215
4216                 /* locate the end of the list to select the 'oldest' flow */
4217                 while (nskb->next) {
4218                         pp = &nskb->next;
4219                         nskb = *pp;
4220                 }
4221                 *pp = NULL;
4222                 nskb->next = NULL;
4223                 napi_gro_complete(nskb);
4224         } else {
4225                 napi->gro_count++;
4226         }
4227         NAPI_GRO_CB(skb)->count = 1;
4228         NAPI_GRO_CB(skb)->age = jiffies;
4229         NAPI_GRO_CB(skb)->last = skb;
4230         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4231         skb->next = napi->gro_list;
4232         napi->gro_list = skb;
4233         ret = GRO_HELD;
4234
4235 pull:
4236         grow = skb_gro_offset(skb) - skb_headlen(skb);
4237         if (grow > 0)
4238                 gro_pull_from_frag0(skb, grow);
4239 ok:
4240         return ret;
4241
4242 normal:
4243         ret = GRO_NORMAL;
4244         goto pull;
4245 }
4246
4247 struct packet_offload *gro_find_receive_by_type(__be16 type)
4248 {
4249         struct list_head *offload_head = &offload_base;
4250         struct packet_offload *ptype;
4251
4252         list_for_each_entry_rcu(ptype, offload_head, list) {
4253                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4254                         continue;
4255                 return ptype;
4256         }
4257         return NULL;
4258 }
4259 EXPORT_SYMBOL(gro_find_receive_by_type);
4260
4261 struct packet_offload *gro_find_complete_by_type(__be16 type)
4262 {
4263         struct list_head *offload_head = &offload_base;
4264         struct packet_offload *ptype;
4265
4266         list_for_each_entry_rcu(ptype, offload_head, list) {
4267                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4268                         continue;
4269                 return ptype;
4270         }
4271         return NULL;
4272 }
4273 EXPORT_SYMBOL(gro_find_complete_by_type);
4274
4275 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4276 {
4277         switch (ret) {
4278         case GRO_NORMAL:
4279                 if (netif_receive_skb_internal(skb))
4280                         ret = GRO_DROP;
4281                 break;
4282
4283         case GRO_DROP:
4284                 kfree_skb(skb);
4285                 break;
4286
4287         case GRO_MERGED_FREE:
4288                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4289                         kmem_cache_free(skbuff_head_cache, skb);
4290                 else
4291                         __kfree_skb(skb);
4292                 break;
4293
4294         case GRO_HELD:
4295         case GRO_MERGED:
4296                 break;
4297         }
4298
4299         return ret;
4300 }
4301
4302 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4303 {
4304         trace_napi_gro_receive_entry(skb);
4305
4306         skb_gro_reset_offset(skb);
4307
4308         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4309 }
4310 EXPORT_SYMBOL(napi_gro_receive);
4311
4312 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4313 {
4314         if (unlikely(skb->pfmemalloc)) {
4315                 consume_skb(skb);
4316                 return;
4317         }
4318         __skb_pull(skb, skb_headlen(skb));
4319         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4320         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4321         skb->vlan_tci = 0;
4322         skb->dev = napi->dev;
4323         skb->skb_iif = 0;
4324         skb->encapsulation = 0;
4325         skb_shinfo(skb)->gso_type = 0;
4326         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4327
4328         napi->skb = skb;
4329 }
4330
4331 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4332 {
4333         struct sk_buff *skb = napi->skb;
4334
4335         if (!skb) {
4336                 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4337                 napi->skb = skb;
4338         }
4339         return skb;
4340 }
4341 EXPORT_SYMBOL(napi_get_frags);
4342
4343 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4344                                       struct sk_buff *skb,
4345                                       gro_result_t ret)
4346 {
4347         switch (ret) {
4348         case GRO_NORMAL:
4349         case GRO_HELD:
4350                 __skb_push(skb, ETH_HLEN);
4351                 skb->protocol = eth_type_trans(skb, skb->dev);
4352                 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4353                         ret = GRO_DROP;
4354                 break;
4355
4356         case GRO_DROP:
4357         case GRO_MERGED_FREE:
4358                 napi_reuse_skb(napi, skb);
4359                 break;
4360
4361         case GRO_MERGED:
4362                 break;
4363         }
4364
4365         return ret;
4366 }
4367
4368 /* Upper GRO stack assumes network header starts at gro_offset=0
4369  * Drivers could call both napi_gro_frags() and napi_gro_receive()
4370  * We copy ethernet header into skb->data to have a common layout.
4371  */
4372 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4373 {
4374         struct sk_buff *skb = napi->skb;
4375         const struct ethhdr *eth;
4376         unsigned int hlen = sizeof(*eth);
4377
4378         napi->skb = NULL;
4379
4380         skb_reset_mac_header(skb);
4381         skb_gro_reset_offset(skb);
4382
4383         eth = skb_gro_header_fast(skb, 0);
4384         if (unlikely(skb_gro_header_hard(skb, hlen))) {
4385                 eth = skb_gro_header_slow(skb, hlen, 0);
4386                 if (unlikely(!eth)) {
4387                         napi_reuse_skb(napi, skb);
4388                         return NULL;
4389                 }
4390         } else {
4391                 gro_pull_from_frag0(skb, hlen);
4392                 NAPI_GRO_CB(skb)->frag0 += hlen;
4393                 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4394         }
4395         __skb_pull(skb, hlen);
4396
4397         /*
4398          * This works because the only protocols we care about don't require
4399          * special handling.
4400          * We'll fix it up properly in napi_frags_finish()
4401          */
4402         skb->protocol = eth->h_proto;
4403
4404         return skb;
4405 }
4406
4407 gro_result_t napi_gro_frags(struct napi_struct *napi)
4408 {
4409         struct sk_buff *skb = napi_frags_skb(napi);
4410
4411         if (!skb)
4412                 return GRO_DROP;
4413
4414         trace_napi_gro_frags_entry(skb);
4415
4416         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4417 }
4418 EXPORT_SYMBOL(napi_gro_frags);
4419
4420 /* Compute the checksum from gro_offset and return the folded value
4421  * after adding in any pseudo checksum.
4422  */
4423 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4424 {
4425         __wsum wsum;
4426         __sum16 sum;
4427
4428         wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4429
4430         /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4431         sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4432         if (likely(!sum)) {
4433                 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4434                     !skb->csum_complete_sw)
4435                         netdev_rx_csum_fault(skb->dev);
4436         }
4437
4438         NAPI_GRO_CB(skb)->csum = wsum;
4439         NAPI_GRO_CB(skb)->csum_valid = 1;
4440
4441         return sum;
4442 }
4443 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4444
4445 /*
4446  * net_rps_action_and_irq_enable sends any pending IPI's for rps.
4447  * Note: called with local irq disabled, but exits with local irq enabled.
4448  */
4449 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4450 {
4451 #ifdef CONFIG_RPS
4452         struct softnet_data *remsd = sd->rps_ipi_list;
4453
4454         if (remsd) {
4455                 sd->rps_ipi_list = NULL;
4456
4457                 local_irq_enable();
4458
4459                 /* Send pending IPI's to kick RPS processing on remote cpus. */
4460                 while (remsd) {
4461                         struct softnet_data *next = remsd->rps_ipi_next;
4462
4463                         if (cpu_online(remsd->cpu))
4464                                 smp_call_function_single_async(remsd->cpu,
4465                                                            &remsd->csd);
4466                         remsd = next;
4467                 }
4468         } else
4469 #endif
4470                 local_irq_enable();
4471 }
4472
4473 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4474 {
4475 #ifdef CONFIG_RPS
4476         return sd->rps_ipi_list != NULL;
4477 #else
4478         return false;
4479 #endif
4480 }
4481
4482 static int process_backlog(struct napi_struct *napi, int quota)
4483 {
4484         int work = 0;
4485         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4486
4487         /* Check if we have pending ipi, its better to send them now,
4488          * not waiting net_rx_action() end.
4489          */
4490         if (sd_has_rps_ipi_waiting(sd)) {
4491                 local_irq_disable();
4492                 net_rps_action_and_irq_enable(sd);
4493         }
4494
4495         napi->weight = weight_p;
4496         local_irq_disable();
4497         while (1) {
4498                 struct sk_buff *skb;
4499
4500                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4501                         local_irq_enable();
4502                         __netif_receive_skb(skb);
4503                         local_irq_disable();
4504                         input_queue_head_incr(sd);
4505                         if (++work >= quota) {
4506                                 local_irq_enable();
4507                                 return work;
4508                         }
4509                 }
4510
4511                 rps_lock(sd);
4512                 if (skb_queue_empty(&sd->input_pkt_queue)) {
4513                         /*
4514                          * Inline a custom version of __napi_complete().
4515                          * only current cpu owns and manipulates this napi,
4516                          * and NAPI_STATE_SCHED is the only possible flag set
4517                          * on backlog.
4518                          * We can use a plain write instead of clear_bit(),
4519                          * and we dont need an smp_mb() memory barrier.
4520                          */
4521                         napi->state = 0;
4522                         rps_unlock(sd);
4523
4524                         break;
4525                 }
4526
4527                 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4528                                            &sd->process_queue);
4529                 rps_unlock(sd);
4530         }
4531         local_irq_enable();
4532
4533         return work;
4534 }
4535
4536 /**
4537  * __napi_schedule - schedule for receive
4538  * @n: entry to schedule
4539  *
4540  * The entry's receive function will be scheduled to run.
4541  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4542  */
4543 void __napi_schedule(struct napi_struct *n)
4544 {
4545         unsigned long flags;
4546
4547         local_irq_save(flags);
4548         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4549         local_irq_restore(flags);
4550 }
4551 EXPORT_SYMBOL(__napi_schedule);
4552
4553 /**
4554  * __napi_schedule_irqoff - schedule for receive
4555  * @n: entry to schedule
4556  *
4557  * Variant of __napi_schedule() assuming hard irqs are masked
4558  */
4559 void __napi_schedule_irqoff(struct napi_struct *n)
4560 {
4561         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4562 }
4563 EXPORT_SYMBOL(__napi_schedule_irqoff);
4564
4565 void __napi_complete(struct napi_struct *n)
4566 {
4567         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4568
4569         list_del_init(&n->poll_list);
4570         smp_mb__before_atomic();
4571         clear_bit(NAPI_STATE_SCHED, &n->state);
4572 }
4573 EXPORT_SYMBOL(__napi_complete);
4574
4575 void napi_complete_done(struct napi_struct *n, int work_done)
4576 {
4577         unsigned long flags;
4578
4579         /*
4580          * don't let napi dequeue from the cpu poll list
4581          * just in case its running on a different cpu
4582          */
4583         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4584                 return;
4585
4586         if (n->gro_list) {
4587                 unsigned long timeout = 0;
4588
4589                 if (work_done)
4590                         timeout = n->dev->gro_flush_timeout;
4591
4592                 if (timeout)
4593                         hrtimer_start(&n->timer, ns_to_ktime(timeout),
4594                                       HRTIMER_MODE_REL_PINNED);
4595                 else
4596                         napi_gro_flush(n, false);
4597         }
4598         if (likely(list_empty(&n->poll_list))) {
4599                 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4600         } else {
4601                 /* If n->poll_list is not empty, we need to mask irqs */
4602                 local_irq_save(flags);
4603                 __napi_complete(n);
4604                 local_irq_restore(flags);
4605         }
4606 }
4607 EXPORT_SYMBOL(napi_complete_done);
4608
4609 /* must be called under rcu_read_lock(), as we dont take a reference */
4610 struct napi_struct *napi_by_id(unsigned int napi_id)
4611 {
4612         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4613         struct napi_struct *napi;
4614
4615         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4616                 if (napi->napi_id == napi_id)
4617                         return napi;
4618
4619         return NULL;
4620 }
4621 EXPORT_SYMBOL_GPL(napi_by_id);
4622
4623 void napi_hash_add(struct napi_struct *napi)
4624 {
4625         if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4626
4627                 spin_lock(&napi_hash_lock);
4628
4629                 /* 0 is not a valid id, we also skip an id that is taken
4630                  * we expect both events to be extremely rare
4631                  */
4632                 napi->napi_id = 0;
4633                 while (!napi->napi_id) {
4634                         napi->napi_id = ++napi_gen_id;
4635                         if (napi_by_id(napi->napi_id))
4636                                 napi->napi_id = 0;
4637                 }
4638
4639                 hlist_add_head_rcu(&napi->napi_hash_node,
4640                         &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4641
4642                 spin_unlock(&napi_hash_lock);
4643         }
4644 }
4645 EXPORT_SYMBOL_GPL(napi_hash_add);
4646
4647 /* Warning : caller is responsible to make sure rcu grace period
4648  * is respected before freeing memory containing @napi
4649  */
4650 void napi_hash_del(struct napi_struct *napi)
4651 {
4652         spin_lock(&napi_hash_lock);
4653
4654         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4655                 hlist_del_rcu(&napi->napi_hash_node);
4656
4657         spin_unlock(&napi_hash_lock);
4658 }
4659 EXPORT_SYMBOL_GPL(napi_hash_del);
4660
4661 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
4662 {
4663         struct napi_struct *napi;
4664
4665         napi = container_of(timer, struct napi_struct, timer);
4666         if (napi->gro_list)
4667                 napi_schedule(napi);
4668
4669         return HRTIMER_NORESTART;
4670 }
4671
4672 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4673                     int (*poll)(struct napi_struct *, int), int weight)
4674 {
4675         INIT_LIST_HEAD(&napi->poll_list);
4676         hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
4677         napi->timer.function = napi_watchdog;
4678         napi->gro_count = 0;
4679         napi->gro_list = NULL;
4680         napi->skb = NULL;
4681         napi->poll = poll;
4682         if (weight > NAPI_POLL_WEIGHT)
4683                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4684                             weight, dev->name);
4685         napi->weight = weight;
4686         list_add(&napi->dev_list, &dev->napi_list);
4687         napi->dev = dev;
4688 #ifdef CONFIG_NETPOLL
4689         spin_lock_init(&napi->poll_lock);
4690         napi->poll_owner = -1;
4691 #endif
4692         set_bit(NAPI_STATE_SCHED, &napi->state);
4693 }
4694 EXPORT_SYMBOL(netif_napi_add);
4695
4696 void napi_disable(struct napi_struct *n)
4697 {
4698         might_sleep();
4699         set_bit(NAPI_STATE_DISABLE, &n->state);
4700
4701         while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
4702                 msleep(1);
4703
4704         hrtimer_cancel(&n->timer);
4705
4706         clear_bit(NAPI_STATE_DISABLE, &n->state);
4707 }
4708 EXPORT_SYMBOL(napi_disable);
4709
4710 void netif_napi_del(struct napi_struct *napi)
4711 {
4712         list_del_init(&napi->dev_list);
4713         napi_free_frags(napi);
4714
4715         kfree_skb_list(napi->gro_list);
4716         napi->gro_list = NULL;
4717         napi->gro_count = 0;
4718 }
4719 EXPORT_SYMBOL(netif_napi_del);
4720
4721 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
4722 {
4723         void *have;
4724         int work, weight;
4725
4726         list_del_init(&n->poll_list);
4727
4728         have = netpoll_poll_lock(n);
4729
4730         weight = n->weight;
4731
4732         /* This NAPI_STATE_SCHED test is for avoiding a race
4733          * with netpoll's poll_napi().  Only the entity which
4734          * obtains the lock and sees NAPI_STATE_SCHED set will
4735          * actually make the ->poll() call.  Therefore we avoid
4736          * accidentally calling ->poll() when NAPI is not scheduled.
4737          */
4738         work = 0;
4739         if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4740                 work = n->poll(n, weight);
4741                 trace_napi_poll(n);
4742         }
4743
4744         WARN_ON_ONCE(work > weight);
4745
4746         if (likely(work < weight))
4747                 goto out_unlock;
4748
4749         /* Drivers must not modify the NAPI state if they
4750          * consume the entire weight.  In such cases this code
4751          * still "owns" the NAPI instance and therefore can
4752          * move the instance around on the list at-will.
4753          */
4754         if (unlikely(napi_disable_pending(n))) {
4755                 napi_complete(n);
4756                 goto out_unlock;
4757         }
4758
4759         if (n->gro_list) {
4760                 /* flush too old packets
4761                  * If HZ < 1000, flush all packets.
4762                  */
4763                 napi_gro_flush(n, HZ >= 1000);
4764         }
4765
4766         /* Some drivers may have called napi_schedule
4767          * prior to exhausting their budget.
4768          */
4769         if (unlikely(!list_empty(&n->poll_list))) {
4770                 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
4771                              n->dev ? n->dev->name : "backlog");
4772                 goto out_unlock;
4773         }
4774
4775         list_add_tail(&n->poll_list, repoll);
4776
4777 out_unlock:
4778         netpoll_poll_unlock(have);
4779
4780         return work;
4781 }
4782
4783 static void net_rx_action(struct softirq_action *h)
4784 {
4785         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4786         unsigned long time_limit = jiffies + 2;
4787         int budget = netdev_budget;
4788         LIST_HEAD(list);
4789         LIST_HEAD(repoll);
4790
4791         local_irq_disable();
4792         list_splice_init(&sd->poll_list, &list);
4793         local_irq_enable();
4794
4795         for (;;) {
4796                 struct napi_struct *n;
4797
4798                 if (list_empty(&list)) {
4799                         if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
4800                                 return;
4801                         break;
4802                 }
4803
4804                 n = list_first_entry(&list, struct napi_struct, poll_list);
4805                 budget -= napi_poll(n, &repoll);
4806
4807                 /* If softirq window is exhausted then punt.
4808                  * Allow this to run for 2 jiffies since which will allow
4809                  * an average latency of 1.5/HZ.
4810                  */
4811                 if (unlikely(budget <= 0 ||
4812                              time_after_eq(jiffies, time_limit))) {
4813                         sd->time_squeeze++;
4814                         break;
4815                 }
4816         }
4817
4818         local_irq_disable();
4819
4820         list_splice_tail_init(&sd->poll_list, &list);
4821         list_splice_tail(&repoll, &list);
4822         list_splice(&list, &sd->poll_list);
4823         if (!list_empty(&sd->poll_list))
4824                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4825
4826         net_rps_action_and_irq_enable(sd);
4827 }
4828
4829 struct netdev_adjacent {
4830         struct net_device *dev;
4831
4832         /* upper master flag, there can only be one master device per list */
4833         bool master;
4834
4835         /* counter for the number of times this device was added to us */
4836         u16 ref_nr;
4837
4838         /* private field for the users */
4839         void *private;
4840
4841         struct list_head list;
4842         struct rcu_head rcu;
4843 };
4844
4845 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4846                                                  struct net_device *adj_dev,
4847                                                  struct list_head *adj_list)
4848 {
4849         struct netdev_adjacent *adj;
4850
4851         list_for_each_entry(adj, adj_list, list) {
4852                 if (adj->dev == adj_dev)
4853                         return adj;
4854         }
4855         return NULL;
4856 }
4857
4858 /**
4859  * netdev_has_upper_dev - Check if device is linked to an upper device
4860  * @dev: device
4861  * @upper_dev: upper device to check
4862  *
4863  * Find out if a device is linked to specified upper device and return true
4864  * in case it is. Note that this checks only immediate upper device,
4865  * not through a complete stack of devices. The caller must hold the RTNL lock.
4866  */
4867 bool netdev_has_upper_dev(struct net_device *dev,
4868                           struct net_device *upper_dev)
4869 {
4870         ASSERT_RTNL();
4871
4872         return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4873 }
4874 EXPORT_SYMBOL(netdev_has_upper_dev);
4875
4876 /**
4877  * netdev_has_any_upper_dev - Check if device is linked to some device
4878  * @dev: device
4879  *
4880  * Find out if a device is linked to an upper device and return true in case
4881  * it is. The caller must hold the RTNL lock.
4882  */
4883 static bool netdev_has_any_upper_dev(struct net_device *dev)
4884 {
4885         ASSERT_RTNL();
4886
4887         return !list_empty(&dev->all_adj_list.upper);
4888 }
4889
4890 /**
4891  * netdev_master_upper_dev_get - Get master upper device
4892  * @dev: device
4893  *
4894  * Find a master upper device and return pointer to it or NULL in case
4895  * it's not there. The caller must hold the RTNL lock.
4896  */
4897 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4898 {
4899         struct netdev_adjacent *upper;
4900
4901         ASSERT_RTNL();
4902
4903         if (list_empty(&dev->adj_list.upper))
4904                 return NULL;
4905
4906         upper = list_first_entry(&dev->adj_list.upper,
4907                                  struct netdev_adjacent, list);
4908         if (likely(upper->master))
4909                 return upper->dev;
4910         return NULL;
4911 }
4912 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4913
4914 void *netdev_adjacent_get_private(struct list_head *adj_list)
4915 {
4916         struct netdev_adjacent *adj;
4917
4918         adj = list_entry(adj_list, struct netdev_adjacent, list);
4919
4920         return adj->private;
4921 }
4922 EXPORT_SYMBOL(netdev_adjacent_get_private);
4923
4924 /**
4925  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4926  * @dev: device
4927  * @iter: list_head ** of the current position
4928  *
4929  * Gets the next device from the dev's upper list, starting from iter
4930  * position. The caller must hold RCU read lock.
4931  */
4932 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4933                                                  struct list_head **iter)
4934 {
4935         struct netdev_adjacent *upper;
4936
4937         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4938
4939         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4940
4941         if (&upper->list == &dev->adj_list.upper)
4942                 return NULL;
4943
4944         *iter = &upper->list;
4945
4946         return upper->dev;
4947 }
4948 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
4949
4950 /**
4951  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4952  * @dev: device
4953  * @iter: list_head ** of the current position
4954  *
4955  * Gets the next device from the dev's upper list, starting from iter
4956  * position. The caller must hold RCU read lock.
4957  */
4958 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4959                                                      struct list_head **iter)
4960 {
4961         struct netdev_adjacent *upper;
4962
4963         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4964
4965         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4966
4967         if (&upper->list == &dev->all_adj_list.upper)
4968                 return NULL;
4969
4970         *iter = &upper->list;
4971
4972         return upper->dev;
4973 }
4974 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
4975
4976 /**
4977  * netdev_lower_get_next_private - Get the next ->private from the
4978  *                                 lower neighbour list
4979  * @dev: device
4980  * @iter: list_head ** of the current position
4981  *
4982  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4983  * list, starting from iter position. The caller must hold either hold the
4984  * RTNL lock or its own locking that guarantees that the neighbour lower
4985  * list will remain unchainged.
4986  */
4987 void *netdev_lower_get_next_private(struct net_device *dev,
4988                                     struct list_head **iter)
4989 {
4990         struct netdev_adjacent *lower;
4991
4992         lower = list_entry(*iter, struct netdev_adjacent, list);
4993
4994         if (&lower->list == &dev->adj_list.lower)
4995                 return NULL;
4996
4997         *iter = lower->list.next;
4998
4999         return lower->private;
5000 }
5001 EXPORT_SYMBOL(netdev_lower_get_next_private);
5002
5003 /**
5004  * netdev_lower_get_next_private_rcu - Get the next ->private from the
5005  *                                     lower neighbour list, RCU
5006  *                                     variant
5007  * @dev: device
5008  * @iter: list_head ** of the current position
5009  *
5010  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5011  * list, starting from iter position. The caller must hold RCU read lock.
5012  */
5013 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5014                                         struct list_head **iter)
5015 {
5016         struct netdev_adjacent *lower;
5017
5018         WARN_ON_ONCE(!rcu_read_lock_held());
5019
5020         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5021
5022         if (&lower->list == &dev->adj_list.lower)
5023                 return NULL;
5024
5025         *iter = &lower->list;
5026
5027         return lower->private;
5028 }
5029 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5030
5031 /**
5032  * netdev_lower_get_next - Get the next device from the lower neighbour
5033  *                         list
5034  * @dev: device
5035  * @iter: list_head ** of the current position
5036  *
5037  * Gets the next netdev_adjacent from the dev's lower neighbour
5038  * list, starting from iter position. The caller must hold RTNL lock or
5039  * its own locking that guarantees that the neighbour lower
5040  * list will remain unchainged.
5041  */
5042 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5043 {
5044         struct netdev_adjacent *lower;
5045
5046         lower = list_entry((*iter)->next, struct netdev_adjacent, list);
5047
5048         if (&lower->list == &dev->adj_list.lower)
5049                 return NULL;
5050
5051         *iter = &lower->list;
5052
5053         return lower->dev;
5054 }
5055 EXPORT_SYMBOL(netdev_lower_get_next);
5056
5057 /**
5058  * netdev_lower_get_first_private_rcu - Get the first ->private from the
5059  *                                     lower neighbour list, RCU
5060  *                                     variant
5061  * @dev: device
5062  *
5063  * Gets the first netdev_adjacent->private from the dev's lower neighbour
5064  * list. The caller must hold RCU read lock.
5065  */
5066 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5067 {
5068         struct netdev_adjacent *lower;
5069
5070         lower = list_first_or_null_rcu(&dev->adj_list.lower,
5071                         struct netdev_adjacent, list);
5072         if (lower)
5073                 return lower->private;
5074         return NULL;
5075 }
5076 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5077
5078 /**
5079  * netdev_master_upper_dev_get_rcu - Get master upper device
5080  * @dev: device
5081  *
5082  * Find a master upper device and return pointer to it or NULL in case
5083  * it's not there. The caller must hold the RCU read lock.
5084  */
5085 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5086 {
5087         struct netdev_adjacent *upper;
5088
5089         upper = list_first_or_null_rcu(&dev->adj_list.upper,
5090                                        struct netdev_adjacent, list);
5091         if (upper && likely(upper->master))
5092                 return upper->dev;
5093         return NULL;
5094 }
5095 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5096
5097 static int netdev_adjacent_sysfs_add(struct net_device *dev,
5098                               struct net_device *adj_dev,
5099                               struct list_head *dev_list)
5100 {
5101         char linkname[IFNAMSIZ+7];
5102         sprintf(linkname, dev_list == &dev->adj_list.upper ?
5103                 "upper_%s" : "lower_%s", adj_dev->name);
5104         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5105                                  linkname);
5106 }
5107 static void netdev_adjacent_sysfs_del(struct net_device *dev,
5108                                char *name,
5109                                struct list_head *dev_list)
5110 {
5111         char linkname[IFNAMSIZ+7];
5112         sprintf(linkname, dev_list == &dev->adj_list.upper ?
5113                 "upper_%s" : "lower_%s", name);
5114         sysfs_remove_link(&(dev->dev.kobj), linkname);
5115 }
5116
5117 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5118                                                  struct net_device *adj_dev,
5119                                                  struct list_head *dev_list)
5120 {
5121         return (dev_list == &dev->adj_list.upper ||
5122                 dev_list == &dev->adj_list.lower) &&
5123                 net_eq(dev_net(dev), dev_net(adj_dev));
5124 }
5125
5126 static int __netdev_adjacent_dev_insert(struct net_device *dev,
5127                                         struct net_device *adj_dev,
5128                                         struct list_head *dev_list,
5129                                         void *private, bool master)
5130 {
5131         struct netdev_adjacent *adj;
5132         int ret;
5133
5134         adj = __netdev_find_adj(dev, adj_dev, dev_list);
5135
5136         if (adj) {
5137                 adj->ref_nr++;
5138                 return 0;
5139         }
5140
5141         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5142         if (!adj)
5143                 return -ENOMEM;
5144
5145         adj->dev = adj_dev;
5146         adj->master = master;
5147         adj->ref_nr = 1;
5148         adj->private = private;
5149         dev_hold(adj_dev);
5150
5151         pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5152                  adj_dev->name, dev->name, adj_dev->name);
5153
5154         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5155                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5156                 if (ret)
5157                         goto free_adj;
5158         }
5159
5160         /* Ensure that master link is always the first item in list. */
5161         if (master) {
5162                 ret = sysfs_create_link(&(dev->dev.kobj),
5163                                         &(adj_dev->dev.kobj), "master");
5164                 if (ret)
5165                         goto remove_symlinks;
5166
5167                 list_add_rcu(&adj->list, dev_list);
5168         } else {
5169                 list_add_tail_rcu(&adj->list, dev_list);
5170         }
5171
5172         return 0;
5173
5174 remove_symlinks:
5175         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5176                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5177 free_adj:
5178         kfree(adj);
5179         dev_put(adj_dev);
5180
5181         return ret;
5182 }
5183
5184 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5185                                          struct net_device *adj_dev,
5186                                          struct list_head *dev_list)
5187 {
5188         struct netdev_adjacent *adj;
5189
5190         adj = __netdev_find_adj(dev, adj_dev, dev_list);
5191
5192         if (!adj) {
5193                 pr_err("tried to remove device %s from %s\n",
5194                        dev->name, adj_dev->name);
5195                 BUG();
5196         }
5197
5198         if (adj->ref_nr > 1) {
5199                 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
5200                          adj->ref_nr-1);
5201                 adj->ref_nr--;
5202                 return;
5203         }
5204
5205         if (adj->master)
5206                 sysfs_remove_link(&(dev->dev.kobj), "master");
5207
5208         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5209                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5210
5211         list_del_rcu(&adj->list);
5212         pr_debug("dev_put for %s, because link removed from %s to %s\n",
5213                  adj_dev->name, dev->name, adj_dev->name);
5214         dev_put(adj_dev);
5215         kfree_rcu(adj, rcu);
5216 }
5217
5218 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5219                                             struct net_device *upper_dev,
5220                                             struct list_head *up_list,
5221                                             struct list_head *down_list,
5222                                             void *private, bool master)
5223 {
5224         int ret;
5225
5226         ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5227                                            master);
5228         if (ret)
5229                 return ret;
5230
5231         ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5232                                            false);
5233         if (ret) {
5234                 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5235                 return ret;
5236         }
5237
5238         return 0;
5239 }
5240
5241 static int __netdev_adjacent_dev_link(struct net_device *dev,
5242                                       struct net_device *upper_dev)
5243 {
5244         return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5245                                                 &dev->all_adj_list.upper,
5246                                                 &upper_dev->all_adj_list.lower,
5247                                                 NULL, false);
5248 }
5249
5250 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5251                                                struct net_device *upper_dev,
5252                                                struct list_head *up_list,
5253                                                struct list_head *down_list)
5254 {
5255         __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5256         __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
5257 }
5258
5259 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5260                                          struct net_device *upper_dev)
5261 {
5262         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5263                                            &dev->all_adj_list.upper,
5264                                            &upper_dev->all_adj_list.lower);
5265 }
5266
5267 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5268                                                 struct net_device *upper_dev,
5269                                                 void *private, bool master)
5270 {
5271         int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5272
5273         if (ret)
5274                 return ret;
5275
5276         ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5277                                                &dev->adj_list.upper,
5278                                                &upper_dev->adj_list.lower,
5279                                                private, master);
5280         if (ret) {
5281                 __netdev_adjacent_dev_unlink(dev, upper_dev);
5282                 return ret;
5283         }
5284
5285         return 0;
5286 }
5287
5288 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5289                                                    struct net_device *upper_dev)
5290 {
5291         __netdev_adjacent_dev_unlink(dev, upper_dev);
5292         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5293                                            &dev->adj_list.upper,
5294                                            &upper_dev->adj_list.lower);
5295 }
5296
5297 static int __netdev_upper_dev_link(struct net_device *dev,
5298                                    struct net_device *upper_dev, bool master,
5299                                    void *private)
5300 {
5301         struct netdev_adjacent *i, *j, *to_i, *to_j;
5302         int ret = 0;
5303
5304         ASSERT_RTNL();
5305
5306         if (dev == upper_dev)
5307                 return -EBUSY;
5308
5309         /* To prevent loops, check if dev is not upper device to upper_dev. */
5310         if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
5311                 return -EBUSY;
5312
5313         if (__netdev_find_adj(dev, upper_dev, &dev->adj_list.upper))
5314                 return -EEXIST;
5315
5316         if (master && netdev_master_upper_dev_get(dev))
5317                 return -EBUSY;
5318
5319         ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5320                                                    master);
5321         if (ret)
5322                 return ret;
5323
5324         /* Now that we linked these devs, make all the upper_dev's
5325          * all_adj_list.upper visible to every dev's all_adj_list.lower an
5326          * versa, and don't forget the devices itself. All of these
5327          * links are non-neighbours.
5328          */
5329         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5330                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5331                         pr_debug("Interlinking %s with %s, non-neighbour\n",
5332                                  i->dev->name, j->dev->name);
5333                         ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5334                         if (ret)
5335                                 goto rollback_mesh;
5336                 }
5337         }
5338
5339         /* add dev to every upper_dev's upper device */
5340         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5341                 pr_debug("linking %s's upper device %s with %s\n",
5342                          upper_dev->name, i->dev->name, dev->name);
5343                 ret = __netdev_adjacent_dev_link(dev, i->dev);
5344                 if (ret)
5345                         goto rollback_upper_mesh;
5346         }
5347
5348         /* add upper_dev to every dev's lower device */
5349         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5350                 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5351                          i->dev->name, upper_dev->name);
5352                 ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5353                 if (ret)
5354                         goto rollback_lower_mesh;
5355         }
5356
5357         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5358         return 0;
5359
5360 rollback_lower_mesh:
5361         to_i = i;
5362         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5363                 if (i == to_i)
5364                         break;
5365                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5366         }
5367
5368         i = NULL;
5369
5370 rollback_upper_mesh:
5371         to_i = i;
5372         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5373                 if (i == to_i)
5374                         break;
5375                 __netdev_adjacent_dev_unlink(dev, i->dev);
5376         }
5377
5378         i = j = NULL;
5379
5380 rollback_mesh:
5381         to_i = i;
5382         to_j = j;
5383         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5384                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5385                         if (i == to_i && j == to_j)
5386                                 break;
5387                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5388                 }
5389                 if (i == to_i)
5390                         break;
5391         }
5392
5393         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5394
5395         return ret;
5396 }
5397
5398 /**
5399  * netdev_upper_dev_link - Add a link to the upper device
5400  * @dev: device
5401  * @upper_dev: new upper device
5402  *
5403  * Adds a link to device which is upper to this one. The caller must hold
5404  * the RTNL lock. On a failure a negative errno code is returned.
5405  * On success the reference counts are adjusted and the function
5406  * returns zero.
5407  */
5408 int netdev_upper_dev_link(struct net_device *dev,
5409                           struct net_device *upper_dev)
5410 {
5411         return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5412 }
5413 EXPORT_SYMBOL(netdev_upper_dev_link);
5414
5415 /**
5416  * netdev_master_upper_dev_link - Add a master link to the upper device
5417  * @dev: device
5418  * @upper_dev: new upper device
5419  *
5420  * Adds a link to device which is upper to this one. In this case, only
5421  * one master upper device can be linked, although other non-master devices
5422  * might be linked as well. The caller must hold the RTNL lock.
5423  * On a failure a negative errno code is returned. On success the reference
5424  * counts are adjusted and the function returns zero.
5425  */
5426 int netdev_master_upper_dev_link(struct net_device *dev,
5427                                  struct net_device *upper_dev)
5428 {
5429         return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5430 }
5431 EXPORT_SYMBOL(netdev_master_upper_dev_link);
5432
5433 int netdev_master_upper_dev_link_private(struct net_device *dev,
5434                                          struct net_device *upper_dev,
5435                                          void *private)
5436 {
5437         return __netdev_upper_dev_link(dev, upper_dev, true, private);
5438 }
5439 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5440
5441 /**
5442  * netdev_upper_dev_unlink - Removes a link to upper device
5443  * @dev: device
5444  * @upper_dev: new upper device
5445  *
5446  * Removes a link to device which is upper to this one. The caller must hold
5447  * the RTNL lock.
5448  */
5449 void netdev_upper_dev_unlink(struct net_device *dev,
5450                              struct net_device *upper_dev)
5451 {
5452         struct netdev_adjacent *i, *j;
5453         ASSERT_RTNL();
5454
5455         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5456
5457         /* Here is the tricky part. We must remove all dev's lower
5458          * devices from all upper_dev's upper devices and vice
5459          * versa, to maintain the graph relationship.
5460          */
5461         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5462                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5463                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5464
5465         /* remove also the devices itself from lower/upper device
5466          * list
5467          */
5468         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5469                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5470
5471         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5472                 __netdev_adjacent_dev_unlink(dev, i->dev);
5473
5474         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5475 }
5476 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5477
5478 /**
5479  * netdev_bonding_info_change - Dispatch event about slave change
5480  * @dev: device
5481  * @bonding_info: info to dispatch
5482  *
5483  * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5484  * The caller must hold the RTNL lock.
5485  */
5486 void netdev_bonding_info_change(struct net_device *dev,
5487                                 struct netdev_bonding_info *bonding_info)
5488 {
5489         struct netdev_notifier_bonding_info     info;
5490
5491         memcpy(&info.bonding_info, bonding_info,
5492                sizeof(struct netdev_bonding_info));
5493         call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5494                                       &info.info);
5495 }
5496 EXPORT_SYMBOL(netdev_bonding_info_change);
5497
5498 static void netdev_adjacent_add_links(struct net_device *dev)
5499 {
5500         struct netdev_adjacent *iter;
5501
5502         struct net *net = dev_net(dev);
5503
5504         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5505                 if (!net_eq(net,dev_net(iter->dev)))
5506                         continue;
5507                 netdev_adjacent_sysfs_add(iter->dev, dev,
5508                                           &iter->dev->adj_list.lower);
5509                 netdev_adjacent_sysfs_add(dev, iter->dev,
5510                                           &dev->adj_list.upper);
5511         }
5512
5513         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5514                 if (!net_eq(net,dev_net(iter->dev)))
5515                         continue;
5516                 netdev_adjacent_sysfs_add(iter->dev, dev,
5517                                           &iter->dev->adj_list.upper);
5518                 netdev_adjacent_sysfs_add(dev, iter->dev,
5519                                           &dev->adj_list.lower);
5520         }
5521 }
5522
5523 static void netdev_adjacent_del_links(struct net_device *dev)
5524 {
5525         struct netdev_adjacent *iter;
5526
5527         struct net *net = dev_net(dev);
5528
5529         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5530                 if (!net_eq(net,dev_net(iter->dev)))
5531                         continue;
5532                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5533                                           &iter->dev->adj_list.lower);
5534                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5535                                           &dev->adj_list.upper);
5536         }
5537
5538         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5539                 if (!net_eq(net,dev_net(iter->dev)))
5540                         continue;
5541                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5542                                           &iter->dev->adj_list.upper);
5543                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5544                                           &dev->adj_list.lower);
5545         }
5546 }
5547
5548 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5549 {
5550         struct netdev_adjacent *iter;
5551
5552         struct net *net = dev_net(dev);
5553
5554         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5555                 if (!net_eq(net,dev_net(iter->dev)))
5556                         continue;
5557                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5558                                           &iter->dev->adj_list.lower);
5559                 netdev_adjacent_sysfs_add(iter->dev, dev,
5560                                           &iter->dev->adj_list.lower);
5561         }
5562
5563         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5564                 if (!net_eq(net,dev_net(iter->dev)))
5565                         continue;
5566                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5567                                           &iter->dev->adj_list.upper);
5568                 netdev_adjacent_sysfs_add(iter->dev, dev,
5569                                           &iter->dev->adj_list.upper);
5570         }
5571 }
5572
5573 void *netdev_lower_dev_get_private(struct net_device *dev,
5574                                    struct net_device *lower_dev)
5575 {
5576         struct netdev_adjacent *lower;
5577
5578         if (!lower_dev)
5579                 return NULL;
5580         lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5581         if (!lower)
5582                 return NULL;
5583
5584         return lower->private;
5585 }
5586 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5587
5588
5589 int dev_get_nest_level(struct net_device *dev,
5590                        bool (*type_check)(struct net_device *dev))
5591 {
5592         struct net_device *lower = NULL;
5593         struct list_head *iter;
5594         int max_nest = -1;
5595         int nest;
5596
5597         ASSERT_RTNL();
5598
5599         netdev_for_each_lower_dev(dev, lower, iter) {
5600                 nest = dev_get_nest_level(lower, type_check);
5601                 if (max_nest < nest)
5602                         max_nest = nest;
5603         }
5604
5605         if (type_check(dev))
5606                 max_nest++;
5607
5608         return max_nest;
5609 }
5610 EXPORT_SYMBOL(dev_get_nest_level);
5611
5612 static void dev_change_rx_flags(struct net_device *dev, int flags)
5613 {
5614         const struct net_device_ops *ops = dev->netdev_ops;
5615
5616         if (ops->ndo_change_rx_flags)
5617                 ops->ndo_change_rx_flags(dev, flags);
5618 }
5619
5620 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5621 {
5622         unsigned int old_flags = dev->flags;
5623         kuid_t uid;
5624         kgid_t gid;
5625
5626         ASSERT_RTNL();
5627
5628         dev->flags |= IFF_PROMISC;
5629         dev->promiscuity += inc;
5630         if (dev->promiscuity == 0) {
5631                 /*
5632                  * Avoid overflow.
5633                  * If inc causes overflow, untouch promisc and return error.
5634                  */
5635                 if (inc < 0)
5636                         dev->flags &= ~IFF_PROMISC;
5637                 else {
5638                         dev->promiscuity -= inc;
5639                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5640                                 dev->name);
5641                         return -EOVERFLOW;
5642                 }
5643         }
5644         if (dev->flags != old_flags) {
5645                 pr_info("device %s %s promiscuous mode\n",
5646                         dev->name,
5647                         dev->flags & IFF_PROMISC ? "entered" : "left");
5648                 if (audit_enabled) {
5649                         current_uid_gid(&uid, &gid);
5650                         audit_log(current->audit_context, GFP_ATOMIC,
5651                                 AUDIT_ANOM_PROMISCUOUS,
5652                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5653                                 dev->name, (dev->flags & IFF_PROMISC),
5654                                 (old_flags & IFF_PROMISC),
5655                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
5656                                 from_kuid(&init_user_ns, uid),
5657                                 from_kgid(&init_user_ns, gid),
5658                                 audit_get_sessionid(current));
5659                 }
5660
5661                 dev_change_rx_flags(dev, IFF_PROMISC);
5662         }
5663         if (notify)
5664                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
5665         return 0;
5666 }
5667
5668 /**
5669  *      dev_set_promiscuity     - update promiscuity count on a device
5670  *      @dev: device
5671  *      @inc: modifier
5672  *
5673  *      Add or remove promiscuity from a device. While the count in the device
5674  *      remains above zero the interface remains promiscuous. Once it hits zero
5675  *      the device reverts back to normal filtering operation. A negative inc
5676  *      value is used to drop promiscuity on the device.
5677  *      Return 0 if successful or a negative errno code on error.
5678  */
5679 int dev_set_promiscuity(struct net_device *dev, int inc)
5680 {
5681         unsigned int old_flags = dev->flags;
5682         int err;
5683
5684         err = __dev_set_promiscuity(dev, inc, true);
5685         if (err < 0)
5686                 return err;
5687         if (dev->flags != old_flags)
5688                 dev_set_rx_mode(dev);
5689         return err;
5690 }
5691 EXPORT_SYMBOL(dev_set_promiscuity);
5692
5693 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5694 {
5695         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5696
5697         ASSERT_RTNL();
5698
5699         dev->flags |= IFF_ALLMULTI;
5700         dev->allmulti += inc;
5701         if (dev->allmulti == 0) {
5702                 /*
5703                  * Avoid overflow.
5704                  * If inc causes overflow, untouch allmulti and return error.
5705                  */
5706                 if (inc < 0)
5707                         dev->flags &= ~IFF_ALLMULTI;
5708                 else {
5709                         dev->allmulti -= inc;
5710                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5711                                 dev->name);
5712                         return -EOVERFLOW;
5713                 }
5714         }
5715         if (dev->flags ^ old_flags) {
5716                 dev_change_rx_flags(dev, IFF_ALLMULTI);
5717                 dev_set_rx_mode(dev);
5718                 if (notify)
5719                         __dev_notify_flags(dev, old_flags,
5720                                            dev->gflags ^ old_gflags);
5721         }
5722         return 0;
5723 }
5724
5725 /**
5726  *      dev_set_allmulti        - update allmulti count on a device
5727  *      @dev: device
5728  *      @inc: modifier
5729  *
5730  *      Add or remove reception of all multicast frames to a device. While the
5731  *      count in the device remains above zero the interface remains listening
5732  *      to all interfaces. Once it hits zero the device reverts back to normal
5733  *      filtering operation. A negative @inc value is used to drop the counter
5734  *      when releasing a resource needing all multicasts.
5735  *      Return 0 if successful or a negative errno code on error.
5736  */
5737
5738 int dev_set_allmulti(struct net_device *dev, int inc)
5739 {
5740         return __dev_set_allmulti(dev, inc, true);
5741 }
5742 EXPORT_SYMBOL(dev_set_allmulti);
5743
5744 /*
5745  *      Upload unicast and multicast address lists to device and
5746  *      configure RX filtering. When the device doesn't support unicast
5747  *      filtering it is put in promiscuous mode while unicast addresses
5748  *      are present.
5749  */
5750 void __dev_set_rx_mode(struct net_device *dev)
5751 {
5752         const struct net_device_ops *ops = dev->netdev_ops;
5753
5754         /* dev_open will call this function so the list will stay sane. */
5755         if (!(dev->flags&IFF_UP))
5756                 return;
5757
5758         if (!netif_device_present(dev))
5759                 return;
5760
5761         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5762                 /* Unicast addresses changes may only happen under the rtnl,
5763                  * therefore calling __dev_set_promiscuity here is safe.
5764                  */
5765                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5766                         __dev_set_promiscuity(dev, 1, false);
5767                         dev->uc_promisc = true;
5768                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5769                         __dev_set_promiscuity(dev, -1, false);
5770                         dev->uc_promisc = false;
5771                 }
5772         }
5773
5774         if (ops->ndo_set_rx_mode)
5775                 ops->ndo_set_rx_mode(dev);
5776 }
5777
5778 void dev_set_rx_mode(struct net_device *dev)
5779 {
5780         netif_addr_lock_bh(dev);
5781         __dev_set_rx_mode(dev);
5782         netif_addr_unlock_bh(dev);
5783 }
5784
5785 /**
5786  *      dev_get_flags - get flags reported to userspace
5787  *      @dev: device
5788  *
5789  *      Get the combination of flag bits exported through APIs to userspace.
5790  */
5791 unsigned int dev_get_flags(const struct net_device *dev)
5792 {
5793         unsigned int flags;
5794
5795         flags = (dev->flags & ~(IFF_PROMISC |
5796                                 IFF_ALLMULTI |
5797                                 IFF_RUNNING |
5798                                 IFF_LOWER_UP |
5799                                 IFF_DORMANT)) |
5800                 (dev->gflags & (IFF_PROMISC |
5801                                 IFF_ALLMULTI));
5802
5803         if (netif_running(dev)) {
5804                 if (netif_oper_up(dev))
5805                         flags |= IFF_RUNNING;
5806                 if (netif_carrier_ok(dev))
5807                         flags |= IFF_LOWER_UP;
5808                 if (netif_dormant(dev))
5809                         flags |= IFF_DORMANT;
5810         }
5811
5812         return flags;
5813 }
5814 EXPORT_SYMBOL(dev_get_flags);
5815
5816 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5817 {
5818         unsigned int old_flags = dev->flags;
5819         int ret;
5820
5821         ASSERT_RTNL();
5822
5823         /*
5824          *      Set the flags on our device.
5825          */
5826
5827         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5828                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5829                                IFF_AUTOMEDIA)) |
5830                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5831                                     IFF_ALLMULTI));
5832
5833         /*
5834          *      Load in the correct multicast list now the flags have changed.
5835          */
5836
5837         if ((old_flags ^ flags) & IFF_MULTICAST)
5838                 dev_change_rx_flags(dev, IFF_MULTICAST);
5839
5840         dev_set_rx_mode(dev);
5841
5842         /*
5843          *      Have we downed the interface. We handle IFF_UP ourselves
5844          *      according to user attempts to set it, rather than blindly
5845          *      setting it.
5846          */
5847
5848         ret = 0;
5849         if ((old_flags ^ flags) & IFF_UP)
5850                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5851
5852         if ((flags ^ dev->gflags) & IFF_PROMISC) {
5853                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
5854                 unsigned int old_flags = dev->flags;
5855
5856                 dev->gflags ^= IFF_PROMISC;
5857
5858                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
5859                         if (dev->flags != old_flags)
5860                                 dev_set_rx_mode(dev);
5861         }
5862
5863         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5864            is important. Some (broken) drivers set IFF_PROMISC, when
5865            IFF_ALLMULTI is requested not asking us and not reporting.
5866          */
5867         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5868                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5869
5870                 dev->gflags ^= IFF_ALLMULTI;
5871                 __dev_set_allmulti(dev, inc, false);
5872         }
5873
5874         return ret;
5875 }
5876
5877 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5878                         unsigned int gchanges)
5879 {
5880         unsigned int changes = dev->flags ^ old_flags;
5881
5882         if (gchanges)
5883                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5884
5885         if (changes & IFF_UP) {
5886                 if (dev->flags & IFF_UP)
5887                         call_netdevice_notifiers(NETDEV_UP, dev);
5888                 else
5889                         call_netdevice_notifiers(NETDEV_DOWN, dev);
5890         }
5891
5892         if (dev->flags & IFF_UP &&
5893             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5894                 struct netdev_notifier_change_info change_info;
5895
5896                 change_info.flags_changed = changes;
5897                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5898                                               &change_info.info);
5899         }
5900 }
5901
5902 /**
5903  *      dev_change_flags - change device settings
5904  *      @dev: device
5905  *      @flags: device state flags
5906  *
5907  *      Change settings on device based state flags. The flags are
5908  *      in the userspace exported format.
5909  */
5910 int dev_change_flags(struct net_device *dev, unsigned int flags)
5911 {
5912         int ret;
5913         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5914
5915         ret = __dev_change_flags(dev, flags);
5916         if (ret < 0)
5917                 return ret;
5918
5919         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5920         __dev_notify_flags(dev, old_flags, changes);
5921         return ret;
5922 }
5923 EXPORT_SYMBOL(dev_change_flags);
5924
5925 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5926 {
5927         const struct net_device_ops *ops = dev->netdev_ops;
5928
5929         if (ops->ndo_change_mtu)
5930                 return ops->ndo_change_mtu(dev, new_mtu);
5931
5932         dev->mtu = new_mtu;
5933         return 0;
5934 }
5935
5936 /**
5937  *      dev_set_mtu - Change maximum transfer unit
5938  *      @dev: device
5939  *      @new_mtu: new transfer unit
5940  *
5941  *      Change the maximum transfer size of the network device.
5942  */
5943 int dev_set_mtu(struct net_device *dev, int new_mtu)
5944 {
5945         int err, orig_mtu;
5946
5947         if (new_mtu == dev->mtu)
5948                 return 0;
5949
5950         /*      MTU must be positive.    */
5951         if (new_mtu < 0)
5952                 return -EINVAL;
5953
5954         if (!netif_device_present(dev))
5955                 return -ENODEV;
5956
5957         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5958         err = notifier_to_errno(err);
5959         if (err)
5960                 return err;
5961
5962         orig_mtu = dev->mtu;
5963         err = __dev_set_mtu(dev, new_mtu);
5964
5965         if (!err) {
5966                 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5967                 err = notifier_to_errno(err);
5968                 if (err) {
5969                         /* setting mtu back and notifying everyone again,
5970                          * so that they have a chance to revert changes.
5971                          */
5972                         __dev_set_mtu(dev, orig_mtu);
5973                         call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5974                 }
5975         }
5976         return err;
5977 }
5978 EXPORT_SYMBOL(dev_set_mtu);
5979
5980 /**
5981  *      dev_set_group - Change group this device belongs to
5982  *      @dev: device
5983  *      @new_group: group this device should belong to
5984  */
5985 void dev_set_group(struct net_device *dev, int new_group)
5986 {
5987         dev->group = new_group;
5988 }
5989 EXPORT_SYMBOL(dev_set_group);
5990
5991 /**
5992  *      dev_set_mac_address - Change Media Access Control Address
5993  *      @dev: device
5994  *      @sa: new address
5995  *
5996  *      Change the hardware (MAC) address of the device
5997  */
5998 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5999 {
6000         const struct net_device_ops *ops = dev->netdev_ops;
6001         int err;
6002
6003         if (!ops->ndo_set_mac_address)
6004                 return -EOPNOTSUPP;
6005         if (sa->sa_family != dev->type)
6006                 return -EINVAL;
6007         if (!netif_device_present(dev))
6008                 return -ENODEV;
6009         err = ops->ndo_set_mac_address(dev, sa);
6010         if (err)
6011                 return err;
6012         dev->addr_assign_type = NET_ADDR_SET;
6013         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6014         add_device_randomness(dev->dev_addr, dev->addr_len);
6015         return 0;
6016 }
6017 EXPORT_SYMBOL(dev_set_mac_address);
6018
6019 /**
6020  *      dev_change_carrier - Change device carrier
6021  *      @dev: device
6022  *      @new_carrier: new value
6023  *
6024  *      Change device carrier
6025  */
6026 int dev_change_carrier(struct net_device *dev, bool new_carrier)
6027 {
6028         const struct net_device_ops *ops = dev->netdev_ops;
6029
6030         if (!ops->ndo_change_carrier)
6031                 return -EOPNOTSUPP;
6032         if (!netif_device_present(dev))
6033                 return -ENODEV;
6034         return ops->ndo_change_carrier(dev, new_carrier);
6035 }
6036 EXPORT_SYMBOL(dev_change_carrier);
6037
6038 /**
6039  *      dev_get_phys_port_id - Get device physical port ID
6040  *      @dev: device
6041  *      @ppid: port ID
6042  *
6043  *      Get device physical port ID
6044  */
6045 int dev_get_phys_port_id(struct net_device *dev,
6046                          struct netdev_phys_item_id *ppid)
6047 {
6048         const struct net_device_ops *ops = dev->netdev_ops;
6049
6050         if (!ops->ndo_get_phys_port_id)
6051                 return -EOPNOTSUPP;
6052         return ops->ndo_get_phys_port_id(dev, ppid);
6053 }
6054 EXPORT_SYMBOL(dev_get_phys_port_id);
6055
6056 /**
6057  *      dev_get_phys_port_name - Get device physical port name
6058  *      @dev: device
6059  *      @name: port name
6060  *
6061  *      Get device physical port name
6062  */
6063 int dev_get_phys_port_name(struct net_device *dev,
6064                            char *name, size_t len)
6065 {
6066         const struct net_device_ops *ops = dev->netdev_ops;
6067
6068         if (!ops->ndo_get_phys_port_name)
6069                 return -EOPNOTSUPP;
6070         return ops->ndo_get_phys_port_name(dev, name, len);
6071 }
6072 EXPORT_SYMBOL(dev_get_phys_port_name);
6073
6074 /**
6075  *      dev_new_index   -       allocate an ifindex
6076  *      @net: the applicable net namespace
6077  *
6078  *      Returns a suitable unique value for a new device interface
6079  *      number.  The caller must hold the rtnl semaphore or the
6080  *      dev_base_lock to be sure it remains unique.
6081  */
6082 static int dev_new_index(struct net *net)
6083 {
6084         int ifindex = net->ifindex;
6085         for (;;) {
6086                 if (++ifindex <= 0)
6087                         ifindex = 1;
6088                 if (!__dev_get_by_index(net, ifindex))
6089                         return net->ifindex = ifindex;
6090         }
6091 }
6092
6093 /* Delayed registration/unregisteration */
6094 static LIST_HEAD(net_todo_list);
6095 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
6096
6097 static void net_set_todo(struct net_device *dev)
6098 {
6099         list_add_tail(&dev->todo_list, &net_todo_list);
6100         dev_net(dev)->dev_unreg_count++;
6101 }
6102
6103 static void rollback_registered_many(struct list_head *head)
6104 {
6105         struct net_device *dev, *tmp;
6106         LIST_HEAD(close_head);
6107
6108         BUG_ON(dev_boot_phase);
6109         ASSERT_RTNL();
6110
6111         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6112                 /* Some devices call without registering
6113                  * for initialization unwind. Remove those
6114                  * devices and proceed with the remaining.
6115                  */
6116                 if (dev->reg_state == NETREG_UNINITIALIZED) {
6117                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6118                                  dev->name, dev);
6119
6120                         WARN_ON(1);
6121                         list_del(&dev->unreg_list);
6122                         continue;
6123                 }
6124                 dev->dismantle = true;
6125                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
6126         }
6127
6128         /* If device is running, close it first. */
6129         list_for_each_entry(dev, head, unreg_list)
6130                 list_add_tail(&dev->close_list, &close_head);
6131         dev_close_many(&close_head, true);
6132
6133         list_for_each_entry(dev, head, unreg_list) {
6134                 /* And unlink it from device chain. */
6135                 unlist_netdevice(dev);
6136
6137                 dev->reg_state = NETREG_UNREGISTERING;
6138         }
6139
6140         synchronize_net();
6141
6142         list_for_each_entry(dev, head, unreg_list) {
6143                 struct sk_buff *skb = NULL;
6144
6145                 /* Shutdown queueing discipline. */
6146                 dev_shutdown(dev);
6147
6148
6149                 /* Notify protocols, that we are about to destroy
6150                    this device. They should clean all the things.
6151                 */
6152                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6153
6154                 if (!dev->rtnl_link_ops ||
6155                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6156                         skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6157                                                      GFP_KERNEL);
6158
6159                 /*
6160                  *      Flush the unicast and multicast chains
6161                  */
6162                 dev_uc_flush(dev);
6163                 dev_mc_flush(dev);
6164
6165                 if (dev->netdev_ops->ndo_uninit)
6166                         dev->netdev_ops->ndo_uninit(dev);
6167
6168                 if (skb)
6169                         rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6170
6171                 /* Notifier chain MUST detach us all upper devices. */
6172                 WARN_ON(netdev_has_any_upper_dev(dev));
6173
6174                 /* Remove entries from kobject tree */
6175                 netdev_unregister_kobject(dev);
6176 #ifdef CONFIG_XPS
6177                 /* Remove XPS queueing entries */
6178                 netif_reset_xps_queues_gt(dev, 0);
6179 #endif
6180         }
6181
6182         synchronize_net();
6183
6184         list_for_each_entry(dev, head, unreg_list)
6185                 dev_put(dev);
6186 }
6187
6188 static void rollback_registered(struct net_device *dev)
6189 {
6190         LIST_HEAD(single);
6191
6192         list_add(&dev->unreg_list, &single);
6193         rollback_registered_many(&single);
6194         list_del(&single);
6195 }
6196
6197 static netdev_features_t netdev_fix_features(struct net_device *dev,
6198         netdev_features_t features)
6199 {
6200         /* Fix illegal checksum combinations */
6201         if ((features & NETIF_F_HW_CSUM) &&
6202             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6203                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6204                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6205         }
6206
6207         /* TSO requires that SG is present as well. */
6208         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6209                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6210                 features &= ~NETIF_F_ALL_TSO;
6211         }
6212
6213         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6214                                         !(features & NETIF_F_IP_CSUM)) {
6215                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6216                 features &= ~NETIF_F_TSO;
6217                 features &= ~NETIF_F_TSO_ECN;
6218         }
6219
6220         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6221                                          !(features & NETIF_F_IPV6_CSUM)) {
6222                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6223                 features &= ~NETIF_F_TSO6;
6224         }
6225
6226         /* TSO ECN requires that TSO is present as well. */
6227         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6228                 features &= ~NETIF_F_TSO_ECN;
6229
6230         /* Software GSO depends on SG. */
6231         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6232                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6233                 features &= ~NETIF_F_GSO;
6234         }
6235
6236         /* UFO needs SG and checksumming */
6237         if (features & NETIF_F_UFO) {
6238                 /* maybe split UFO into V4 and V6? */
6239                 if (!((features & NETIF_F_GEN_CSUM) ||
6240                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
6241                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6242                         netdev_dbg(dev,
6243                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
6244                         features &= ~NETIF_F_UFO;
6245                 }
6246
6247                 if (!(features & NETIF_F_SG)) {
6248                         netdev_dbg(dev,
6249                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6250                         features &= ~NETIF_F_UFO;
6251                 }
6252         }
6253
6254 #ifdef CONFIG_NET_RX_BUSY_POLL
6255         if (dev->netdev_ops->ndo_busy_poll)
6256                 features |= NETIF_F_BUSY_POLL;
6257         else
6258 #endif
6259                 features &= ~NETIF_F_BUSY_POLL;
6260
6261         return features;
6262 }
6263
6264 int __netdev_update_features(struct net_device *dev)
6265 {
6266         netdev_features_t features;
6267         int err = 0;
6268
6269         ASSERT_RTNL();
6270
6271         features = netdev_get_wanted_features(dev);
6272
6273         if (dev->netdev_ops->ndo_fix_features)
6274                 features = dev->netdev_ops->ndo_fix_features(dev, features);
6275
6276         /* driver might be less strict about feature dependencies */
6277         features = netdev_fix_features(dev, features);
6278
6279         if (dev->features == features)
6280                 return 0;
6281
6282         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6283                 &dev->features, &features);
6284
6285         if (dev->netdev_ops->ndo_set_features)
6286                 err = dev->netdev_ops->ndo_set_features(dev, features);
6287
6288         if (unlikely(err < 0)) {
6289                 netdev_err(dev,
6290                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
6291                         err, &features, &dev->features);
6292                 return -1;
6293         }
6294
6295         if (!err)
6296                 dev->features = features;
6297
6298         return 1;
6299 }
6300
6301 /**
6302  *      netdev_update_features - recalculate device features
6303  *      @dev: the device to check
6304  *
6305  *      Recalculate dev->features set and send notifications if it
6306  *      has changed. Should be called after driver or hardware dependent
6307  *      conditions might have changed that influence the features.
6308  */
6309 void netdev_update_features(struct net_device *dev)
6310 {
6311         if (__netdev_update_features(dev))
6312                 netdev_features_change(dev);
6313 }
6314 EXPORT_SYMBOL(netdev_update_features);
6315
6316 /**
6317  *      netdev_change_features - recalculate device features
6318  *      @dev: the device to check
6319  *
6320  *      Recalculate dev->features set and send notifications even
6321  *      if they have not changed. Should be called instead of
6322  *      netdev_update_features() if also dev->vlan_features might
6323  *      have changed to allow the changes to be propagated to stacked
6324  *      VLAN devices.
6325  */
6326 void netdev_change_features(struct net_device *dev)
6327 {
6328         __netdev_update_features(dev);
6329         netdev_features_change(dev);
6330 }
6331 EXPORT_SYMBOL(netdev_change_features);
6332
6333 /**
6334  *      netif_stacked_transfer_operstate -      transfer operstate
6335  *      @rootdev: the root or lower level device to transfer state from
6336  *      @dev: the device to transfer operstate to
6337  *
6338  *      Transfer operational state from root to device. This is normally
6339  *      called when a stacking relationship exists between the root
6340  *      device and the device(a leaf device).
6341  */
6342 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6343                                         struct net_device *dev)
6344 {
6345         if (rootdev->operstate == IF_OPER_DORMANT)
6346                 netif_dormant_on(dev);
6347         else
6348                 netif_dormant_off(dev);
6349
6350         if (netif_carrier_ok(rootdev)) {
6351                 if (!netif_carrier_ok(dev))
6352                         netif_carrier_on(dev);
6353         } else {
6354                 if (netif_carrier_ok(dev))
6355                         netif_carrier_off(dev);
6356         }
6357 }
6358 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6359
6360 #ifdef CONFIG_SYSFS
6361 static int netif_alloc_rx_queues(struct net_device *dev)
6362 {
6363         unsigned int i, count = dev->num_rx_queues;
6364         struct netdev_rx_queue *rx;
6365         size_t sz = count * sizeof(*rx);
6366
6367         BUG_ON(count < 1);
6368
6369         rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6370         if (!rx) {
6371                 rx = vzalloc(sz);
6372                 if (!rx)
6373                         return -ENOMEM;
6374         }
6375         dev->_rx = rx;
6376
6377         for (i = 0; i < count; i++)
6378                 rx[i].dev = dev;
6379         return 0;
6380 }
6381 #endif
6382
6383 static void netdev_init_one_queue(struct net_device *dev,
6384                                   struct netdev_queue *queue, void *_unused)
6385 {
6386         /* Initialize queue lock */
6387         spin_lock_init(&queue->_xmit_lock);
6388         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6389         queue->xmit_lock_owner = -1;
6390         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6391         queue->dev = dev;
6392 #ifdef CONFIG_BQL
6393         dql_init(&queue->dql, HZ);
6394 #endif
6395 }
6396
6397 static void netif_free_tx_queues(struct net_device *dev)
6398 {
6399         kvfree(dev->_tx);
6400 }
6401
6402 static int netif_alloc_netdev_queues(struct net_device *dev)
6403 {
6404         unsigned int count = dev->num_tx_queues;
6405         struct netdev_queue *tx;
6406         size_t sz = count * sizeof(*tx);
6407
6408         if (count < 1 || count > 0xffff)
6409                 return -EINVAL;
6410
6411         tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6412         if (!tx) {
6413                 tx = vzalloc(sz);
6414                 if (!tx)
6415                         return -ENOMEM;
6416         }
6417         dev->_tx = tx;
6418
6419         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6420         spin_lock_init(&dev->tx_global_lock);
6421
6422         return 0;
6423 }
6424
6425 void netif_tx_stop_all_queues(struct net_device *dev)
6426 {
6427         unsigned int i;
6428
6429         for (i = 0; i < dev->num_tx_queues; i++) {
6430                 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
6431                 netif_tx_stop_queue(txq);
6432         }
6433 }
6434 EXPORT_SYMBOL(netif_tx_stop_all_queues);
6435
6436 /**
6437  *      register_netdevice      - register a network device
6438  *      @dev: device to register
6439  *
6440  *      Take a completed network device structure and add it to the kernel
6441  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6442  *      chain. 0 is returned on success. A negative errno code is returned
6443  *      on a failure to set up the device, or if the name is a duplicate.
6444  *
6445  *      Callers must hold the rtnl semaphore. You may want
6446  *      register_netdev() instead of this.
6447  *
6448  *      BUGS:
6449  *      The locking appears insufficient to guarantee two parallel registers
6450  *      will not get the same name.
6451  */
6452
6453 int register_netdevice(struct net_device *dev)
6454 {
6455         int ret;
6456         struct net *net = dev_net(dev);
6457
6458         BUG_ON(dev_boot_phase);
6459         ASSERT_RTNL();
6460
6461         might_sleep();
6462
6463         /* When net_device's are persistent, this will be fatal. */
6464         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6465         BUG_ON(!net);
6466
6467         spin_lock_init(&dev->addr_list_lock);
6468         netdev_set_addr_lockdep_class(dev);
6469
6470         ret = dev_get_valid_name(net, dev, dev->name);
6471         if (ret < 0)
6472                 goto out;
6473
6474         /* Init, if this function is available */
6475         if (dev->netdev_ops->ndo_init) {
6476                 ret = dev->netdev_ops->ndo_init(dev);
6477                 if (ret) {
6478                         if (ret > 0)
6479                                 ret = -EIO;
6480                         goto out;
6481                 }
6482         }
6483
6484         if (((dev->hw_features | dev->features) &
6485              NETIF_F_HW_VLAN_CTAG_FILTER) &&
6486             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6487              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6488                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6489                 ret = -EINVAL;
6490                 goto err_uninit;
6491         }
6492
6493         ret = -EBUSY;
6494         if (!dev->ifindex)
6495                 dev->ifindex = dev_new_index(net);
6496         else if (__dev_get_by_index(net, dev->ifindex))
6497                 goto err_uninit;
6498
6499         /* Transfer changeable features to wanted_features and enable
6500          * software offloads (GSO and GRO).
6501          */
6502         dev->hw_features |= NETIF_F_SOFT_FEATURES;
6503         dev->features |= NETIF_F_SOFT_FEATURES;
6504         dev->wanted_features = dev->features & dev->hw_features;
6505
6506         if (!(dev->flags & IFF_LOOPBACK)) {
6507                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
6508         }
6509
6510         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6511          */
6512         dev->vlan_features |= NETIF_F_HIGHDMA;
6513
6514         /* Make NETIF_F_SG inheritable to tunnel devices.
6515          */
6516         dev->hw_enc_features |= NETIF_F_SG;
6517
6518         /* Make NETIF_F_SG inheritable to MPLS.
6519          */
6520         dev->mpls_features |= NETIF_F_SG;
6521
6522         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6523         ret = notifier_to_errno(ret);
6524         if (ret)
6525                 goto err_uninit;
6526
6527         ret = netdev_register_kobject(dev);
6528         if (ret)
6529                 goto err_uninit;
6530         dev->reg_state = NETREG_REGISTERED;
6531
6532         __netdev_update_features(dev);
6533
6534         /*
6535          *      Default initial state at registry is that the
6536          *      device is present.
6537          */
6538
6539         set_bit(__LINK_STATE_PRESENT, &dev->state);
6540
6541         linkwatch_init_dev(dev);
6542
6543         dev_init_scheduler(dev);
6544         dev_hold(dev);
6545         list_netdevice(dev);
6546         add_device_randomness(dev->dev_addr, dev->addr_len);
6547
6548         /* If the device has permanent device address, driver should
6549          * set dev_addr and also addr_assign_type should be set to
6550          * NET_ADDR_PERM (default value).
6551          */
6552         if (dev->addr_assign_type == NET_ADDR_PERM)
6553                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6554
6555         /* Notify protocols, that a new device appeared. */
6556         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6557         ret = notifier_to_errno(ret);
6558         if (ret) {
6559                 rollback_registered(dev);
6560                 dev->reg_state = NETREG_UNREGISTERED;
6561         }
6562         /*
6563          *      Prevent userspace races by waiting until the network
6564          *      device is fully setup before sending notifications.
6565          */
6566         if (!dev->rtnl_link_ops ||
6567             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6568                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6569
6570 out:
6571         return ret;
6572
6573 err_uninit:
6574         if (dev->netdev_ops->ndo_uninit)
6575                 dev->netdev_ops->ndo_uninit(dev);
6576         goto out;
6577 }
6578 EXPORT_SYMBOL(register_netdevice);
6579
6580 /**
6581  *      init_dummy_netdev       - init a dummy network device for NAPI
6582  *      @dev: device to init
6583  *
6584  *      This takes a network device structure and initialize the minimum
6585  *      amount of fields so it can be used to schedule NAPI polls without
6586  *      registering a full blown interface. This is to be used by drivers
6587  *      that need to tie several hardware interfaces to a single NAPI
6588  *      poll scheduler due to HW limitations.
6589  */
6590 int init_dummy_netdev(struct net_device *dev)
6591 {
6592         /* Clear everything. Note we don't initialize spinlocks
6593          * are they aren't supposed to be taken by any of the
6594          * NAPI code and this dummy netdev is supposed to be
6595          * only ever used for NAPI polls
6596          */
6597         memset(dev, 0, sizeof(struct net_device));
6598
6599         /* make sure we BUG if trying to hit standard
6600          * register/unregister code path
6601          */
6602         dev->reg_state = NETREG_DUMMY;
6603
6604         /* NAPI wants this */
6605         INIT_LIST_HEAD(&dev->napi_list);
6606
6607         /* a dummy interface is started by default */
6608         set_bit(__LINK_STATE_PRESENT, &dev->state);
6609         set_bit(__LINK_STATE_START, &dev->state);
6610
6611         /* Note : We dont allocate pcpu_refcnt for dummy devices,
6612          * because users of this 'device' dont need to change
6613          * its refcount.
6614          */
6615
6616         return 0;
6617 }
6618 EXPORT_SYMBOL_GPL(init_dummy_netdev);
6619
6620
6621 /**
6622  *      register_netdev - register a network device
6623  *      @dev: device to register
6624  *
6625  *      Take a completed network device structure and add it to the kernel
6626  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6627  *      chain. 0 is returned on success. A negative errno code is returned
6628  *      on a failure to set up the device, or if the name is a duplicate.
6629  *
6630  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
6631  *      and expands the device name if you passed a format string to
6632  *      alloc_netdev.
6633  */
6634 int register_netdev(struct net_device *dev)
6635 {
6636         int err;
6637
6638         rtnl_lock();
6639         err = register_netdevice(dev);
6640         rtnl_unlock();
6641         return err;
6642 }
6643 EXPORT_SYMBOL(register_netdev);
6644
6645 int netdev_refcnt_read(const struct net_device *dev)
6646 {
6647         int i, refcnt = 0;
6648
6649         for_each_possible_cpu(i)
6650                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6651         return refcnt;
6652 }
6653 EXPORT_SYMBOL(netdev_refcnt_read);
6654
6655 /**
6656  * netdev_wait_allrefs - wait until all references are gone.
6657  * @dev: target net_device
6658  *
6659  * This is called when unregistering network devices.
6660  *
6661  * Any protocol or device that holds a reference should register
6662  * for netdevice notification, and cleanup and put back the
6663  * reference if they receive an UNREGISTER event.
6664  * We can get stuck here if buggy protocols don't correctly
6665  * call dev_put.
6666  */
6667 static void netdev_wait_allrefs(struct net_device *dev)
6668 {
6669         unsigned long rebroadcast_time, warning_time;
6670         int refcnt;
6671
6672         linkwatch_forget_dev(dev);
6673
6674         rebroadcast_time = warning_time = jiffies;
6675         refcnt = netdev_refcnt_read(dev);
6676
6677         while (refcnt != 0) {
6678                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6679                         rtnl_lock();
6680
6681                         /* Rebroadcast unregister notification */
6682                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6683
6684                         __rtnl_unlock();
6685                         rcu_barrier();
6686                         rtnl_lock();
6687
6688                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6689                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6690                                      &dev->state)) {
6691                                 /* We must not have linkwatch events
6692                                  * pending on unregister. If this
6693                                  * happens, we simply run the queue
6694                                  * unscheduled, resulting in a noop
6695                                  * for this device.
6696                                  */
6697                                 linkwatch_run_queue();
6698                         }
6699
6700                         __rtnl_unlock();
6701
6702                         rebroadcast_time = jiffies;
6703                 }
6704
6705                 msleep(250);
6706
6707                 refcnt = netdev_refcnt_read(dev);
6708
6709                 if (time_after(jiffies, warning_time + 10 * HZ)) {
6710                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6711                                  dev->name, refcnt);
6712                         warning_time = jiffies;
6713                 }
6714         }
6715 }
6716
6717 /* The sequence is:
6718  *
6719  *      rtnl_lock();
6720  *      ...
6721  *      register_netdevice(x1);
6722  *      register_netdevice(x2);
6723  *      ...
6724  *      unregister_netdevice(y1);
6725  *      unregister_netdevice(y2);
6726  *      ...
6727  *      rtnl_unlock();
6728  *      free_netdev(y1);
6729  *      free_netdev(y2);
6730  *
6731  * We are invoked by rtnl_unlock().
6732  * This allows us to deal with problems:
6733  * 1) We can delete sysfs objects which invoke hotplug
6734  *    without deadlocking with linkwatch via keventd.
6735  * 2) Since we run with the RTNL semaphore not held, we can sleep
6736  *    safely in order to wait for the netdev refcnt to drop to zero.
6737  *
6738  * We must not return until all unregister events added during
6739  * the interval the lock was held have been completed.
6740  */
6741 void netdev_run_todo(void)
6742 {
6743         struct list_head list;
6744
6745         /* Snapshot list, allow later requests */
6746         list_replace_init(&net_todo_list, &list);
6747
6748         __rtnl_unlock();
6749
6750
6751         /* Wait for rcu callbacks to finish before next phase */
6752         if (!list_empty(&list))
6753                 rcu_barrier();
6754
6755         while (!list_empty(&list)) {
6756                 struct net_device *dev
6757                         = list_first_entry(&list, struct net_device, todo_list);
6758                 list_del(&dev->todo_list);
6759
6760                 rtnl_lock();
6761                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6762                 __rtnl_unlock();
6763
6764                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6765                         pr_err("network todo '%s' but state %d\n",
6766                                dev->name, dev->reg_state);
6767                         dump_stack();
6768                         continue;
6769                 }
6770
6771                 dev->reg_state = NETREG_UNREGISTERED;
6772
6773                 on_each_cpu(flush_backlog, dev, 1);
6774
6775                 netdev_wait_allrefs(dev);
6776
6777                 /* paranoia */
6778                 BUG_ON(netdev_refcnt_read(dev));
6779                 BUG_ON(!list_empty(&dev->ptype_all));
6780                 BUG_ON(!list_empty(&dev->ptype_specific));
6781                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6782                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6783                 WARN_ON(dev->dn_ptr);
6784
6785                 if (dev->destructor)
6786                         dev->destructor(dev);
6787
6788                 /* Report a network device has been unregistered */
6789                 rtnl_lock();
6790                 dev_net(dev)->dev_unreg_count--;
6791                 __rtnl_unlock();
6792                 wake_up(&netdev_unregistering_wq);
6793
6794                 /* Free network device */
6795                 kobject_put(&dev->dev.kobj);
6796         }
6797 }
6798
6799 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6800  * fields in the same order, with only the type differing.
6801  */
6802 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6803                              const struct net_device_stats *netdev_stats)
6804 {
6805 #if BITS_PER_LONG == 64
6806         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6807         memcpy(stats64, netdev_stats, sizeof(*stats64));
6808 #else
6809         size_t i, n = sizeof(*stats64) / sizeof(u64);
6810         const unsigned long *src = (const unsigned long *)netdev_stats;
6811         u64 *dst = (u64 *)stats64;
6812
6813         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6814                      sizeof(*stats64) / sizeof(u64));
6815         for (i = 0; i < n; i++)
6816                 dst[i] = src[i];
6817 #endif
6818 }
6819 EXPORT_SYMBOL(netdev_stats_to_stats64);
6820
6821 /**
6822  *      dev_get_stats   - get network device statistics
6823  *      @dev: device to get statistics from
6824  *      @storage: place to store stats
6825  *
6826  *      Get network statistics from device. Return @storage.
6827  *      The device driver may provide its own method by setting
6828  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6829  *      otherwise the internal statistics structure is used.
6830  */
6831 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6832                                         struct rtnl_link_stats64 *storage)
6833 {
6834         const struct net_device_ops *ops = dev->netdev_ops;
6835
6836         if (ops->ndo_get_stats64) {
6837                 memset(storage, 0, sizeof(*storage));
6838                 ops->ndo_get_stats64(dev, storage);
6839         } else if (ops->ndo_get_stats) {
6840                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6841         } else {
6842                 netdev_stats_to_stats64(storage, &dev->stats);
6843         }
6844         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6845         storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
6846         return storage;
6847 }
6848 EXPORT_SYMBOL(dev_get_stats);
6849
6850 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
6851 {
6852         struct netdev_queue *queue = dev_ingress_queue(dev);
6853
6854 #ifdef CONFIG_NET_CLS_ACT
6855         if (queue)
6856                 return queue;
6857         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
6858         if (!queue)
6859                 return NULL;
6860         netdev_init_one_queue(dev, queue, NULL);
6861         RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
6862         queue->qdisc_sleeping = &noop_qdisc;
6863         rcu_assign_pointer(dev->ingress_queue, queue);
6864 #endif
6865         return queue;
6866 }
6867
6868 static const struct ethtool_ops default_ethtool_ops;
6869
6870 void netdev_set_default_ethtool_ops(struct net_device *dev,
6871                                     const struct ethtool_ops *ops)
6872 {
6873         if (dev->ethtool_ops == &default_ethtool_ops)
6874                 dev->ethtool_ops = ops;
6875 }
6876 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6877
6878 void netdev_freemem(struct net_device *dev)
6879 {
6880         char *addr = (char *)dev - dev->padded;
6881
6882         kvfree(addr);
6883 }
6884
6885 /**
6886  *      alloc_netdev_mqs - allocate network device
6887  *      @sizeof_priv:           size of private data to allocate space for
6888  *      @name:                  device name format string
6889  *      @name_assign_type:      origin of device name
6890  *      @setup:                 callback to initialize device
6891  *      @txqs:                  the number of TX subqueues to allocate
6892  *      @rxqs:                  the number of RX subqueues to allocate
6893  *
6894  *      Allocates a struct net_device with private data area for driver use
6895  *      and performs basic initialization.  Also allocates subqueue structs
6896  *      for each queue on the device.
6897  */
6898 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6899                 unsigned char name_assign_type,
6900                 void (*setup)(struct net_device *),
6901                 unsigned int txqs, unsigned int rxqs)
6902 {
6903         struct net_device *dev;
6904         size_t alloc_size;
6905         struct net_device *p;
6906
6907         BUG_ON(strlen(name) >= sizeof(dev->name));
6908
6909         if (txqs < 1) {
6910                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6911                 return NULL;
6912         }
6913
6914 #ifdef CONFIG_SYSFS
6915         if (rxqs < 1) {
6916                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6917                 return NULL;
6918         }
6919 #endif
6920
6921         alloc_size = sizeof(struct net_device);
6922         if (sizeof_priv) {
6923                 /* ensure 32-byte alignment of private area */
6924                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6925                 alloc_size += sizeof_priv;
6926         }
6927         /* ensure 32-byte alignment of whole construct */
6928         alloc_size += NETDEV_ALIGN - 1;
6929
6930         p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6931         if (!p)
6932                 p = vzalloc(alloc_size);
6933         if (!p)
6934                 return NULL;
6935
6936         dev = PTR_ALIGN(p, NETDEV_ALIGN);
6937         dev->padded = (char *)dev - (char *)p;
6938
6939         dev->pcpu_refcnt = alloc_percpu(int);
6940         if (!dev->pcpu_refcnt)
6941                 goto free_dev;
6942
6943         if (dev_addr_init(dev))
6944                 goto free_pcpu;
6945
6946         dev_mc_init(dev);
6947         dev_uc_init(dev);
6948
6949         dev_net_set(dev, &init_net);
6950
6951         dev->gso_max_size = GSO_MAX_SIZE;
6952         dev->gso_max_segs = GSO_MAX_SEGS;
6953         dev->gso_min_segs = 0;
6954
6955         INIT_LIST_HEAD(&dev->napi_list);
6956         INIT_LIST_HEAD(&dev->unreg_list);
6957         INIT_LIST_HEAD(&dev->close_list);
6958         INIT_LIST_HEAD(&dev->link_watch_list);
6959         INIT_LIST_HEAD(&dev->adj_list.upper);
6960         INIT_LIST_HEAD(&dev->adj_list.lower);
6961         INIT_LIST_HEAD(&dev->all_adj_list.upper);
6962         INIT_LIST_HEAD(&dev->all_adj_list.lower);
6963         INIT_LIST_HEAD(&dev->ptype_all);
6964         INIT_LIST_HEAD(&dev->ptype_specific);
6965         dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
6966         setup(dev);
6967
6968         dev->num_tx_queues = txqs;
6969         dev->real_num_tx_queues = txqs;
6970         if (netif_alloc_netdev_queues(dev))
6971                 goto free_all;
6972
6973 #ifdef CONFIG_SYSFS
6974         dev->num_rx_queues = rxqs;
6975         dev->real_num_rx_queues = rxqs;
6976         if (netif_alloc_rx_queues(dev))
6977                 goto free_all;
6978 #endif
6979
6980         strcpy(dev->name, name);
6981         dev->name_assign_type = name_assign_type;
6982         dev->group = INIT_NETDEV_GROUP;
6983         if (!dev->ethtool_ops)
6984                 dev->ethtool_ops = &default_ethtool_ops;
6985
6986         nf_hook_ingress_init(dev);
6987
6988         return dev;
6989
6990 free_all:
6991         free_netdev(dev);
6992         return NULL;
6993
6994 free_pcpu:
6995         free_percpu(dev->pcpu_refcnt);
6996 free_dev:
6997         netdev_freemem(dev);
6998         return NULL;
6999 }
7000 EXPORT_SYMBOL(alloc_netdev_mqs);
7001
7002 /**
7003  *      free_netdev - free network device
7004  *      @dev: device
7005  *
7006  *      This function does the last stage of destroying an allocated device
7007  *      interface. The reference to the device object is released.
7008  *      If this is the last reference then it will be freed.
7009  */
7010 void free_netdev(struct net_device *dev)
7011 {
7012         struct napi_struct *p, *n;
7013
7014         netif_free_tx_queues(dev);
7015 #ifdef CONFIG_SYSFS
7016         kvfree(dev->_rx);
7017 #endif
7018
7019         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7020
7021         /* Flush device addresses */
7022         dev_addr_flush(dev);
7023
7024         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7025                 netif_napi_del(p);
7026
7027         free_percpu(dev->pcpu_refcnt);
7028         dev->pcpu_refcnt = NULL;
7029
7030         /*  Compatibility with error handling in drivers */
7031         if (dev->reg_state == NETREG_UNINITIALIZED) {
7032                 netdev_freemem(dev);
7033                 return;
7034         }
7035
7036         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7037         dev->reg_state = NETREG_RELEASED;
7038
7039         /* will free via device release */
7040         put_device(&dev->dev);
7041 }
7042 EXPORT_SYMBOL(free_netdev);
7043
7044 /**
7045  *      synchronize_net -  Synchronize with packet receive processing
7046  *
7047  *      Wait for packets currently being received to be done.
7048  *      Does not block later packets from starting.
7049  */
7050 void synchronize_net(void)
7051 {
7052         might_sleep();
7053         if (rtnl_is_locked())
7054                 synchronize_rcu_expedited();
7055         else
7056                 synchronize_rcu();
7057 }
7058 EXPORT_SYMBOL(synchronize_net);
7059
7060 /**
7061  *      unregister_netdevice_queue - remove device from the kernel
7062  *      @dev: device
7063  *      @head: list
7064  *
7065  *      This function shuts down a device interface and removes it
7066  *      from the kernel tables.
7067  *      If head not NULL, device is queued to be unregistered later.
7068  *
7069  *      Callers must hold the rtnl semaphore.  You may want
7070  *      unregister_netdev() instead of this.
7071  */
7072
7073 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7074 {
7075         ASSERT_RTNL();
7076
7077         if (head) {
7078                 list_move_tail(&dev->unreg_list, head);
7079         } else {
7080                 rollback_registered(dev);
7081                 /* Finish processing unregister after unlock */
7082                 net_set_todo(dev);
7083         }
7084 }
7085 EXPORT_SYMBOL(unregister_netdevice_queue);
7086
7087 /**
7088  *      unregister_netdevice_many - unregister many devices
7089  *      @head: list of devices
7090  *
7091  *  Note: As most callers use a stack allocated list_head,
7092  *  we force a list_del() to make sure stack wont be corrupted later.
7093  */
7094 void unregister_netdevice_many(struct list_head *head)
7095 {
7096         struct net_device *dev;
7097
7098         if (!list_empty(head)) {
7099                 rollback_registered_many(head);
7100                 list_for_each_entry(dev, head, unreg_list)
7101                         net_set_todo(dev);
7102                 list_del(head);
7103         }
7104 }
7105 EXPORT_SYMBOL(unregister_netdevice_many);
7106
7107 /**
7108  *      unregister_netdev - remove device from the kernel
7109  *      @dev: device
7110  *
7111  *      This function shuts down a device interface and removes it
7112  *      from the kernel tables.
7113  *
7114  *      This is just a wrapper for unregister_netdevice that takes
7115  *      the rtnl semaphore.  In general you want to use this and not
7116  *      unregister_netdevice.
7117  */
7118 void unregister_netdev(struct net_device *dev)
7119 {
7120         rtnl_lock();
7121         unregister_netdevice(dev);
7122         rtnl_unlock();
7123 }
7124 EXPORT_SYMBOL(unregister_netdev);
7125
7126 /**
7127  *      dev_change_net_namespace - move device to different nethost namespace
7128  *      @dev: device
7129  *      @net: network namespace
7130  *      @pat: If not NULL name pattern to try if the current device name
7131  *            is already taken in the destination network namespace.
7132  *
7133  *      This function shuts down a device interface and moves it
7134  *      to a new network namespace. On success 0 is returned, on
7135  *      a failure a netagive errno code is returned.
7136  *
7137  *      Callers must hold the rtnl semaphore.
7138  */
7139
7140 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7141 {
7142         int err;
7143
7144         ASSERT_RTNL();
7145
7146         /* Don't allow namespace local devices to be moved. */
7147         err = -EINVAL;
7148         if (dev->features & NETIF_F_NETNS_LOCAL)
7149                 goto out;
7150
7151         /* Ensure the device has been registrered */
7152         if (dev->reg_state != NETREG_REGISTERED)
7153                 goto out;
7154
7155         /* Get out if there is nothing todo */
7156         err = 0;
7157         if (net_eq(dev_net(dev), net))
7158                 goto out;
7159
7160         /* Pick the destination device name, and ensure
7161          * we can use it in the destination network namespace.
7162          */
7163         err = -EEXIST;
7164         if (__dev_get_by_name(net, dev->name)) {
7165                 /* We get here if we can't use the current device name */
7166                 if (!pat)
7167                         goto out;
7168                 if (dev_get_valid_name(net, dev, pat) < 0)
7169                         goto out;
7170         }
7171
7172         /*
7173          * And now a mini version of register_netdevice unregister_netdevice.
7174          */
7175
7176         /* If device is running close it first. */
7177         dev_close(dev);
7178
7179         /* And unlink it from device chain */
7180         err = -ENODEV;
7181         unlist_netdevice(dev);
7182
7183         synchronize_net();
7184
7185         /* Shutdown queueing discipline. */
7186         dev_shutdown(dev);
7187
7188         /* Notify protocols, that we are about to destroy
7189            this device. They should clean all the things.
7190
7191            Note that dev->reg_state stays at NETREG_REGISTERED.
7192            This is wanted because this way 8021q and macvlan know
7193            the device is just moving and can keep their slaves up.
7194         */
7195         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7196         rcu_barrier();
7197         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7198         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7199
7200         /*
7201          *      Flush the unicast and multicast chains
7202          */
7203         dev_uc_flush(dev);
7204         dev_mc_flush(dev);
7205
7206         /* Send a netdev-removed uevent to the old namespace */
7207         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7208         netdev_adjacent_del_links(dev);
7209
7210         /* Actually switch the network namespace */
7211         dev_net_set(dev, net);
7212
7213         /* If there is an ifindex conflict assign a new one */
7214         if (__dev_get_by_index(net, dev->ifindex))
7215                 dev->ifindex = dev_new_index(net);
7216
7217         /* Send a netdev-add uevent to the new namespace */
7218         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7219         netdev_adjacent_add_links(dev);
7220
7221         /* Fixup kobjects */
7222         err = device_rename(&dev->dev, dev->name);
7223         WARN_ON(err);
7224
7225         /* Add the device back in the hashes */
7226         list_netdevice(dev);
7227
7228         /* Notify protocols, that a new device appeared. */
7229         call_netdevice_notifiers(NETDEV_REGISTER, dev);
7230
7231         /*
7232          *      Prevent userspace races by waiting until the network
7233          *      device is fully setup before sending notifications.
7234          */
7235         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7236
7237         synchronize_net();
7238         err = 0;
7239 out:
7240         return err;
7241 }
7242 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
7243
7244 static int dev_cpu_callback(struct notifier_block *nfb,
7245                             unsigned long action,
7246                             void *ocpu)
7247 {
7248         struct sk_buff **list_skb;
7249         struct sk_buff *skb;
7250         unsigned int cpu, oldcpu = (unsigned long)ocpu;
7251         struct softnet_data *sd, *oldsd;
7252
7253         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7254                 return NOTIFY_OK;
7255
7256         local_irq_disable();
7257         cpu = smp_processor_id();
7258         sd = &per_cpu(softnet_data, cpu);
7259         oldsd = &per_cpu(softnet_data, oldcpu);
7260
7261         /* Find end of our completion_queue. */
7262         list_skb = &sd->completion_queue;
7263         while (*list_skb)
7264                 list_skb = &(*list_skb)->next;
7265         /* Append completion queue from offline CPU. */
7266         *list_skb = oldsd->completion_queue;
7267         oldsd->completion_queue = NULL;
7268
7269         /* Append output queue from offline CPU. */
7270         if (oldsd->output_queue) {
7271                 *sd->output_queue_tailp = oldsd->output_queue;
7272                 sd->output_queue_tailp = oldsd->output_queue_tailp;
7273                 oldsd->output_queue = NULL;
7274                 oldsd->output_queue_tailp = &oldsd->output_queue;
7275         }
7276         /* Append NAPI poll list from offline CPU, with one exception :
7277          * process_backlog() must be called by cpu owning percpu backlog.
7278          * We properly handle process_queue & input_pkt_queue later.
7279          */
7280         while (!list_empty(&oldsd->poll_list)) {
7281                 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7282                                                             struct napi_struct,
7283                                                             poll_list);
7284
7285                 list_del_init(&napi->poll_list);
7286                 if (napi->poll == process_backlog)
7287                         napi->state = 0;
7288                 else
7289                         ____napi_schedule(sd, napi);
7290         }
7291
7292         raise_softirq_irqoff(NET_TX_SOFTIRQ);
7293         local_irq_enable();
7294
7295         /* Process offline CPU's input_pkt_queue */
7296         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7297                 netif_rx_ni(skb);
7298                 input_queue_head_incr(oldsd);
7299         }
7300         while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
7301                 netif_rx_ni(skb);
7302                 input_queue_head_incr(oldsd);
7303         }
7304
7305         return NOTIFY_OK;
7306 }
7307
7308
7309 /**
7310  *      netdev_increment_features - increment feature set by one
7311  *      @all: current feature set
7312  *      @one: new feature set
7313  *      @mask: mask feature set
7314  *
7315  *      Computes a new feature set after adding a device with feature set
7316  *      @one to the master device with current feature set @all.  Will not
7317  *      enable anything that is off in @mask. Returns the new feature set.
7318  */
7319 netdev_features_t netdev_increment_features(netdev_features_t all,
7320         netdev_features_t one, netdev_features_t mask)
7321 {
7322         if (mask & NETIF_F_GEN_CSUM)
7323                 mask |= NETIF_F_ALL_CSUM;
7324         mask |= NETIF_F_VLAN_CHALLENGED;
7325
7326         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
7327         all &= one | ~NETIF_F_ALL_FOR_ALL;
7328
7329         /* If one device supports hw checksumming, set for all. */
7330         if (all & NETIF_F_GEN_CSUM)
7331                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7332
7333         return all;
7334 }
7335 EXPORT_SYMBOL(netdev_increment_features);
7336
7337 static struct hlist_head * __net_init netdev_create_hash(void)
7338 {
7339         int i;
7340         struct hlist_head *hash;
7341
7342         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7343         if (hash != NULL)
7344                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
7345                         INIT_HLIST_HEAD(&hash[i]);
7346
7347         return hash;
7348 }
7349
7350 /* Initialize per network namespace state */
7351 static int __net_init netdev_init(struct net *net)
7352 {
7353         if (net != &init_net)
7354                 INIT_LIST_HEAD(&net->dev_base_head);
7355
7356         net->dev_name_head = netdev_create_hash();
7357         if (net->dev_name_head == NULL)
7358                 goto err_name;
7359
7360         net->dev_index_head = netdev_create_hash();
7361         if (net->dev_index_head == NULL)
7362                 goto err_idx;
7363
7364         return 0;
7365
7366 err_idx:
7367         kfree(net->dev_name_head);
7368 err_name:
7369         return -ENOMEM;
7370 }
7371
7372 /**
7373  *      netdev_drivername - network driver for the device
7374  *      @dev: network device
7375  *
7376  *      Determine network driver for device.
7377  */
7378 const char *netdev_drivername(const struct net_device *dev)
7379 {
7380         const struct device_driver *driver;
7381         const struct device *parent;
7382         const char *empty = "";
7383
7384         parent = dev->dev.parent;
7385         if (!parent)
7386                 return empty;
7387
7388         driver = parent->driver;
7389         if (driver && driver->name)
7390                 return driver->name;
7391         return empty;
7392 }
7393
7394 static void __netdev_printk(const char *level, const struct net_device *dev,
7395                             struct va_format *vaf)
7396 {
7397         if (dev && dev->dev.parent) {
7398                 dev_printk_emit(level[1] - '0',
7399                                 dev->dev.parent,
7400                                 "%s %s %s%s: %pV",
7401                                 dev_driver_string(dev->dev.parent),
7402                                 dev_name(dev->dev.parent),
7403                                 netdev_name(dev), netdev_reg_state(dev),
7404                                 vaf);
7405         } else if (dev) {
7406                 printk("%s%s%s: %pV",
7407                        level, netdev_name(dev), netdev_reg_state(dev), vaf);
7408         } else {
7409                 printk("%s(NULL net_device): %pV", level, vaf);
7410         }
7411 }
7412
7413 void netdev_printk(const char *level, const struct net_device *dev,
7414                    const char *format, ...)
7415 {
7416         struct va_format vaf;
7417         va_list args;
7418
7419         va_start(args, format);
7420
7421         vaf.fmt = format;
7422         vaf.va = &args;
7423
7424         __netdev_printk(level, dev, &vaf);
7425
7426         va_end(args);
7427 }
7428 EXPORT_SYMBOL(netdev_printk);
7429
7430 #define define_netdev_printk_level(func, level)                 \
7431 void func(const struct net_device *dev, const char *fmt, ...)   \
7432 {                                                               \
7433         struct va_format vaf;                                   \
7434         va_list args;                                           \
7435                                                                 \
7436         va_start(args, fmt);                                    \
7437                                                                 \
7438         vaf.fmt = fmt;                                          \
7439         vaf.va = &args;                                         \
7440                                                                 \
7441         __netdev_printk(level, dev, &vaf);                      \
7442                                                                 \
7443         va_end(args);                                           \
7444 }                                                               \
7445 EXPORT_SYMBOL(func);
7446
7447 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7448 define_netdev_printk_level(netdev_alert, KERN_ALERT);
7449 define_netdev_printk_level(netdev_crit, KERN_CRIT);
7450 define_netdev_printk_level(netdev_err, KERN_ERR);
7451 define_netdev_printk_level(netdev_warn, KERN_WARNING);
7452 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7453 define_netdev_printk_level(netdev_info, KERN_INFO);
7454
7455 static void __net_exit netdev_exit(struct net *net)
7456 {
7457         kfree(net->dev_name_head);
7458         kfree(net->dev_index_head);
7459 }
7460
7461 static struct pernet_operations __net_initdata netdev_net_ops = {
7462         .init = netdev_init,
7463         .exit = netdev_exit,
7464 };
7465
7466 static void __net_exit default_device_exit(struct net *net)
7467 {
7468         struct net_device *dev, *aux;
7469         /*
7470          * Push all migratable network devices back to the
7471          * initial network namespace
7472          */
7473         rtnl_lock();
7474         for_each_netdev_safe(net, dev, aux) {
7475                 int err;
7476                 char fb_name[IFNAMSIZ];
7477
7478                 /* Ignore unmoveable devices (i.e. loopback) */
7479                 if (dev->features & NETIF_F_NETNS_LOCAL)
7480                         continue;
7481
7482                 /* Leave virtual devices for the generic cleanup */
7483                 if (dev->rtnl_link_ops)
7484                         continue;
7485
7486                 /* Push remaining network devices to init_net */
7487                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7488                 err = dev_change_net_namespace(dev, &init_net, fb_name);
7489                 if (err) {
7490                         pr_emerg("%s: failed to move %s to init_net: %d\n",
7491                                  __func__, dev->name, err);
7492                         BUG();
7493                 }
7494         }
7495         rtnl_unlock();
7496 }
7497
7498 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7499 {
7500         /* Return with the rtnl_lock held when there are no network
7501          * devices unregistering in any network namespace in net_list.
7502          */
7503         struct net *net;
7504         bool unregistering;
7505         DEFINE_WAIT_FUNC(wait, woken_wake_function);
7506
7507         add_wait_queue(&netdev_unregistering_wq, &wait);
7508         for (;;) {
7509                 unregistering = false;
7510                 rtnl_lock();
7511                 list_for_each_entry(net, net_list, exit_list) {
7512                         if (net->dev_unreg_count > 0) {
7513                                 unregistering = true;
7514                                 break;
7515                         }
7516                 }
7517                 if (!unregistering)
7518                         break;
7519                 __rtnl_unlock();
7520
7521                 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
7522         }
7523         remove_wait_queue(&netdev_unregistering_wq, &wait);
7524 }
7525
7526 static void __net_exit default_device_exit_batch(struct list_head *net_list)
7527 {
7528         /* At exit all network devices most be removed from a network
7529          * namespace.  Do this in the reverse order of registration.
7530          * Do this across as many network namespaces as possible to
7531          * improve batching efficiency.
7532          */
7533         struct net_device *dev;
7534         struct net *net;
7535         LIST_HEAD(dev_kill_list);
7536
7537         /* To prevent network device cleanup code from dereferencing
7538          * loopback devices or network devices that have been freed
7539          * wait here for all pending unregistrations to complete,
7540          * before unregistring the loopback device and allowing the
7541          * network namespace be freed.
7542          *
7543          * The netdev todo list containing all network devices
7544          * unregistrations that happen in default_device_exit_batch
7545          * will run in the rtnl_unlock() at the end of
7546          * default_device_exit_batch.
7547          */
7548         rtnl_lock_unregistering(net_list);
7549         list_for_each_entry(net, net_list, exit_list) {
7550                 for_each_netdev_reverse(net, dev) {
7551                         if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
7552                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7553                         else
7554                                 unregister_netdevice_queue(dev, &dev_kill_list);
7555                 }
7556         }
7557         unregister_netdevice_many(&dev_kill_list);
7558         rtnl_unlock();
7559 }
7560
7561 static struct pernet_operations __net_initdata default_device_ops = {
7562         .exit = default_device_exit,
7563         .exit_batch = default_device_exit_batch,
7564 };
7565
7566 /*
7567  *      Initialize the DEV module. At boot time this walks the device list and
7568  *      unhooks any devices that fail to initialise (normally hardware not
7569  *      present) and leaves us with a valid list of present and active devices.
7570  *
7571  */
7572
7573 /*
7574  *       This is called single threaded during boot, so no need
7575  *       to take the rtnl semaphore.
7576  */
7577 static int __init net_dev_init(void)
7578 {
7579         int i, rc = -ENOMEM;
7580
7581         BUG_ON(!dev_boot_phase);
7582
7583         if (dev_proc_init())
7584                 goto out;
7585
7586         if (netdev_kobject_init())
7587                 goto out;
7588
7589         INIT_LIST_HEAD(&ptype_all);
7590         for (i = 0; i < PTYPE_HASH_SIZE; i++)
7591                 INIT_LIST_HEAD(&ptype_base[i]);
7592
7593         INIT_LIST_HEAD(&offload_base);
7594
7595         if (register_pernet_subsys(&netdev_net_ops))
7596                 goto out;
7597
7598         /*
7599          *      Initialise the packet receive queues.
7600          */
7601
7602         for_each_possible_cpu(i) {
7603                 struct softnet_data *sd = &per_cpu(softnet_data, i);
7604
7605                 skb_queue_head_init(&sd->input_pkt_queue);
7606                 skb_queue_head_init(&sd->process_queue);
7607                 INIT_LIST_HEAD(&sd->poll_list);
7608                 sd->output_queue_tailp = &sd->output_queue;
7609 #ifdef CONFIG_RPS
7610                 sd->csd.func = rps_trigger_softirq;
7611                 sd->csd.info = sd;
7612                 sd->cpu = i;
7613 #endif
7614
7615                 sd->backlog.poll = process_backlog;
7616                 sd->backlog.weight = weight_p;
7617         }
7618
7619         dev_boot_phase = 0;
7620
7621         /* The loopback device is special if any other network devices
7622          * is present in a network namespace the loopback device must
7623          * be present. Since we now dynamically allocate and free the
7624          * loopback device ensure this invariant is maintained by
7625          * keeping the loopback device as the first device on the
7626          * list of network devices.  Ensuring the loopback devices
7627          * is the first device that appears and the last network device
7628          * that disappears.
7629          */
7630         if (register_pernet_device(&loopback_net_ops))
7631                 goto out;
7632
7633         if (register_pernet_device(&default_device_ops))
7634                 goto out;
7635
7636         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7637         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7638
7639         hotcpu_notifier(dev_cpu_callback, 0);
7640         dst_init();
7641         rc = 0;
7642 out:
7643         return rc;
7644 }
7645
7646 subsys_initcall(net_dev_init);