net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <asm/system.h>
  77 #include <linux/bitops.h>
  78 #include <linux/capability.h>
  79 #include <linux/cpu.h>
  80 #include <linux/types.h>
  81 #include <linux/kernel.h>
  82 #include <linux/hash.h>
  83 #include <linux/slab.h>
  84 #include <linux/sched.h>
  85 #include <linux/mutex.h>
  86 #include <linux/string.h>
  87 #include <linux/mm.h>
  88 #include <linux/socket.h>
  89 #include <linux/sockios.h>
  90 #include <linux/errno.h>
  91 #include <linux/interrupt.h>
  92 #include <linux/if_ether.h>
  93 #include <linux/netdevice.h>
  94 #include <linux/etherdevice.h>
  95 #include <linux/ethtool.h>
  96 #include <linux/notifier.h>
  97 #include <linux/skbuff.h>
  98 #include <net/net_namespace.h>
  99 #include <net/sock.h>
 100 #include <linux/rtnetlink.h>
 101 #include <linux/proc_fs.h>
 102 #include <linux/seq_file.h>
 103 #include <linux/stat.h>
 104 #include <net/dst.h>
 105 #include <net/pkt_sched.h>
 106 #include <net/checksum.h>
 107 #include <net/xfrm.h>
 108 #include <linux/highmem.h>
 109 #include <linux/init.h>
 110 #include <linux/kmod.h>
 111 #include <linux/module.h>
 112 #include <linux/netpoll.h>
 113 #include <linux/rcupdate.h>
 114 #include <linux/delay.h>
 115 #include <net/wext.h>
 116 #include <net/iw_handler.h>
 117 #include <asm/current.h>
 118 #include <linux/audit.h>
 119 #include <linux/dmaengine.h>
 120 #include <linux/err.h>
 121 #include <linux/ctype.h>
 122 #include <linux/if_arp.h>
 123 #include <linux/if_vlan.h>
 124 #include <linux/ip.h>
 125 #include <net/ip.h>
 126 #include <linux/ipv6.h>
 127 #include <linux/in.h>
 128 #include <linux/jhash.h>
 129 #include <linux/random.h>
 130 #include <trace/events/napi.h>
 131 #include <trace/events/net.h>
 132 #include <trace/events/skb.h>
 133 #include <linux/pci.h>
 134 #include <linux/inetdevice.h>
 135 #include <linux/cpu_rmap.h>
 136 #include <linux/net_tstamp.h>
 137 #include <linux/static_key.h>
 138 #include <net/flow_keys.h>
 139
 140 #include "net-sysfs.h"
 141
/*
 * Fixed limit used by the GRO code (presumably the number of skbs held
 * at once -- confirm at the use site).
 * Instead of increasing this, you should create a hash table.
 */
#define MAX_GRO_SKBS 8

/*
 * MAX_HEADER plus 128 bytes.
 * This should be increased if a protocol with a bigger head is added.
 */
#define GRO_MAX_HEAD (MAX_HEADER + 128)
 147
 148 /*
 149  *      The list of packet types we will receive (as opposed to discard)
 150  *      and the routines to invoke.
 151  *
 152  *      Why 16. Because with 16 the only overlap we get on a hash of the
 153  *      low nibble of the protocol value is RARP/SNAP/X.25.
 154  *
 155  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 156  *             sure which should go first, but I bet it won't make much
 157  *             difference if we are running VLANs.  The good news is that
 158  *             this protocol won't be in the list unless compiled in, so
 159  *             the average user (w/out VLANs) will not be adversely affected.
 160  *             --BLG
 161  *
 162  *              0800    IP
 163  *              8100    802.1Q VLAN
 164  *              0001    802.3
 165  *              0002    AX.25
 166  *              0004    802.2
 167  *              8035    RARP
 168  *              0005    SNAP
 169  *              0805    X.25
 170  *              0806    ARP
 171  *              8137    IPX
 172  *              0009    Localtalk
 173  *              86DD    IPv6
 174  */
 175
/*
 * Number of buckets in ptype_base.  PTYPE_HASH_MASK is used as a bit mask
 * on the protocol value (see ptype_head()), so the size has to stay a
 * power of two.
 */
#define PTYPE_HASH_SIZE (16)
#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)

/* Taken by dev_add_pack() and __dev_remove_pack() around list updates. */
static DEFINE_SPINLOCK(ptype_lock);
/* Handler lists for specific protocols, bucket chosen by ptype_head(). */
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
/* Handlers registered for ETH_P_ALL. */
static struct list_head ptype_all __read_mostly;        /* Taps */
 182
/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock().
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * For example usages see register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);
 204
 205 static inline void dev_base_seq_inc(struct net *net)
 206 {
 207         while (++net->dev_base_seq == 0);
 208 }
 209
 210 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 211 {
 212         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 213         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 214 }
 215
 216 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 217 {
 218         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 219 }
 220
 221 static inline void rps_lock(struct softnet_data *sd)
 222 {
 223 #ifdef CONFIG_RPS
 224         spin_lock(&sd->input_pkt_queue.lock);
 225 #endif
 226 }
 227
 228 static inline void rps_unlock(struct softnet_data *sd)
 229 {
 230 #ifdef CONFIG_RPS
 231         spin_unlock(&sd->input_pkt_queue.lock);
 232 #endif
 233 }
 234
 235 /* Device list insertion */
 236 static int list_netdevice(struct net_device *dev)
 237 {
 238         struct net *net = dev_net(dev);
 239
 240         ASSERT_RTNL();
 241
 242         write_lock_bh(&dev_base_lock);
 243         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 244         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 245         hlist_add_head_rcu(&dev->index_hlist,
 246                            dev_index_hash(net, dev->ifindex));
 247         write_unlock_bh(&dev_base_lock);
 248
 249         dev_base_seq_inc(net);
 250
 251         return 0;
 252 }
 253
 254 /* Device list removal
 255  * caller must respect a RCU grace period before freeing/reusing dev
 256  */
 257 static void unlist_netdevice(struct net_device *dev)
 258 {
 259         ASSERT_RTNL();
 260
 261         /* Unlink dev from the device chain */
 262         write_lock_bh(&dev_base_lock);
 263         list_del_rcu(&dev->dev_list);
 264         hlist_del_rcu(&dev->name_hlist);
 265         hlist_del_rcu(&dev->index_hlist);
 266         write_unlock_bh(&dev_base_lock);
 267
 268         dev_base_seq_inc(dev_net(dev));
 269 }
 270
/*
 *      Our notifier list.
 *
 *      Declared as a raw notifier head; NOTE(review): raw chains do no
 *      locking of their own, so callers are presumably serialised
 *      externally (likely by RTNL) -- confirm at the call sites.
 */

static RAW_NOTIFIER_HEAD(netdev_chain);
 276
/*
 *      Device drivers call our routines to queue packets here. We empty the
 *      queue in the local softnet handler.
 *
 *      One instance exists per CPU; the symbol is exported.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);
 284
 285 #ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 *
 * netdev_lock_type[] and netdev_lock_name[] are parallel tables: the
 * entry at a given index in one describes the same device type as the
 * entry at that index in the other, so they must be kept in the same
 * order and length.  The final entry (ARPHRD_NONE / "_xmit_NONE") is the
 * one netdev_lock_pos() falls back to for types not listed here.
 */
static const unsigned short netdev_lock_type[] =
        {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
         ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
         ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
         ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
         ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
         ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
         ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
         ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
         ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
         ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
         ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
         ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
         ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
         ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
         ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
         ARPHRD_VOID, ARPHRD_NONE};

/* Lock class names, index-for-index with netdev_lock_type[]. */
static const char *const netdev_lock_name[] =
        {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
         "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
         "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
         "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
         "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
         "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
         "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
         "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
         "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
         "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
         "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
         "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
         "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
         "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
         "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
         "_xmit_VOID", "_xmit_NONE"};

/* One lockdep key per table entry, for the xmit lock and the address-list lock. */
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 328
 329 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 330 {
 331         int i;
 332
 333         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 334                 if (netdev_lock_type[i] == dev_type)
 335                         return i;
 336         /* the last key is used by default */
 337         return ARRAY_SIZE(netdev_lock_type) - 1;
 338 }
 339
 340 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 341                                                  unsigned short dev_type)
 342 {
 343         int i;
 344
 345         i = netdev_lock_pos(dev_type);
 346         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 347                                    netdev_lock_name[i]);
 348 }
 349
 350 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 351 {
 352         int i;
 353
 354         i = netdev_lock_pos(dev->type);
 355         lockdep_set_class_and_name(&dev->addr_list_lock,
 356                                    &netdev_addr_lock_key[i],
 357                                    netdev_lock_name[i]);
 358 }
 359 #else
/* Without CONFIG_LOCKDEP there is no lock class to set: empty stub. */
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
                                                 unsigned short dev_type)
{
}
/* Without CONFIG_LOCKDEP there is no lock class to set: empty stub. */
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
 367 #endif
 368
 369 /*******************************************************************************
 370
 371                 Protocol management and registration routines
 372
 373 *******************************************************************************/
 374
/*
 *      Add a protocol ID to the list. Now that the input handler is
 *      smarter we can dispense with all the messy stuff that used to be
 *      here.
 *
 *      BEWARE!!! Protocol handlers that mangle input packets
 *      MUST BE last in their hash buckets, and the check of protocol
 *      handlers MUST start from the promiscuous ptype_all chain in net_bh.
 *      This is true now; do not change it.
 *      Explanation: if a protocol handler that mangles packets is
 *      first on the list, it is not able to sense that the packet
 *      is cloned and should be copied-on-write, so it will
 *      change it and subsequent readers will get a broken packet.
 *                                                      --ANK (980803)
 */
 390
 391 static inline struct list_head *ptype_head(const struct packet_type *pt)
 392 {
 393         if (pt->type == htons(ETH_P_ALL))
 394                 return &ptype_all;
 395         else
 396                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 397 }
 398
 399 /**
 400  *      dev_add_pack - add packet handler
 401  *      @pt: packet type declaration
 402  *
 403  *      Add a protocol handler to the networking stack. The passed &packet_type
 404  *      is linked into kernel lists and may not be freed until it has been
 405  *      removed from the kernel lists.
 406  *
 407  *      This call does not sleep therefore it can not
 408  *      guarantee all CPU's that are in middle of receiving packets
 409  *      will see the new packet type (until the next received packet).
 410  */
 411
 412 void dev_add_pack(struct packet_type *pt)
 413 {
 414         struct list_head *head = ptype_head(pt);
 415
 416         spin_lock(&ptype_lock);
 417         list_add_rcu(&pt->list, head);
 418         spin_unlock(&ptype_lock);
 419 }
 420 EXPORT_SYMBOL(dev_add_pack);
 421
 422 /**
 423  *      __dev_remove_pack        - remove packet handler
 424  *      @pt: packet type declaration
 425  *
 426  *      Remove a protocol handler that was previously added to the kernel
 427  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 428  *      from the kernel lists and can be freed or reused once this function
 429  *      returns.
 430  *
 431  *      The packet type might still be in use by receivers
 432  *      and must not be freed until after all the CPU's have gone
 433  *      through a quiescent state.
 434  */
 435 void __dev_remove_pack(struct packet_type *pt)
 436 {
 437         struct list_head *head = ptype_head(pt);
 438         struct packet_type *pt1;
 439
 440         spin_lock(&ptype_lock);
 441
 442         list_for_each_entry(pt1, head, list) {
 443                 if (pt == pt1) {
 444                         list_del_rcu(&pt->list);
 445                         goto out;
 446                 }
 447         }
 448
 449         pr_warn("dev_remove_pack: %p not found\n", pt);
 450 out:
 451         spin_unlock(&ptype_lock);
 452 }
 453 EXPORT_SYMBOL(__dev_remove_pack);
 454
 455 /**
 456  *      dev_remove_pack  - remove packet handler
 457  *      @pt: packet type declaration
 458  *
 459  *      Remove a protocol handler that was previously added to the kernel
 460  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 461  *      from the kernel lists and can be freed or reused once this function
 462  *      returns.
 463  *
 464  *      This call sleeps to guarantee that no CPU is looking at the packet
 465  *      type after return.
 466  */
 467 void dev_remove_pack(struct packet_type *pt)
 468 {
 469         __dev_remove_pack(pt);
 470
 471         synchronize_net();
 472 }
 473 EXPORT_SYMBOL(dev_remove_pack);
 474
 475 /******************************************************************************
 476
 477                       Device Boot-time Settings Routines
 478
 479 *******************************************************************************/
 480
/*
 * Boot time configuration table.
 * A slot whose name starts with '\0' or ' ' is treated as unused by
 * netdev_boot_setup_add() and netdev_boot_setup_check().
 */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 483
 484 /**
 485  *      netdev_boot_setup_add   - add new setup entry
 486  *      @name: name of the device
 487  *      @map: configured settings for the device
 488  *
 489  *      Adds new setup entry to the dev_boot_setup list.  The function
 490  *      returns 0 on error and 1 on success.  This is a generic routine to
 491  *      all netdevices.
 492  */
 493 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 494 {
 495         struct netdev_boot_setup *s;
 496         int i;
 497
 498         s = dev_boot_setup;
 499         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 500                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 501                         memset(s[i].name, 0, sizeof(s[i].name));
 502                         strlcpy(s[i].name, name, IFNAMSIZ);
 503                         memcpy(&s[i].map, map, sizeof(s[i].map));
 504                         break;
 505                 }
 506         }
 507
 508         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 509 }
 510
 511 /**
 512  *      netdev_boot_setup_check - check boot time settings
 513  *      @dev: the netdevice
 514  *
 515  *      Check boot time settings for the device.
 516  *      The found settings are set for the device to be used
 517  *      later in the device probing.
 518  *      Returns 0 if no settings found, 1 if they are.
 519  */
 520 int netdev_boot_setup_check(struct net_device *dev)
 521 {
 522         struct netdev_boot_setup *s = dev_boot_setup;
 523         int i;
 524
 525         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 526                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 527                     !strcmp(dev->name, s[i].name)) {
 528                         dev->irq        = s[i].map.irq;
 529                         dev->base_addr  = s[i].map.base_addr;
 530                         dev->mem_start  = s[i].map.mem_start;
 531                         dev->mem_end    = s[i].map.mem_end;
 532                         return 1;
 533                 }
 534         }
 535         return 0;
 536 }
 537 EXPORT_SYMBOL(netdev_boot_setup_check);
 538
 539
 540 /**
 541  *      netdev_boot_base        - get address from boot time settings
 542  *      @prefix: prefix for network device
 543  *      @unit: id for network device
 544  *
 545  *      Check boot time settings for the base address of device.
 546  *      The found settings are set for the device to be used
 547  *      later in the device probing.
 548  *      Returns 0 if no settings found.
 549  */
 550 unsigned long netdev_boot_base(const char *prefix, int unit)
 551 {
 552         const struct netdev_boot_setup *s = dev_boot_setup;
 553         char name[IFNAMSIZ];
 554         int i;
 555
 556         sprintf(name, "%s%d", prefix, unit);
 557
 558         /*
 559          * If device already registered then return base of 1
 560          * to indicate not to probe for this interface
 561          */
 562         if (__dev_get_by_name(&init_net, name))
 563                 return 1;
 564
 565         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 566                 if (!strcmp(name, s[i].name))
 567                         return s[i].map.base_addr;
 568         return 0;
 569 }
 570
 571 /*
 572  * Saves at boot time configured settings for any netdevice.
 573  */
 574 int __init netdev_boot_setup(char *str)
 575 {
 576         int ints[5];
 577         struct ifmap map;
 578
 579         str = get_options(str, ARRAY_SIZE(ints), ints);
 580         if (!str || !*str)
 581                 return 0;
 582
 583         /* Save settings */
 584         memset(&map, 0, sizeof(map));
 585         if (ints[0] > 0)
 586                 map.irq = ints[1];
 587         if (ints[0] > 1)
 588                 map.base_addr = ints[2];
 589         if (ints[0] > 2)
 590                 map.mem_start = ints[3];
 591         if (ints[0] > 3)
 592                 map.mem_end = ints[4];
 593
 594         /* Add new entry to the list */
 595         return netdev_boot_setup_add(str, &map);
 596 }
 597
 598 __setup("netdev=", netdev_boot_setup);
 599
 600 /*******************************************************************************
 601
 602                             Device Interface Subroutines
 603
 604 *******************************************************************************/
 605
 606 /**
 607  *      __dev_get_by_name       - find a device by its name
 608  *      @net: the applicable net namespace
 609  *      @name: name to find
 610  *
 611  *      Find an interface by name. Must be called under RTNL semaphore
 612  *      or @dev_base_lock. If the name is found a pointer to the device
 613  *      is returned. If the name is not found then %NULL is returned. The
 614  *      reference counters are not incremented so the caller must be
 615  *      careful with locks.
 616  */
 617
 618 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 619 {
 620         struct hlist_node *p;
 621         struct net_device *dev;
 622         struct hlist_head *head = dev_name_hash(net, name);
 623
 624         hlist_for_each_entry(dev, p, head, name_hlist)
 625                 if (!strncmp(dev->name, name, IFNAMSIZ))
 626                         return dev;
 627
 628         return NULL;
 629 }
 630 EXPORT_SYMBOL(__dev_get_by_name);
 631
 632 /**
 633  *      dev_get_by_name_rcu     - find a device by its name
 634  *      @net: the applicable net namespace
 635  *      @name: name to find
 636  *
 637  *      Find an interface by name.
 638  *      If the name is found a pointer to the device is returned.
 639  *      If the name is not found then %NULL is returned.
 640  *      The reference counters are not incremented so the caller must be
 641  *      careful with locks. The caller must hold RCU lock.
 642  */
 643
 644 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 645 {
 646         struct hlist_node *p;
 647         struct net_device *dev;
 648         struct hlist_head *head = dev_name_hash(net, name);
 649
 650         hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 651                 if (!strncmp(dev->name, name, IFNAMSIZ))
 652                         return dev;
 653
 654         return NULL;
 655 }
 656 EXPORT_SYMBOL(dev_get_by_name_rcu);
 657
 658 /**
 659  *      dev_get_by_name         - find a device by its name
 660  *      @net: the applicable net namespace
 661  *      @name: name to find
 662  *
 663  *      Find an interface by name. This can be called from any
 664  *      context and does its own locking. The returned handle has
 665  *      the usage count incremented and the caller must use dev_put() to
 666  *      release it when it is no longer needed. %NULL is returned if no
 667  *      matching device is found.
 668  */
 669
 670 struct net_device *dev_get_by_name(struct net *net, const char *name)
 671 {
 672         struct net_device *dev;
 673
 674         rcu_read_lock();
 675         dev = dev_get_by_name_rcu(net, name);
 676         if (dev)
 677                 dev_hold(dev);
 678         rcu_read_unlock();
 679         return dev;
 680 }
 681 EXPORT_SYMBOL(dev_get_by_name);
 682
 683 /**
 684  *      __dev_get_by_index - find a device by its ifindex
 685  *      @net: the applicable net namespace
 686  *      @ifindex: index of device
 687  *
 688  *      Search for an interface by index. Returns %NULL if the device
 689  *      is not found or a pointer to the device. The device has not
 690  *      had its reference counter increased so the caller must be careful
 691  *      about locking. The caller must hold either the RTNL semaphore
 692  *      or @dev_base_lock.
 693  */
 694
 695 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 696 {
 697         struct hlist_node *p;
 698         struct net_device *dev;
 699         struct hlist_head *head = dev_index_hash(net, ifindex);
 700
 701         hlist_for_each_entry(dev, p, head, index_hlist)
 702                 if (dev->ifindex == ifindex)
 703                         return dev;
 704
 705         return NULL;
 706 }
 707 EXPORT_SYMBOL(__dev_get_by_index);
 708
 709 /**
 710  *      dev_get_by_index_rcu - find a device by its ifindex
 711  *      @net: the applicable net namespace
 712  *      @ifindex: index of device
 713  *
 714  *      Search for an interface by index. Returns %NULL if the device
 715  *      is not found or a pointer to the device. The device has not
 716  *      had its reference counter increased so the caller must be careful
 717  *      about locking. The caller must hold RCU lock.
 718  */
 719
 720 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 721 {
 722         struct hlist_node *p;
 723         struct net_device *dev;
 724         struct hlist_head *head = dev_index_hash(net, ifindex);
 725
 726         hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 727                 if (dev->ifindex == ifindex)
 728                         return dev;
 729
 730         return NULL;
 731 }
 732 EXPORT_SYMBOL(dev_get_by_index_rcu);
 733
 734
 735 /**
 736  *      dev_get_by_index - find a device by its ifindex
 737  *      @net: the applicable net namespace
 738  *      @ifindex: index of device
 739  *
 740  *      Search for an interface by index. Returns NULL if the device
 741  *      is not found or a pointer to the device. The device returned has
 742  *      had a reference added and the pointer is safe until the user calls
 743  *      dev_put to indicate they have finished with it.
 744  */
 745
 746 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 747 {
 748         struct net_device *dev;
 749
 750         rcu_read_lock();
 751         dev = dev_get_by_index_rcu(net, ifindex);
 752         if (dev)
 753                 dev_hold(dev);
 754         rcu_read_unlock();
 755         return dev;
 756 }
 757 EXPORT_SYMBOL(dev_get_by_index);
 758
 759 /**
 760  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 761  *      @net: the applicable net namespace
 762  *      @type: media type of device
 763  *      @ha: hardware address
 764  *
 765  *      Search for an interface by MAC address. Returns NULL if the device
 766  *      is not found or a pointer to the device.
 767  *      The caller must hold RCU or RTNL.
 768  *      The returned device has not had its ref count increased
 769  *      and the caller must therefore be careful about locking
 770  *
 771  */
 772
 773 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 774                                        const char *ha)
 775 {
 776         struct net_device *dev;
 777
 778         for_each_netdev_rcu(net, dev)
 779                 if (dev->type == type &&
 780                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 781                         return dev;
 782
 783         return NULL;
 784 }
 785 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 786
 787 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 788 {
 789         struct net_device *dev;
 790
 791         ASSERT_RTNL();
 792         for_each_netdev(net, dev)
 793                 if (dev->type == type)
 794                         return dev;
 795
 796         return NULL;
 797 }
 798 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 799
 800 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 801 {
 802         struct net_device *dev, *ret = NULL;
 803
 804         rcu_read_lock();
 805         for_each_netdev_rcu(net, dev)
 806                 if (dev->type == type) {
 807                         dev_hold(dev);
 808                         ret = dev;
 809                         break;
 810                 }
 811         rcu_read_unlock();
 812         return ret;
 813 }
 814 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 815
 816 /**
 817  *      dev_get_by_flags_rcu - find any device with given flags
 818  *      @net: the applicable net namespace
 819  *      @if_flags: IFF_* values
 820  *      @mask: bitmask of bits in if_flags to check
 821  *
 822  *      Search for any interface with the given flags. Returns NULL if a device
 823  *      is not found or a pointer to the device. Must be called inside
 824  *      rcu_read_lock(), and result refcount is unchanged.
 825  */
 826
 827 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 828                                     unsigned short mask)
 829 {
 830         struct net_device *dev, *ret;
 831
 832         ret = NULL;
 833         for_each_netdev_rcu(net, dev) {
 834                 if (((dev->flags ^ if_flags) & mask) == 0) {
 835                         ret = dev;
 836                         break;
 837                 }
 838         }
 839         return ret;
 840 }
 841 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 842
 843 /**
 844  *      dev_valid_name - check if name is okay for network device
 845  *      @name: name string
 846  *
 847  *      Network device names need to be valid file names to
 848  *      to allow sysfs to work.  We also disallow any kind of
 849  *      whitespace.
 850  */
 851 bool dev_valid_name(const char *name)
 852 {
 853         if (*name == '\0')
 854                 return false;
 855         if (strlen(name) >= IFNAMSIZ)
 856                 return false;
 857         if (!strcmp(name, ".") || !strcmp(name, ".."))
 858                 return false;
 859
 860         while (*name) {
 861                 if (*name == '/' || isspace(*name))
 862                         return false;
 863                 name++;
 864         }
 865         return true;
 866 }
 867 EXPORT_SYMBOL(dev_valid_name);
 868
/**
 *      __dev_alloc_name - allocate a name for a device
 *      @net: network namespace to allocate the device name in
 *      @name: name format string
 *      @buf:  scratch buffer and result name string
 *
 *      Passed a format string - eg "lt%d" it will try and find a suitable
 *      id. It scans list of devices to build up a free map, then chooses
 *      the first empty slot. The caller must hold the dev_base or rtnl lock
 *      while allocating the name and adding the device in order to avoid
 *      duplicates.
 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *      Returns the number of the unit assigned or a negative errno code.
 *
 *      When @name has no '%' within its first IFNAMSIZ-1 characters the
 *      unit number stays 0 and no device scan is done.  Errors are
 *      -EINVAL for a malformed format, -ENOMEM when the bitmap page cannot
 *      be allocated and -ENFILE when the resulting name is already taken.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
        int i = 0;
        const char *p;
        const int max_netdevices = 8*PAGE_SIZE;
        unsigned long *inuse;
        struct net_device *d;

        p = strnchr(name, IFNAMSIZ-1, '%');
        if (p) {
                /*
                 * Verify the string as this thing may have come from
                 * the user.  There must be exactly one "%d" and no other
                 * "%" characters.
                 */
                if (p[1] != 'd' || strchr(p + 2, '%'))
                        return -EINVAL;

                /* Use one page as a bit array of possible slots */
                inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
                if (!inuse)
                        return -ENOMEM;

                for_each_netdev(net, d) {
                        /* Skip devices whose name does not parse with @name. */
                        if (!sscanf(d->name, name, &i))
                                continue;
                        /* Units outside the bitmap cannot be recorded. */
                        if (i < 0 || i >= max_netdevices)
                                continue;

                        /*  avoid cases where sscanf is not exact inverse of printf */
                        snprintf(buf, IFNAMSIZ, name, i);
                        if (!strncmp(buf, d->name, IFNAMSIZ))
                                set_bit(i, inuse);
                }

                /* Lowest unit number not claimed by an existing device. */
                i = find_first_zero_bit(inuse, max_netdevices);
                free_page((unsigned long) inuse);
        }

        /* Build the candidate name unless the caller's buffer is the format. */
        if (buf != name)
                snprintf(buf, IFNAMSIZ, name, i);
        if (!__dev_get_by_name(net, buf))
                return i;

        /* It is possible to run out of possible slots
         * when the name is long and there isn't enough space left
         * for the digits, or if all bits are used.
         */
        return -ENFILE;
}
 934
 935 /**
 936  *      dev_alloc_name - allocate a name for a device
 937  *      @dev: device
 938  *      @name: name format string
 939  *
 940  *      Passed a format string - eg "lt%d" it will try and find a suitable
 941  *      id. It scans list of devices to build up a free map, then chooses
 942  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 943  *      while allocating the name and adding the device in order to avoid
 944  *      duplicates.
 945  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 946  *      Returns the number of the unit assigned or a negative errno code.
 947  */
 948
 949 int dev_alloc_name(struct net_device *dev, const char *name)
 950 {
 951         char buf[IFNAMSIZ];
 952         struct net *net;
 953         int ret;
 954
 955         BUG_ON(!dev_net(dev));
 956         net = dev_net(dev);
 957         ret = __dev_alloc_name(net, name, buf);
 958         if (ret >= 0)
 959                 strlcpy(dev->name, buf, IFNAMSIZ);
 960         return ret;
 961 }
 962 EXPORT_SYMBOL(dev_alloc_name);
 963
 964 static int dev_get_valid_name(struct net_device *dev, const char *name)
 965 {
 966         struct net *net;
 967
 968         BUG_ON(!dev_net(dev));
 969         net = dev_net(dev);
 970
 971         if (!dev_valid_name(name))
 972                 return -EINVAL;
 973
 974         if (strchr(name, '%'))
 975                 return dev_alloc_name(dev, name);
 976         else if (__dev_get_by_name(net, name))
 977                 return -EEXIST;
 978         else if (dev->name != name)
 979                 strlcpy(dev->name, name, IFNAMSIZ);
 980
 981         return 0;
 982 }
 983
/**
 *      dev_change_name - change name of a device
 *      @dev: device
 *      @newname: name (or format string) must be at least IFNAMSIZ
 *
 *      Change name of a device; a format string such as "eth%d" can be
 *      passed for wildcarding.
 *
 *      Asserts that RTNL is held.  Returns -EBUSY if the device is up and
 *      0 if @newname equals the current name.  Otherwise returns the error
 *      from name validation or from device_rename(), or the value obtained
 *      from dev_get_valid_name(), replaced by the notifier errno when a
 *      NETDEV_CHANGENAME notifier fails and the rename is rolled back.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
        char oldname[IFNAMSIZ];
        int err = 0;
        int ret;
        struct net *net;

        ASSERT_RTNL();
        BUG_ON(!dev_net(dev));

        net = dev_net(dev);
        if (dev->flags & IFF_UP)
                return -EBUSY;

        if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
                return 0;

        /* Keep the old name so it can be restored on failure. */
        memcpy(oldname, dev->name, IFNAMSIZ);

        /* On success this has already written the new name to dev->name. */
        err = dev_get_valid_name(dev, newname);
        if (err < 0)
                return err;

        /*
         * Entered once for the forward rename and, after a notifier
         * failure, a second time with dev->name restored to oldname.
         */
rollback:
        ret = device_rename(&dev->dev, dev->name);
        if (ret) {
                memcpy(dev->name, oldname, IFNAMSIZ);
                return ret;
        }

        /* Unhash under the old position ... */
        write_lock_bh(&dev_base_lock);
        hlist_del_rcu(&dev->name_hlist);
        write_unlock_bh(&dev_base_lock);

        synchronize_rcu();

        /* ... and rehash under the name now held in dev->name. */
        write_lock_bh(&dev_base_lock);
        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
        write_unlock_bh(&dev_base_lock);

        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
        ret = notifier_to_errno(ret);

        if (ret) {
                /* err >= 0 after dev_alloc_name() or stores the first errno */
                if (err >= 0) {
                        /* First failure: remember it and undo the rename. */
                        err = ret;
                        memcpy(dev->name, oldname, IFNAMSIZ);
                        goto rollback;
                } else {
                        /* The rollback pass failed as well; only report it. */
                        pr_err("%s: name change rollback failed: %d\n",
                               dev->name, ret);
                }
        }

        return err;
}
1049
1050 /**
1051  *      dev_set_alias - change ifalias of a device
1052  *      @dev: device
1053  *      @alias: name up to IFALIASZ
1054  *      @len: limit of bytes to copy from info
1055  *
1056  *      Set ifalias for a device,
1057  */
1058 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1059 {
1060         ASSERT_RTNL();
1061
1062         if (len >= IFALIASZ)
1063                 return -EINVAL;
1064
1065         if (!len) {
1066                 if (dev->ifalias) {
1067                         kfree(dev->ifalias);
1068                         dev->ifalias = NULL;
1069                 }
1070                 return 0;
1071         }
1072
1073         dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1074         if (!dev->ifalias)
1075                 return -ENOMEM;
1076
1077         strlcpy(dev->ifalias, alias, len+1);
1078         return len;
1079 }
1080
1081
/**
 *      netdev_features_change - device changes features
 *      @dev: device to cause notification
 *
 *      Called to indicate a device has changed features.  Sends
 *      NETDEV_FEAT_CHANGE through call_netdevice_notifiers(); the
 *      notifier result is discarded.
 */
void netdev_features_change(struct net_device *dev)
{
        call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);
1093
1094 /**
1095  *      netdev_state_change - device changes state
1096  *      @dev: device to cause notification
1097  *
1098  *      Called to indicate a device has changed state. This function calls
1099  *      the notifier chains for netdev_chain and sends a NEWLINK message
1100  *      to the routing socket.
1101  */
1102 void netdev_state_change(struct net_device *dev)
1103 {
1104         if (dev->flags & IFF_UP) {
1105                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1106                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1107         }
1108 }
1109 EXPORT_SYMBOL(netdev_state_change);
1110
/*
 * Send @event for @dev down the netdev notifier chain and return the
 * result of call_netdevice_notifiers() unchanged.
 */
int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
        return call_netdevice_notifiers(event, dev);
}
EXPORT_SYMBOL(netdev_bonding_change);
1116
1117 /**
1118  *      dev_load        - load a network module
1119  *      @net: the applicable net namespace
1120  *      @name: name of interface
1121  *
1122  *      If a network interface is not present and the process has suitable
1123  *      privileges this function loads the module. If module loading is not
1124  *      available in this kernel then it becomes a nop.
1125  */
1126
1127 void dev_load(struct net *net, const char *name)
1128 {
1129         struct net_device *dev;
1130         int no_module;
1131
1132         rcu_read_lock();
1133         dev = dev_get_by_name_rcu(net, name);
1134         rcu_read_unlock();
1135
1136         no_module = !dev;
1137         if (no_module && capable(CAP_NET_ADMIN))
1138                 no_module = request_module("netdev-%s", name);
1139         if (no_module && capable(CAP_SYS_MODULE)) {
1140                 if (!request_module("%s", name))
1141                         pr_err("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s instead.\n",
1142                                name);
1143         }
1144 }
1145 EXPORT_SYMBOL(dev_load);
1146
1147 static int __dev_open(struct net_device *dev)
1148 {
1149         const struct net_device_ops *ops = dev->netdev_ops;
1150         int ret;
1151
1152         ASSERT_RTNL();
1153
1154         if (!netif_device_present(dev))
1155                 return -ENODEV;
1156
1157         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1158         ret = notifier_to_errno(ret);
1159         if (ret)
1160                 return ret;
1161
1162         set_bit(__LINK_STATE_START, &dev->state);
1163
1164         if (ops->ndo_validate_addr)
1165                 ret = ops->ndo_validate_addr(dev);
1166
1167         if (!ret && ops->ndo_open)
1168                 ret = ops->ndo_open(dev);
1169
1170         if (ret)
1171                 clear_bit(__LINK_STATE_START, &dev->state);
1172         else {
1173                 dev->flags |= IFF_UP;
1174                 net_dmaengine_get();
1175                 dev_set_rx_mode(dev);
1176                 dev_activate(dev);
1177         }
1178
1179         return ret;
1180 }
1181
1182 /**
1183  *      dev_open        - prepare an interface for use.
1184  *      @dev:   device to open
1185  *
1186  *      Takes a device from down to up state. The device's private open
1187  *      function is invoked and then the multicast lists are loaded. Finally
1188  *      the device is moved into the up state and a %NETDEV_UP message is
1189  *      sent to the netdev notifier chain.
1190  *
1191  *      Calling this function on an active interface is a nop. On a failure
1192  *      a negative errno code is returned.
1193  */
1194 int dev_open(struct net_device *dev)
1195 {
1196         int ret;
1197
1198         if (dev->flags & IFF_UP)
1199                 return 0;
1200
1201         ret = __dev_open(dev);
1202         if (ret < 0)
1203                 return ret;
1204
1205         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1206         call_netdevice_notifiers(NETDEV_UP, dev);
1207
1208         return ret;
1209 }
1210 EXPORT_SYMBOL(dev_open);
1211
/*
 * Take every device linked on @head through its unreg_list member down,
 * without sending NETDEV_DOWN.  Asserts that RTNL is held and may sleep.
 * Always returns 0.
 */
static int __dev_close_many(struct list_head *head)
{
        struct net_device *dev;

        ASSERT_RTNL();
        might_sleep();

        /* Pass 1: announce the shutdown and clear the running state. */
        list_for_each_entry(dev, head, unreg_list) {
                call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

                clear_bit(__LINK_STATE_START, &dev->state);

                /* Synchronize to scheduled poll. We cannot touch poll list, it
                 * can be even on different cpu. So just clear netif_running().
                 *
                 * dev->stop() will invoke napi_disable() on all of its
                 * napi_struct instances on this device.
                 */
                smp_mb__after_clear_bit(); /* Commit netif_running(). */
        }

        /* Deactivate the whole batch in one call. */
        dev_deactivate_many(head);

        /* Pass 2: stop each device in the driver and drop IFF_UP. */
        list_for_each_entry(dev, head, unreg_list) {
                const struct net_device_ops *ops = dev->netdev_ops;

                /*
                 *      Call the device specific close. This cannot fail.
                 *      Only if device is UP
                 *
                 *      We allow it to be called even after a DETACH hot-plug
                 *      event.
                 */
                if (ops->ndo_stop)
                        ops->ndo_stop(dev);

                dev->flags &= ~IFF_UP;
                net_dmaengine_put();
        }

        return 0;
}
1254
1255 static int __dev_close(struct net_device *dev)
1256 {
1257         int retval;
1258         LIST_HEAD(single);
1259
1260         list_add(&dev->unreg_list, &single);
1261         retval = __dev_close_many(&single);
1262         list_del(&single);
1263         return retval;
1264 }
1265
1266 static int dev_close_many(struct list_head *head)
1267 {
1268         struct net_device *dev, *tmp;
1269         LIST_HEAD(tmp_list);
1270
1271         list_for_each_entry_safe(dev, tmp, head, unreg_list)
1272                 if (!(dev->flags & IFF_UP))
1273                         list_move(&dev->unreg_list, &tmp_list);
1274
1275         __dev_close_many(head);
1276
1277         list_for_each_entry(dev, head, unreg_list) {
1278                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1279                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1280         }
1281
1282         /* rollback_registered_many needs the complete original list */
1283         list_splice(&tmp_list, head);
1284         return 0;
1285 }
1286
1287 /**
1288  *      dev_close - shutdown an interface.
1289  *      @dev: device to shutdown
1290  *
1291  *      This function moves an active device into down state. A
1292  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1293  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1294  *      chain.
1295  */
1296 int dev_close(struct net_device *dev)
1297 {
1298         if (dev->flags & IFF_UP) {
1299                 LIST_HEAD(single);
1300
1301                 list_add(&dev->unreg_list, &single);
1302                 dev_close_many(&single);
1303                 list_del(&single);
1304         }
1305         return 0;
1306 }
1307 EXPORT_SYMBOL(dev_close);
1308
1309
1310 /**
1311  *      dev_disable_lro - disable Large Receive Offload on a device
1312  *      @dev: device
1313  *
1314  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1315  *      called under RTNL.  This is needed if received packets may be
1316  *      forwarded to another interface.
1317  */
1318 void dev_disable_lro(struct net_device *dev)
1319 {
1320         /*
1321          * If we're trying to disable lro on a vlan device
1322          * use the underlying physical device instead
1323          */
1324         if (is_vlan_dev(dev))
1325                 dev = vlan_dev_real_dev(dev);
1326
1327         dev->wanted_features &= ~NETIF_F_LRO;
1328         netdev_update_features(dev);
1329
1330         if (unlikely(dev->features & NETIF_F_LRO))
1331                 netdev_WARN(dev, "failed to disable LRO!\n");
1332 }
1333 EXPORT_SYMBOL(dev_disable_lro);
1334
1335
/*
 * While non-zero, register_netdevice_notifier() does not replay existing
 * devices to a newly registered notifier.  Starts out as 1; where it is
 * cleared is not visible in this part of the file.
 */
static int dev_boot_phase = 1;
1337
/**
 *      register_netdevice_notifier - register a network notifier block
 *      @nb: notifier
 *
 *      Register a notifier to be called when network device events occur.
 *      The notifier passed is linked into the kernel structures and must
 *      not be reused until it has been unregistered. A negative errno code
 *      is returned on a failure.
 *
 *      When registered all registration and up events are replayed
 *      to the new notifier to allow device to have a race free
 *      view of the network device list.
 *
 *      The replay is skipped while dev_boot_phase is set.  If the notifier
 *      rejects a replayed NETDEV_REGISTER, the devices already replayed
 *      are walked again with the matching down/unregister events and the
 *      notifier is removed from the chain.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
        struct net_device *dev;
        struct net_device *last;
        struct net *net;
        int err;

        rtnl_lock();
        err = raw_notifier_chain_register(&netdev_chain, nb);
        if (err)
                goto unlock;
        if (dev_boot_phase)
                goto unlock;
        /* Replay REGISTER (and UP for running devices) in every namespace. */
        for_each_net(net) {
                for_each_netdev(net, dev) {
                        err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
                        err = notifier_to_errno(err);
                        if (err)
                                goto rollback;

                        if (!(dev->flags & IFF_UP))
                                continue;

                        nb->notifier_call(nb, NETDEV_UP, dev);
                }
        }

unlock:
        rtnl_unlock();
        return err;

rollback:
        /* Undo the replay for every device seen before the failing one. */
        last = dev;
        for_each_net(net) {
                for_each_netdev(net, dev) {
                        if (dev == last)
                                goto outroll;

                        if (dev->flags & IFF_UP) {
                                nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
                                nb->notifier_call(nb, NETDEV_DOWN, dev);
                        }
                        nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
                        nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
                }
        }

outroll:
        /* err still holds the notifier's errno when we reach unlock. */
        raw_notifier_chain_unregister(&netdev_chain, nb);
        goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);
1404
1405 /**
1406  *      unregister_netdevice_notifier - unregister a network notifier block
1407  *      @nb: notifier
1408  *
1409  *      Unregister a notifier previously registered by
1410  *      register_netdevice_notifier(). The notifier is unlinked into the
1411  *      kernel structures and may then be reused. A negative errno code
1412  *      is returned on a failure.
1413  */
1414
1415 int unregister_netdevice_notifier(struct notifier_block *nb)
1416 {
1417         int err;
1418
1419         rtnl_lock();
1420         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1421         rtnl_unlock();
1422         return err;
1423 }
1424 EXPORT_SYMBOL(unregister_netdevice_notifier);
1425
/**
 *      call_netdevice_notifiers - call all network notifier blocks
 *      @val: value passed unmodified to notifier function
 *      @dev: net_device pointer passed unmodified to notifier function
 *
 *      Call all network notifier blocks.  Parameters and return value
 *      are as for raw_notifier_call_chain().  Asserts that RTNL is held.
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
        ASSERT_RTNL();
        return raw_notifier_call_chain(&netdev_chain, val, dev);
}
EXPORT_SYMBOL(call_netdevice_notifiers);
1441
/*
 * Static key tested by net_timestamp_set() and net_timestamp_check();
 * raised by net_enable_timestamp() and lowered by net_disable_timestamp().
 */
static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
/* We are not allowed to call static_key_slow_dec() from irq context
 * If net_disable_timestamp() is called from irq context, defer the
 * static_key_slow_dec() calls.
 */
static atomic_t netstamp_needed_deferred;
#endif
1450
/*
 * Take one reference on the netstamp_needed static key.  With jump labels
 * this is also where disables deferred by net_disable_timestamp() are
 * finally applied.
 */
void net_enable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
        /* Atomically claim every disable that has been deferred so far. */
        int deferred = atomic_xchg(&netstamp_needed_deferred, 0);

        if (deferred) {
                /*
                 * One deferred disable cancels this enable, so only
                 * deferred - 1 decrements are applied and the increment
                 * below is skipped.
                 */
                while (--deferred)
                        static_key_slow_dec(&netstamp_needed);
                return;
        }
#endif
        WARN_ON(in_interrupt());
        static_key_slow_inc(&netstamp_needed);
}
EXPORT_SYMBOL(net_enable_timestamp);
1466
/*
 * Drop one reference on the netstamp_needed static key.  With jump labels,
 * a call made in interrupt context only counts the request in
 * netstamp_needed_deferred; net_enable_timestamp() applies it later.
 */
void net_disable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
        if (in_interrupt()) {
                atomic_inc(&netstamp_needed_deferred);
                return;
        }
#endif
        static_key_slow_dec(&netstamp_needed);
}
EXPORT_SYMBOL(net_disable_timestamp);
1478
/*
 * Clear the timestamp of @skb, then stamp it again through
 * __net_timestamp() if the netstamp_needed key is enabled.
 */
static inline void net_timestamp_set(struct sk_buff *skb)
{
        skb->tstamp.tv64 = 0;
        if (static_key_false(&netstamp_needed))
                __net_timestamp(skb);
}
1485
/*
 * net_timestamp_check - stamp @SKB if stamping is enabled and still needed
 * @COND: extra condition that must hold for the stamp to be taken
 * @SKB: buffer to stamp; evaluated more than once
 *
 * Calls __net_timestamp() on @SKB when the netstamp_needed key is enabled,
 * @COND is true and the buffer has no timestamp yet.  Wrapped in
 * do { } while (0) so that the macro expands to a single statement and can
 * be used as the body of an unbraced if/else; the caller supplies the
 * trailing semicolon.
 */
#define net_timestamp_check(COND, SKB)                          \
        do {                                                    \
                if (static_key_false(&netstamp_needed)) {       \
                        if ((COND) && !(SKB)->tstamp.tv64)      \
                                __net_timestamp(SKB);           \
                }                                               \
        } while (0)
1491
1492 static int net_hwtstamp_validate(struct ifreq *ifr)
1493 {
1494         struct hwtstamp_config cfg;
1495         enum hwtstamp_tx_types tx_type;
1496         enum hwtstamp_rx_filters rx_filter;
1497         int tx_type_valid = 0;
1498         int rx_filter_valid = 0;
1499
1500         if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1501                 return -EFAULT;
1502
1503         if (cfg.flags) /* reserved for future extensions */
1504                 return -EINVAL;
1505
1506         tx_type = cfg.tx_type;
1507         rx_filter = cfg.rx_filter;
1508
1509         switch (tx_type) {
1510         case HWTSTAMP_TX_OFF:
1511         case HWTSTAMP_TX_ON:
1512         case HWTSTAMP_TX_ONESTEP_SYNC:
1513                 tx_type_valid = 1;
1514                 break;
1515         }
1516
1517         switch (rx_filter) {
1518         case HWTSTAMP_FILTER_NONE:
1519         case HWTSTAMP_FILTER_ALL:
1520         case HWTSTAMP_FILTER_SOME:
1521         case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1522         case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1523         case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1524         case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1525         case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1526         case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1527         case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1528         case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1529         case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1530         case HWTSTAMP_FILTER_PTP_V2_EVENT:
1531         case HWTSTAMP_FILTER_PTP_V2_SYNC:
1532         case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1533                 rx_filter_valid = 1;
1534                 break;
1535         }
1536
1537         if (!tx_type_valid || !rx_filter_valid)
1538                 return -ERANGE;
1539
1540         return 0;
1541 }
1542
1543 static inline bool is_skb_forwardable(struct net_device *dev,
1544                                       struct sk_buff *skb)
1545 {
1546         unsigned int len;
1547
1548         if (!(dev->flags & IFF_UP))
1549                 return false;
1550
1551         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1552         if (skb->len <= len)
1553                 return true;
1554
1555         /* if TSO is enabled, we don't care about the length as the packet
1556          * could be forwarded without being segmented before
1557          */
1558         if (skb_is_gso(skb))
1559                 return true;
1560
1561         return false;
1562 }
1563
1564 /**
1565  * dev_forward_skb - loopback an skb to another netif
1566  *
1567  * @dev: destination network device
1568  * @skb: buffer to forward
1569  *
1570  * return values:
1571  *      NET_RX_SUCCESS  (no congestion)
1572  *      NET_RX_DROP     (packet was dropped, but freed)
1573  *
1574  * dev_forward_skb can be used for injecting an skb from the
1575  * start_xmit function of one device into the receive queue
1576  * of another device.
1577  *
1578  * The receiving device may be in another namespace, so
1579  * we have to clear all information in the skb that could
1580  * impact namespace isolation.
1581  */
1582 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1583 {
1584         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1585                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1586                         atomic_long_inc(&dev->rx_dropped);
1587                         kfree_skb(skb);
1588                         return NET_RX_DROP;
1589                 }
1590         }
1591
1592         skb_orphan(skb);
1593         nf_reset(skb);
1594
1595         if (unlikely(!is_skb_forwardable(dev, skb))) {
1596                 atomic_long_inc(&dev->rx_dropped);
1597                 kfree_skb(skb);
1598                 return NET_RX_DROP;
1599         }
1600         skb->skb_iif = 0;
1601         skb_set_dev(skb, dev);
1602         skb->tstamp.tv64 = 0;
1603         skb->pkt_type = PACKET_HOST;
1604         skb->protocol = eth_type_trans(skb, dev);
1605         return netif_rx(skb);
1606 }
1607 EXPORT_SYMBOL_GPL(dev_forward_skb);
1608
/*
 * Hand @skb to the handler of @pt_prev.  skb->users is incremented first,
 * so the buffer is passed with one extra reference.  Returns whatever the
 * handler returns.
 */
static inline int deliver_skb(struct sk_buff *skb,
                              struct packet_type *pt_prev,
                              struct net_device *orig_dev)
{
        atomic_inc(&skb->users);
        return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
1616
/*
 *      Support routine. Sends outgoing frames to any network
 *      taps currently in use.
 *
 *      The frame is cloned once, when the first matching tap is found.
 *      Delivery always lags one tap behind: each further match delivers
 *      the clone to the previous tap with an extra reference, and the
 *      last tap receives it through a direct call without one.
 */

static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
        struct packet_type *ptype;
        struct sk_buff *skb2 = NULL;
        struct packet_type *pt_prev = NULL;

        rcu_read_lock();
        list_for_each_entry_rcu(ptype, &ptype_all, list) {
                /* Never send packets back to the socket
                 * they originated from - MvS (miquels@drinkel.ow.org)
                 */
                if ((ptype->dev == dev || !ptype->dev) &&
                    (ptype->af_packet_priv == NULL ||
                     (struct sock *)ptype->af_packet_priv != skb->sk)) {
                        /* Clone already exists: flush it to the previous tap. */
                        if (pt_prev) {
                                deliver_skb(skb2, pt_prev, skb->dev);
                                pt_prev = ptype;
                                continue;
                        }

                        /* First matching tap: make the shared clone. */
                        skb2 = skb_clone(skb, GFP_ATOMIC);
                        if (!skb2)
                                break;

                        net_timestamp_set(skb2);

                        /* skb->nh should be correctly
                           set by sender, so that the second statement is
                           just protection against buggy protocols.
                         */
                        skb_reset_mac_header(skb2);

                        if (skb_network_header(skb2) < skb2->data ||
                            skb2->network_header > skb2->tail) {
                                if (net_ratelimit())
                                        pr_crit("protocol %04x is buggy, dev %s\n",
                                                ntohs(skb2->protocol),
                                                dev->name);
                                skb_reset_network_header(skb2);
                        }

                        skb2->transport_header = skb2->network_header;
                        skb2->pkt_type = PACKET_OUTGOING;
                        pt_prev = ptype;
                }
        }
        /* Final tap gets the clone without an additional reference. */
        if (pt_prev)
                pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
        rcu_read_unlock();
}
1672
1673 /* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1674  * @dev: Network device
1675  * @txq: number of queues available
1676  *
1677  * If real_num_tx_queues is changed the tc mappings may no longer be
1678  * valid. To resolve this verify the tc mapping remains valid and if
1679  * not NULL the mapping. With no priorities mapping to this
1680  * offset/count pair it will no longer be used. In the worst case TC0
1681  * is invalid nothing can be done so disable priority mappings. If is
1682  * expected that drivers will fix this mapping if they can before
1683  * calling netif_set_real_num_tx_queues.
1684  */
1685 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1686 {
1687         int i;
1688         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1689
1690         /* If TC0 is invalidated disable TC mapping */
1691         if (tc->offset + tc->count > txq) {
1692                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1693                 dev->num_tc = 0;
1694                 return;
1695         }
1696
1697         /* Invalidated prio to tc mappings set to TC0 */
1698         for (i = 1; i < TC_BITMASK + 1; i++) {
1699                 int q = netdev_get_prio_tc_map(dev, i);
1700
1701                 tc = &dev->tc_to_txq[q];
1702                 if (tc->offset + tc->count > txq) {
1703                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1704                                 i, q);
1705                         netdev_set_prio_tc_map(dev, i, 0);
1706                 }
1707         }
1708 }
1709
1710 /*
1711  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1712  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1713  */
1714 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1715 {
1716         int rc;
1717
1718         if (txq < 1 || txq > dev->num_tx_queues)
1719                 return -EINVAL;
1720
1721         if (dev->reg_state == NETREG_REGISTERED ||
1722             dev->reg_state == NETREG_UNREGISTERING) {
1723                 ASSERT_RTNL();
1724
1725                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1726                                                   txq);
1727                 if (rc)
1728                         return rc;
1729
1730                 if (dev->num_tc)
1731                         netif_setup_tc(dev, txq);
1732
1733                 if (txq < dev->real_num_tx_queues)
1734                         qdisc_reset_all_tx_gt(dev, txq);
1735         }
1736
1737         dev->real_num_tx_queues = txq;
1738         return 0;
1739 }
1740 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1741
#ifdef CONFIG_RPS
/**
 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 *	@dev: Network device
 *	@rxq: Actual number of RX queues
 *
 *	This must be called either with the rtnl_lock held or before
 *	registration of the net device.  Returns 0 on success, or a
 *	negative error code: -EINVAL when @rxq is zero or larger than
 *	dev->num_rx_queues, or the error from resizing the rx queue
 *	kobjects, which is only attempted for a registered device.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
	if (rxq == 0 || rxq > dev->num_rx_queues)
		return -EINVAL;

	if (dev->reg_state == NETREG_REGISTERED) {
		int err;

		ASSERT_RTNL();

		err = net_rx_queue_update_kobjects(dev,
						   dev->real_num_rx_queues,
						   rxq);
		if (err)
			return err;
	}

	dev->real_num_rx_queues = rxq;
	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif
1774
1775 static inline void __netif_reschedule(struct Qdisc *q)
1776 {
1777         struct softnet_data *sd;
1778         unsigned long flags;
1779
1780         local_irq_save(flags);
1781         sd = &__get_cpu_var(softnet_data);
1782         q->next_sched = NULL;
1783         *sd->output_queue_tailp = q;
1784         sd->output_queue_tailp = &q->next_sched;
1785         raise_softirq_irqoff(NET_TX_SOFTIRQ);
1786         local_irq_restore(flags);
1787 }
1788
1789 void __netif_schedule(struct Qdisc *q)
1790 {
1791         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1792                 __netif_reschedule(q);
1793 }
1794 EXPORT_SYMBOL(__netif_schedule);
1795
1796 void dev_kfree_skb_irq(struct sk_buff *skb)
1797 {
1798         if (atomic_dec_and_test(&skb->users)) {
1799                 struct softnet_data *sd;
1800                 unsigned long flags;
1801
1802                 local_irq_save(flags);
1803                 sd = &__get_cpu_var(softnet_data);
1804                 skb->next = sd->completion_queue;
1805                 sd->completion_queue = skb;
1806                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1807                 local_irq_restore(flags);
1808         }
1809 }
1810 EXPORT_SYMBOL(dev_kfree_skb_irq);
1811
/*
 * Free @skb from any context: use the deferred dev_kfree_skb_irq()
 * path when in hard irq context or with interrupts disabled, and
 * dev_kfree_skb() otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (!in_irq() && !irqs_disabled()) {
		dev_kfree_skb(skb);
		return;
	}

	dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1820
1821
1822 /**
1823  * netif_device_detach - mark device as removed
1824  * @dev: network device
1825  *
1826  * Mark device as removed from system and therefore no longer available.
1827  */
1828 void netif_device_detach(struct net_device *dev)
1829 {
1830         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1831             netif_running(dev)) {
1832                 netif_tx_stop_all_queues(dev);
1833         }
1834 }
1835 EXPORT_SYMBOL(netif_device_detach);
1836
1837 /**
1838  * netif_device_attach - mark device as attached
1839  * @dev: network device
1840  *
1841  * Mark device as attached from system and restart if needed.
1842  */
1843 void netif_device_attach(struct net_device *dev)
1844 {
1845         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1846             netif_running(dev)) {
1847                 netif_tx_wake_all_queues(dev);
1848                 __netdev_watchdog_up(dev);
1849         }
1850 }
1851 EXPORT_SYMBOL(netif_device_attach);
1852
#ifdef CONFIG_NET_NS
/**
 * skb_set_dev -- assign a new device to a buffer
 * @skb: buffer for the new device
 * @dev: network device
 *
 * The dst reference is always dropped.  If the skb is already owned by
 * a device that lives in a different network namespace than @dev, all
 * state private to the old namespace (secpath, netfilter state,
 * secmark, mark, priority, trace/ipvs flags, tc index) is reset before
 * the new device is assigned.
 */
void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
{
	bool crosses_netns;

	skb_dst_drop(skb);

	crosses_netns = skb->dev &&
			!net_eq(dev_net(skb->dev), dev_net(dev));
	if (crosses_netns) {
		secpath_reset(skb);
		nf_reset(skb);
		skb_init_secmark(skb);
		skb->priority = 0;
		skb->mark = 0;
		skb->ipvs_property = 0;
		skb->nf_trace = 0;
#ifdef CONFIG_NET_SCHED
		skb->tc_index = 0;
#endif
	}

	skb->dev = dev;
}
EXPORT_SYMBOL(skb_set_dev);
#endif /* CONFIG_NET_NS */
1882
1883 static void skb_warn_bad_offload(const struct sk_buff *skb)
1884 {
1885         static const netdev_features_t null_features = 0;
1886         struct net_device *dev = skb->dev;
1887         const char *driver = "";
1888
1889         if (dev && dev->dev.parent)
1890                 driver = dev_driver_string(dev->dev.parent);
1891
1892         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
1893              "gso_type=%d ip_summed=%d\n",
1894              driver, dev ? &dev->features : &null_features,
1895              skb->sk ? &skb->sk->sk_route_caps : &null_features,
1896              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
1897              skb_shinfo(skb)->gso_type, skb->ip_summed);
1898 }
1899
1900 /*
1901  * Invalidate hardware checksum when packet is to be mangled, and
1902  * complete checksum manually on outgoing path.
1903  */
1904 int skb_checksum_help(struct sk_buff *skb)
1905 {
1906         __wsum csum;
1907         int ret = 0, offset;
1908
1909         if (skb->ip_summed == CHECKSUM_COMPLETE)
1910                 goto out_set_summed;
1911
1912         if (unlikely(skb_shinfo(skb)->gso_size)) {
1913                 skb_warn_bad_offload(skb);
1914                 return -EINVAL;
1915         }
1916
1917         offset = skb_checksum_start_offset(skb);
1918         BUG_ON(offset >= skb_headlen(skb));
1919         csum = skb_checksum(skb, offset, skb->len - offset, 0);
1920
1921         offset += skb->csum_offset;
1922         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1923
1924         if (skb_cloned(skb) &&
1925             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1926                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1927                 if (ret)
1928                         goto out;
1929         }
1930
1931         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1932 out_set_summed:
1933         skb->ip_summed = CHECKSUM_NONE;
1934 out:
1935         return ret;
1936 }
1937 EXPORT_SYMBOL(skb_checksum_help);
1938
1939 /**
1940  *      skb_gso_segment - Perform segmentation on skb.
1941  *      @skb: buffer to segment
1942  *      @features: features for the output path (see dev->features)
1943  *
1944  *      This function segments the given skb and returns a list of segments.
1945  *
1946  *      It may return NULL if the skb requires no segmentation.  This is
1947  *      only possible when GSO is used for verifying header integrity.
1948  */
1949 struct sk_buff *skb_gso_segment(struct sk_buff *skb,
1950         netdev_features_t features)
1951 {
1952         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1953         struct packet_type *ptype;
1954         __be16 type = skb->protocol;
1955         int vlan_depth = ETH_HLEN;
1956         int err;
1957
1958         while (type == htons(ETH_P_8021Q)) {
1959                 struct vlan_hdr *vh;
1960
1961                 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1962                         return ERR_PTR(-EINVAL);
1963
1964                 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1965                 type = vh->h_vlan_encapsulated_proto;
1966                 vlan_depth += VLAN_HLEN;
1967         }
1968
1969         skb_reset_mac_header(skb);
1970         skb->mac_len = skb->network_header - skb->mac_header;
1971         __skb_pull(skb, skb->mac_len);
1972
1973         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1974                 skb_warn_bad_offload(skb);
1975
1976                 if (skb_header_cloned(skb) &&
1977                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1978                         return ERR_PTR(err);
1979         }
1980
1981         rcu_read_lock();
1982         list_for_each_entry_rcu(ptype,
1983                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1984                 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1985                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1986                                 err = ptype->gso_send_check(skb);
1987                                 segs = ERR_PTR(err);
1988                                 if (err || skb_gso_ok(skb, features))
1989                                         break;
1990                                 __skb_push(skb, (skb->data -
1991                                                  skb_network_header(skb)));
1992                         }
1993                         segs = ptype->gso_segment(skb, features);
1994                         break;
1995                 }
1996         }
1997         rcu_read_unlock();
1998
1999         __skb_push(skb, skb->data - skb_mac_header(skb));
2000
2001         return segs;
2002 }
2003 EXPORT_SYMBOL(skb_gso_segment);
2004
/*
 * Report a hardware receive checksum failure: rate-limited error
 * message naming the device (or "<unknown>") followed by a stack dump.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	const char *name;

	if (!net_ratelimit())
		return;

	name = dev ? dev->name : "<unknown>";
	pr_err("%s: hw csum failure\n", name);
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
2016
/*
 * Return 1 if some page fragment of @skb cannot be DMA'd by @dev,
 * 0 otherwise.  Without CONFIG_HIGHMEM this is always 0.
 *
 * Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct device *parent;
	int idx;

	/* Device cannot DMA from highmem: any highmem fragment is illegal */
	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (idx = 0; idx < skb_shinfo(skb)->nr_frags; idx++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[idx];

			if (PageHighMem(skb_frag_page(frag)))
				return 1;
		}
	}

	if (!PCI_DMA_BUS_IS_PHYS)
		return 0;

	parent = dev->dev.parent;
	if (!parent)
		return 0;

	/* Every fragment page must lie within the parent's DMA mask */
	for (idx = 0; idx < skb_shinfo(skb)->nr_frags; idx++) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[idx];
		dma_addr_t addr = page_to_phys(skb_frag_page(frag));

		if (!parent->dma_mask ||
		    addr + PAGE_SIZE - 1 > *parent->dma_mask)
			return 1;
	}
#endif
	return 0;
}
2049
2050 struct dev_gso_cb {
2051         void (*destructor)(struct sk_buff *skb);
2052 };
2053
2054 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2055
2056 static void dev_gso_skb_destructor(struct sk_buff *skb)
2057 {
2058         struct dev_gso_cb *cb;
2059
2060         do {
2061                 struct sk_buff *nskb = skb->next;
2062
2063                 skb->next = nskb->next;
2064                 nskb->next = NULL;
2065                 kfree_skb(nskb);
2066         } while (skb->next);
2067
2068         cb = DEV_GSO_CB(skb);
2069         if (cb->destructor)
2070                 cb->destructor(skb);
2071 }
2072
2073 /**
2074  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
2075  *      @skb: buffer to segment
2076  *      @features: device features as applicable to this skb
2077  *
2078  *      This function segments the given skb and stores the list of segments
2079  *      in skb->next.
2080  */
2081 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2082 {
2083         struct sk_buff *segs;
2084
2085         segs = skb_gso_segment(skb, features);
2086
2087         /* Verifying header integrity only. */
2088         if (!segs)
2089                 return 0;
2090
2091         if (IS_ERR(segs))
2092                 return PTR_ERR(segs);
2093
2094         skb->next = segs;
2095         DEV_GSO_CB(skb)->destructor = skb->destructor;
2096         skb->destructor = dev_gso_skb_destructor;
2097
2098         return 0;
2099 }
2100
2101 /*
2102  * Try to orphan skb early, right before transmission by the device.
2103  * We cannot orphan skb if tx timestamp is requested or the sk-reference
2104  * is needed on driver level for other reasons, e.g. see net/can/raw.c
2105  */
2106 static inline void skb_orphan_try(struct sk_buff *skb)
2107 {
2108         struct sock *sk = skb->sk;
2109
2110         if (sk && !skb_shinfo(skb)->tx_flags) {
2111                 /* skb_tx_hash() wont be able to get sk.
2112                  * We copy sk_hash into skb->rxhash
2113                  */
2114                 if (!skb->rxhash)
2115                         skb->rxhash = sk->sk_hash;
2116                 skb_orphan(skb);
2117         }
2118 }
2119
2120 static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
2121 {
2122         return ((features & NETIF_F_GEN_CSUM) ||
2123                 ((features & NETIF_F_V4_CSUM) &&
2124                  protocol == htons(ETH_P_IP)) ||
2125                 ((features & NETIF_F_V6_CSUM) &&
2126                  protocol == htons(ETH_P_IPV6)) ||
2127                 ((features & NETIF_F_FCOE_CRC) &&
2128                  protocol == htons(ETH_P_FCOE)));
2129 }
2130
2131 static netdev_features_t harmonize_features(struct sk_buff *skb,
2132         __be16 protocol, netdev_features_t features)
2133 {
2134         if (!can_checksum_protocol(features, protocol)) {
2135                 features &= ~NETIF_F_ALL_CSUM;
2136                 features &= ~NETIF_F_SG;
2137         } else if (illegal_highdma(skb->dev, skb)) {
2138                 features &= ~NETIF_F_SG;
2139         }
2140
2141         return features;
2142 }
2143
2144 netdev_features_t netif_skb_features(struct sk_buff *skb)
2145 {
2146         __be16 protocol = skb->protocol;
2147         netdev_features_t features = skb->dev->features;
2148
2149         if (protocol == htons(ETH_P_8021Q)) {
2150                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2151                 protocol = veh->h_vlan_encapsulated_proto;
2152         } else if (!vlan_tx_tag_present(skb)) {
2153                 return harmonize_features(skb, protocol, features);
2154         }
2155
2156         features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2157
2158         if (protocol != htons(ETH_P_8021Q)) {
2159                 return harmonize_features(skb, protocol, features);
2160         } else {
2161                 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2162                                 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2163                 return harmonize_features(skb, protocol, features);
2164         }
2165 }
2166 EXPORT_SYMBOL(netif_skb_features);
2167
2168 /*
2169  * Returns true if either:
2170  *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2171  *      2. skb is fragmented and the device does not support SG, or if
2172  *         at least one of fragments is in highmem and device does not
2173  *         support DMA from it.
2174  */
2175 static inline int skb_needs_linearize(struct sk_buff *skb,
2176                                       int features)
2177 {
2178         return skb_is_nonlinear(skb) &&
2179                         ((skb_has_frag_list(skb) &&
2180                                 !(features & NETIF_F_FRAGLIST)) ||
2181                         (skb_shinfo(skb)->nr_frags &&
2182                                 !(features & NETIF_F_SG)));
2183 }
2184
2185 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2186                         struct netdev_queue *txq)
2187 {
2188         const struct net_device_ops *ops = dev->netdev_ops;
2189         int rc = NETDEV_TX_OK;
2190         unsigned int skb_len;
2191
2192         if (likely(!skb->next)) {
2193                 netdev_features_t features;
2194
2195                 /*
2196                  * If device doesn't need skb->dst, release it right now while
2197                  * its hot in this cpu cache
2198                  */
2199                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2200                         skb_dst_drop(skb);
2201
2202                 if (!list_empty(&ptype_all))
2203                         dev_queue_xmit_nit(skb, dev);
2204
2205                 skb_orphan_try(skb);
2206
2207                 features = netif_skb_features(skb);
2208
2209                 if (vlan_tx_tag_present(skb) &&
2210                     !(features & NETIF_F_HW_VLAN_TX)) {
2211                         skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2212                         if (unlikely(!skb))
2213                                 goto out;
2214
2215                         skb->vlan_tci = 0;
2216                 }
2217
2218                 if (netif_needs_gso(skb, features)) {
2219                         if (unlikely(dev_gso_segment(skb, features)))
2220                                 goto out_kfree_skb;
2221                         if (skb->next)
2222                                 goto gso;
2223                 } else {
2224                         if (skb_needs_linearize(skb, features) &&
2225                             __skb_linearize(skb))
2226                                 goto out_kfree_skb;
2227
2228                         /* If packet is not checksummed and device does not
2229                          * support checksumming for this protocol, complete
2230                          * checksumming here.
2231                          */
2232                         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2233                                 skb_set_transport_header(skb,
2234                                         skb_checksum_start_offset(skb));
2235                                 if (!(features & NETIF_F_ALL_CSUM) &&
2236                                      skb_checksum_help(skb))
2237                                         goto out_kfree_skb;
2238                         }
2239                 }
2240
2241                 skb_len = skb->len;
2242                 rc = ops->ndo_start_xmit(skb, dev);
2243                 trace_net_dev_xmit(skb, rc, dev, skb_len);
2244                 if (rc == NETDEV_TX_OK)
2245                         txq_trans_update(txq);
2246                 return rc;
2247         }
2248
2249 gso:
2250         do {
2251                 struct sk_buff *nskb = skb->next;
2252
2253                 skb->next = nskb->next;
2254                 nskb->next = NULL;
2255
2256                 /*
2257                  * If device doesn't need nskb->dst, release it right now while
2258                  * its hot in this cpu cache
2259                  */
2260                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2261                         skb_dst_drop(nskb);
2262
2263                 skb_len = nskb->len;
2264                 rc = ops->ndo_start_xmit(nskb, dev);
2265                 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2266                 if (unlikely(rc != NETDEV_TX_OK)) {
2267                         if (rc & ~NETDEV_TX_MASK)
2268                                 goto out_kfree_gso_skb;
2269                         nskb->next = skb->next;
2270                         skb->next = nskb;
2271                         return rc;
2272                 }
2273                 txq_trans_update(txq);
2274                 if (unlikely(netif_xmit_stopped(txq) && skb->next))
2275                         return NETDEV_TX_BUSY;
2276         } while (skb->next);
2277
2278 out_kfree_gso_skb:
2279         if (likely(skb->next == NULL))
2280                 skb->destructor = DEV_GSO_CB(skb)->destructor;
2281 out_kfree_skb:
2282         kfree_skb(skb);
2283 out:
2284         return rc;
2285 }
2286
2287 static u32 hashrnd __read_mostly;
2288
2289 /*
2290  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2291  * to be used as a distribution range.
2292  */
2293 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2294                   unsigned int num_tx_queues)
2295 {
2296         u32 hash;
2297         u16 qoffset = 0;
2298         u16 qcount = num_tx_queues;
2299
2300         if (skb_rx_queue_recorded(skb)) {
2301                 hash = skb_get_rx_queue(skb);
2302                 while (unlikely(hash >= num_tx_queues))
2303                         hash -= num_tx_queues;
2304                 return hash;
2305         }
2306
2307         if (dev->num_tc) {
2308                 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2309                 qoffset = dev->tc_to_txq[tc].offset;
2310                 qcount = dev->tc_to_txq[tc].count;
2311         }
2312
2313         if (skb->sk && skb->sk->sk_hash)
2314                 hash = skb->sk->sk_hash;
2315         else
2316                 hash = (__force u16) skb->protocol ^ skb->rxhash;
2317         hash = jhash_1word(hash, hashrnd);
2318
2319         return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2320 }
2321 EXPORT_SYMBOL(__skb_tx_hash);
2322
2323 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2324 {
2325         if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2326                 if (net_ratelimit()) {
2327                         pr_warn("%s selects TX queue %d, but real number of TX queues is %d\n",
2328                                 dev->name, queue_index,
2329                                 dev->real_num_tx_queues);
2330                 }
2331                 return 0;
2332         }
2333         return queue_index;
2334 }
2335
/*
 * Pick a tx queue from the device's XPS map for the current CPU.
 * Returns the queue index, or -1 when XPS is not built in, no map
 * exists for this CPU, or the mapped queue is not below
 * real_num_tx_queues.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *dev_maps;
	struct xps_map *map;
	int queue_index = -1;

	rcu_read_lock();

	dev_maps = rcu_dereference(dev->xps_maps);
	if (!dev_maps)
		goto unlock;

	map = rcu_dereference(dev_maps->cpu_map[raw_smp_processor_id()]);
	if (!map)
		goto unlock;

	if (map->len == 1) {
		queue_index = map->queues[0];
	} else {
		u32 hash;

		/* Several candidates: spread flows over them by hash */
		if (skb->sk && skb->sk->sk_hash)
			hash = skb->sk->sk_hash;
		else
			hash = (__force u16) skb->protocol ^
			    skb->rxhash;
		hash = jhash_1word(hash, hashrnd);
		queue_index = map->queues[((u64)hash * map->len) >> 32];
	}

	if (unlikely(queue_index >= dev->real_num_tx_queues))
		queue_index = -1;

unlock:
	rcu_read_unlock();

	return queue_index;
#else
	return -1;
#endif
}
2373
2374 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2375                                         struct sk_buff *skb)
2376 {
2377         int queue_index;
2378         const struct net_device_ops *ops = dev->netdev_ops;
2379
2380         if (dev->real_num_tx_queues == 1)
2381                 queue_index = 0;
2382         else if (ops->ndo_select_queue) {
2383                 queue_index = ops->ndo_select_queue(dev, skb);
2384                 queue_index = dev_cap_txqueue(dev, queue_index);
2385         } else {
2386                 struct sock *sk = skb->sk;
2387                 queue_index = sk_tx_queue_get(sk);
2388
2389                 if (queue_index < 0 || skb->ooo_okay ||
2390                     queue_index >= dev->real_num_tx_queues) {
2391                         int old_index = queue_index;
2392
2393                         queue_index = get_xps_queue(dev, skb);
2394                         if (queue_index < 0)
2395                                 queue_index = skb_tx_hash(dev, skb);
2396
2397                         if (queue_index != old_index && sk) {
2398                                 struct dst_entry *dst =
2399                                     rcu_dereference_check(sk->sk_dst_cache, 1);
2400
2401                                 if (dst && skb_dst(skb) == dst)
2402                                         sk_tx_queue_set(sk, queue_index);
2403                         }
2404                 }
2405         }
2406
2407         skb_set_queue_mapping(skb, queue_index);
2408         return netdev_get_tx_queue(dev, queue_index);
2409 }
2410
2411 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2412                                  struct net_device *dev,
2413                                  struct netdev_queue *txq)
2414 {
2415         spinlock_t *root_lock = qdisc_lock(q);
2416         bool contended;
2417         int rc;
2418
2419         qdisc_skb_cb(skb)->pkt_len = skb->len;
2420         qdisc_calculate_pkt_len(skb, q);
2421         /*
2422          * Heuristic to force contended enqueues to serialize on a
2423          * separate lock before trying to get qdisc main lock.
2424          * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2425          * and dequeue packets faster.
2426          */
2427         contended = qdisc_is_running(q);
2428         if (unlikely(contended))
2429                 spin_lock(&q->busylock);
2430
2431         spin_lock(root_lock);
2432         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2433                 kfree_skb(skb);
2434                 rc = NET_XMIT_DROP;
2435         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2436                    qdisc_run_begin(q)) {
2437                 /*
2438                  * This is a work-conserving queue; there are no old skbs
2439                  * waiting to be sent out; and the qdisc is not running -
2440                  * xmit the skb directly.
2441                  */
2442                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2443                         skb_dst_force(skb);
2444
2445                 qdisc_bstats_update(q, skb);
2446
2447                 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2448                         if (unlikely(contended)) {
2449                                 spin_unlock(&q->busylock);
2450                                 contended = false;
2451                         }
2452                         __qdisc_run(q);
2453                 } else
2454                         qdisc_run_end(q);
2455
2456                 rc = NET_XMIT_SUCCESS;
2457         } else {
2458                 skb_dst_force(skb);
2459                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2460                 if (qdisc_run_begin(q)) {
2461                         if (unlikely(contended)) {
2462                                 spin_unlock(&q->busylock);
2463                                 contended = false;
2464                         }
2465                         __qdisc_run(q);
2466                 }
2467         }
2468         spin_unlock(root_lock);
2469         if (unlikely(contended))
2470                 spin_unlock(&q->busylock);
2471         return rc;
2472 }
2473
2474 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2475 static void skb_update_prio(struct sk_buff *skb)
2476 {
2477         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2478
2479         if ((!skb->priority) && (skb->sk) && map)
2480                 skb->priority = map->priomap[skb->sk->sk_cgrp_prioidx];
2481 }
2482 #else
2483 #define skb_update_prio(skb)
2484 #endif
2485
2486 static DEFINE_PER_CPU(int, xmit_recursion);
2487 #define RECURSION_LIMIT 10
2488
2489 /**
2490  *      dev_queue_xmit - transmit a buffer
2491  *      @skb: buffer to transmit
2492  *
2493  *      Queue a buffer for transmission to a network device. The caller must
2494  *      have set the device and priority and built the buffer before calling
2495  *      this function. The function can be called from an interrupt.
2496  *
2497  *      A negative errno code is returned on a failure. A success does not
2498  *      guarantee the frame will be transmitted as it may be dropped due
2499  *      to congestion or traffic shaping.
2500  *
2501  * -----------------------------------------------------------------------------------
2502  *      I notice this method can also return errors from the queue disciplines,
2503  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2504  *      be positive.
2505  *
2506  *      Regardless of the return value, the skb is consumed, so it is currently
2507  *      difficult to retry a send to this method.  (You can bump the ref count
2508  *      before sending to hold a reference for retry if you are careful.)
2509  *
2510  *      When calling this method, interrupts MUST be enabled.  This is because
2511  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2512  *          --BLG
2513  */
2514 int dev_queue_xmit(struct sk_buff *skb)
2515 {
2516         struct net_device *dev = skb->dev;
2517         struct netdev_queue *txq;
2518         struct Qdisc *q;
2519         int rc = -ENOMEM;
2520
2521         /* Disable soft irqs for various locks below. Also
2522          * stops preemption for RCU.
2523          */
2524         rcu_read_lock_bh();
2525
2526         skb_update_prio(skb);
2527
2528         txq = dev_pick_tx(dev, skb);
2529         q = rcu_dereference_bh(txq->qdisc);
2530
2531 #ifdef CONFIG_NET_CLS_ACT
2532         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2533 #endif
2534         trace_net_dev_queue(skb);
2535         if (q->enqueue) {
2536                 rc = __dev_xmit_skb(skb, q, dev, txq);
2537                 goto out;
2538         }
2539
2540         /* The device has no queue. Common case for software devices:
2541            loopback, all the sorts of tunnels...
2542
2543            Really, it is unlikely that netif_tx_lock protection is necessary
2544            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2545            counters.)
2546            However, it is possible, that they rely on protection
2547            made by us here.
2548
2549            Check this and shot the lock. It is not prone from deadlocks.
2550            Either shot noqueue qdisc, it is even simpler 8)
2551          */
2552         if (dev->flags & IFF_UP) {
2553                 int cpu = smp_processor_id(); /* ok because BHs are off */
2554
2555                 if (txq->xmit_lock_owner != cpu) {
2556
2557                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2558                                 goto recursion_alert;
2559
2560                         HARD_TX_LOCK(dev, txq, cpu);
2561
2562                         if (!netif_xmit_stopped(txq)) {
2563                                 __this_cpu_inc(xmit_recursion);
2564                                 rc = dev_hard_start_xmit(skb, dev, txq);
2565                                 __this_cpu_dec(xmit_recursion);
2566                                 if (dev_xmit_complete(rc)) {
2567                                         HARD_TX_UNLOCK(dev, txq);
2568                                         goto out;
2569                                 }
2570                         }
2571                         HARD_TX_UNLOCK(dev, txq);
2572                         if (net_ratelimit())
2573                                 pr_crit("Virtual device %s asks to queue packet!\n",
2574                                         dev->name);
2575                 } else {
2576                         /* Recursion is detected! It is possible,
2577                          * unfortunately
2578                          */
2579 recursion_alert:
2580                         if (net_ratelimit())
2581                                 pr_crit("Dead loop on virtual device %s, fix it urgently!\n",
2582                                         dev->name);
2583                 }
2584         }
2585
2586         rc = -ENETDOWN;
2587         rcu_read_unlock_bh();
2588
2589         kfree_skb(skb);
2590         return rc;
2591 out:
2592         rcu_read_unlock_bh();
2593         return rc;
2594 }
2595 EXPORT_SYMBOL(dev_queue_xmit);
2596
2597
2598 /*=======================================================================
2599                         Receiver routines
2600   =======================================================================*/
2601
2602 int netdev_max_backlog __read_mostly = 1000;    /* input_pkt_queue length threshold checked by enqueue_to_backlog(); longer queues drop */
2603 int netdev_tstamp_prequeue __read_mostly = 1;   /* condition handed to net_timestamp_check(): as-is in netif_rx()/netif_receive_skb(), negated in __netif_receive_skb() */
2604 int netdev_budget __read_mostly = 300;          /* NOTE(review): not referenced in this section -- confirm its consumer before changing */
2605 int weight_p __read_mostly = 64;            /* old backlog weight */
2606
2607 /* Called with irq disabled */
2608 static inline void ____napi_schedule(struct softnet_data *sd,
2609                                      struct napi_struct *napi)
2610 {
2611         list_add_tail(&napi->poll_list, &sd->poll_list);
2612         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2613 }
2614
2615 /*
2616  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2617  * and src/dst port numbers.  Sets rxhash in skb to non-zero hash value
2618  * on success, zero indicates no valid hash.  Also, sets l4_rxhash in skb
2619  * if hash is a canonical 4-tuple hash over transport ports.
2620  */
2621 void __skb_get_rxhash(struct sk_buff *skb)
2622 {
2623         struct flow_keys keys;
2624         u32 hash;
2625
2626         if (!skb_flow_dissect(skb, &keys))
2627                 return;
2628
2629         if (keys.ports) {
2630                 if ((__force u16)keys.port16[1] < (__force u16)keys.port16[0])
2631                         swap(keys.port16[0], keys.port16[1]);
2632                 skb->l4_rxhash = 1;
2633         }
2634
2635         /* get a consistent hash (same value on both flow directions) */
2636         if ((__force u32)keys.dst < (__force u32)keys.src)
2637                 swap(keys.dst, keys.src);
2638
2639         hash = jhash_3words((__force u32)keys.dst,
2640                             (__force u32)keys.src,
2641                             (__force u32)keys.ports, hashrnd);
2642         if (!hash)
2643                 hash = 1;
2644
2645         skb->rxhash = hash;
2646 }
2647 EXPORT_SYMBOL(__skb_get_rxhash);
2648
2649 #ifdef CONFIG_RPS
2650
2651 /* One global table that all flow-based protocols share; read under RCU by get_rps_cpu(). */
2652 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2653 EXPORT_SYMBOL(rps_sock_flow_table);
2654 /* Static key tested via static_key_false() in netif_rx() and netif_receive_skb() to gate the RPS steering path. */
2655 struct static_key rps_needed __read_mostly;
2656
2657 static struct rps_dev_flow *
2658 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2659             struct rps_dev_flow *rflow, u16 next_cpu)
2660 {
2661         if (next_cpu != RPS_NO_CPU) {
2662 #ifdef CONFIG_RFS_ACCEL
2663                 struct netdev_rx_queue *rxqueue;
2664                 struct rps_dev_flow_table *flow_table;
2665                 struct rps_dev_flow *old_rflow;
2666                 u32 flow_id;
2667                 u16 rxq_index;
2668                 int rc;
2669
2670                 /* Should we steer this flow to a different hardware queue? */
2671                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2672                     !(dev->features & NETIF_F_NTUPLE))
2673                         goto out;
2674                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2675                 if (rxq_index == skb_get_rx_queue(skb))
2676                         goto out;
2677
2678                 rxqueue = dev->_rx + rxq_index;
2679                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2680                 if (!flow_table)
2681                         goto out;
2682                 flow_id = skb->rxhash & flow_table->mask;
2683                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2684                                                         rxq_index, flow_id);
2685                 if (rc < 0)
2686                         goto out;
2687                 old_rflow = rflow;
2688                 rflow = &flow_table->flows[flow_id];
2689                 rflow->filter = rc;
2690                 if (old_rflow->filter == rflow->filter)
2691                         old_rflow->filter = RPS_NO_FILTER;
2692         out:
2693 #endif
2694                 rflow->last_qtail =
2695                         per_cpu(softnet_data, next_cpu).input_queue_head;
2696         }
2697
2698         rflow->cpu = next_cpu;
2699         return rflow;
2700 }
2701
2702 /*
2703  * get_rps_cpu is called from netif_receive_skb and returns the target
2704  * CPU from the RPS map of the receiving queue for a given skb.
2705  * rcu_read_lock must be held on entry.
2706  */
2707 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2708                        struct rps_dev_flow **rflowp)
2709 {
2710         struct netdev_rx_queue *rxqueue;
2711         struct rps_map *map;
2712         struct rps_dev_flow_table *flow_table;
2713         struct rps_sock_flow_table *sock_flow_table;
2714         int cpu = -1;
2715         u16 tcpu;
2716
2717         if (skb_rx_queue_recorded(skb)) {
2718                 u16 index = skb_get_rx_queue(skb);
2719                 if (unlikely(index >= dev->real_num_rx_queues)) {
2720                         WARN_ONCE(dev->real_num_rx_queues > 1,
2721                                   "%s received packet on queue %u, but number "
2722                                   "of RX queues is %u\n",
2723                                   dev->name, index, dev->real_num_rx_queues);
2724                         goto done;
2725                 }
2726                 rxqueue = dev->_rx + index;
2727         } else
2728                 rxqueue = dev->_rx;
2729
2730         map = rcu_dereference(rxqueue->rps_map);
2731         if (map) {
2732                 if (map->len == 1 &&
2733                     !rcu_access_pointer(rxqueue->rps_flow_table)) {
2734                         tcpu = map->cpus[0];
2735                         if (cpu_online(tcpu))
2736                                 cpu = tcpu;
2737                         goto done;
2738                 }
2739         } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2740                 goto done;
2741         }
2742
2743         skb_reset_network_header(skb);
2744         if (!skb_get_rxhash(skb))
2745                 goto done;
2746
2747         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2748         sock_flow_table = rcu_dereference(rps_sock_flow_table);
2749         if (flow_table && sock_flow_table) {
2750                 u16 next_cpu;
2751                 struct rps_dev_flow *rflow;
2752
2753                 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2754                 tcpu = rflow->cpu;
2755
2756                 next_cpu = sock_flow_table->ents[skb->rxhash &
2757                     sock_flow_table->mask];
2758
2759                 /*
2760                  * If the desired CPU (where last recvmsg was done) is
2761                  * different from current CPU (one in the rx-queue flow
2762                  * table entry), switch if one of the following holds:
2763                  *   - Current CPU is unset (equal to RPS_NO_CPU).
2764                  *   - Current CPU is offline.
2765                  *   - The current CPU's queue tail has advanced beyond the
2766                  *     last packet that was enqueued using this table entry.
2767                  *     This guarantees that all previous packets for the flow
2768                  *     have been dequeued, thus preserving in order delivery.
2769                  */
2770                 if (unlikely(tcpu != next_cpu) &&
2771                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2772                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2773                       rflow->last_qtail)) >= 0))
2774                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2775
2776                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2777                         *rflowp = rflow;
2778                         cpu = tcpu;
2779                         goto done;
2780                 }
2781         }
2782
2783         if (map) {
2784                 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2785
2786                 if (cpu_online(tcpu)) {
2787                         cpu = tcpu;
2788                         goto done;
2789                 }
2790         }
2791
2792 done:
2793         return cpu;
2794 }
2795
2796 #ifdef CONFIG_RFS_ACCEL
2797
2798 /**
2799  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2800  * @dev: Device on which the filter was set
2801  * @rxq_index: RX queue index
2802  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2803  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2804  *
2805  * Drivers that implement ndo_rx_flow_steer() should periodically call
2806  * this function for each installed filter and remove the filters for
2807  * which it returns %true.
2808  */
2809 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2810                          u32 flow_id, u16 filter_id)
2811 {
2812         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2813         struct rps_dev_flow_table *flow_table;
2814         struct rps_dev_flow *rflow;
2815         bool expire = true;
2816         int cpu;
2817
2818         rcu_read_lock();
2819         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2820         if (flow_table && flow_id <= flow_table->mask) {
2821                 rflow = &flow_table->flows[flow_id];
2822                 cpu = ACCESS_ONCE(rflow->cpu);
2823                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2824                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2825                            rflow->last_qtail) <
2826                      (int)(10 * flow_table->mask)))
2827                         expire = false;
2828         }
2829         rcu_read_unlock();
2830         return expire;
2831 }
2832 EXPORT_SYMBOL(rps_may_expire_flow);
2833
2834 #endif /* CONFIG_RFS_ACCEL */
2835
2836 /* Called from hardirq (IPI) context */
2837 static void rps_trigger_softirq(void *data)
2838 {
2839         struct softnet_data *sd = data;
2840
2841         ____napi_schedule(sd, &sd->backlog);
2842         sd->received_rps++;
2843 }
2844
2845 #endif /* CONFIG_RPS */
2846
/*
 * Decide whether @sd belongs to another CPU.
 *
 * If it does, push it on the head of this CPU's rps_ipi_list, raise
 * NET_RX_SOFTIRQ locally and return 1.  If @sd is this CPU's own
 * structure, or CONFIG_RPS is not set, return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        struct softnet_data *local = &__get_cpu_var(softnet_data);

        if (sd == local)
                return 0;

        sd->rps_ipi_next = local->rps_ipi_list;
        local->rps_ipi_list = sd;

        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
        return 1;
#else
        return 0;
#endif /* CONFIG_RPS */
}
2867
2868 /*
2869  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2870  * queue (may be a remote CPU queue).
2871  */
2872 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2873                               unsigned int *qtail)
2874 {
2875         struct softnet_data *sd;
2876         unsigned long flags;
2877
2878         sd = &per_cpu(softnet_data, cpu);
2879
2880         local_irq_save(flags);
2881
2882         rps_lock(sd);
2883         if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2884                 if (skb_queue_len(&sd->input_pkt_queue)) {
2885 enqueue:
2886                         __skb_queue_tail(&sd->input_pkt_queue, skb);
2887                         input_queue_tail_incr_save(sd, qtail);
2888                         rps_unlock(sd);
2889                         local_irq_restore(flags);
2890                         return NET_RX_SUCCESS;
2891                 }
2892
2893                 /* Schedule NAPI for backlog device
2894                  * We can use non atomic operation since we own the queue lock
2895                  */
2896                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2897                         if (!rps_ipi_queued(sd))
2898                                 ____napi_schedule(sd, &sd->backlog);
2899                 }
2900                 goto enqueue;
2901         }
2902
2903         sd->dropped++;
2904         rps_unlock(sd);
2905
2906         local_irq_restore(flags);
2907
2908         atomic_long_inc(&skb->dev->rx_dropped);
2909         kfree_skb(skb);
2910         return NET_RX_DROP;
2911 }
2912
2913 /**
2914  *      netif_rx        -       post buffer to the network code
2915  *      @skb: buffer to post
2916  *
2917  *      This function receives a packet from a device driver and queues it for
2918  *      the upper (protocol) levels to process.  It always succeeds. The buffer
2919  *      may be dropped during processing for congestion control or by the
2920  *      protocol layers.
2921  *
2922  *      return values:
2923  *      NET_RX_SUCCESS  (no congestion)
2924  *      NET_RX_DROP     (packet was dropped)
2925  *
2926  */
2927
2928 int netif_rx(struct sk_buff *skb)
2929 {
2930         int ret;
2931
2932         /* if netpoll wants it, pretend we never saw it */
2933         if (netpoll_rx(skb))
2934                 return NET_RX_DROP;
2935
2936         net_timestamp_check(netdev_tstamp_prequeue, skb);
2937
2938         trace_netif_rx(skb);
2939 #ifdef CONFIG_RPS
2940         if (static_key_false(&rps_needed)) {
2941                 struct rps_dev_flow voidflow, *rflow = &voidflow;
2942                 int cpu;
2943
2944                 preempt_disable();
2945                 rcu_read_lock();
2946
2947                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
2948                 if (cpu < 0)
2949                         cpu = smp_processor_id();
2950
2951                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2952
2953                 rcu_read_unlock();
2954                 preempt_enable();
2955         } else
2956 #endif
2957         {
2958                 unsigned int qtail;
2959                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2960                 put_cpu();
2961         }
2962         return ret;
2963 }
2964 EXPORT_SYMBOL(netif_rx);
2965
/*
 * Same as netif_rx(), but with preemption disabled around the call and
 * any softirq left pending afterwards run immediately.  Returns the
 * netif_rx() result.
 * NOTE(review): presumably meant for callers outside interrupt
 * context -- confirm against the callers.
 */
int netif_rx_ni(struct sk_buff *skb)
{
        int rc;

        preempt_disable();

        rc = netif_rx(skb);
        if (local_softirq_pending())
                do_softirq();

        preempt_enable();
        return rc;
}
EXPORT_SYMBOL(netif_rx_ni);
2979
2980 static void net_tx_action(struct softirq_action *h)
2981 {
2982         struct softnet_data *sd = &__get_cpu_var(softnet_data);
2983
2984         if (sd->completion_queue) {
2985                 struct sk_buff *clist;
2986
2987                 local_irq_disable();
2988                 clist = sd->completion_queue;
2989                 sd->completion_queue = NULL;
2990                 local_irq_enable();
2991
2992                 while (clist) {
2993                         struct sk_buff *skb = clist;
2994                         clist = clist->next;
2995
2996                         WARN_ON(atomic_read(&skb->users));
2997                         trace_kfree_skb(skb, net_tx_action);
2998                         __kfree_skb(skb);
2999                 }
3000         }
3001
3002         if (sd->output_queue) {
3003                 struct Qdisc *head;
3004
3005                 local_irq_disable();
3006                 head = sd->output_queue;
3007                 sd->output_queue = NULL;
3008                 sd->output_queue_tailp = &sd->output_queue;
3009                 local_irq_enable();
3010
3011                 while (head) {
3012                         struct Qdisc *q = head;
3013                         spinlock_t *root_lock;
3014
3015                         head = head->next_sched;
3016
3017                         root_lock = qdisc_lock(q);
3018                         if (spin_trylock(root_lock)) {
3019                                 smp_mb__before_clear_bit();
3020                                 clear_bit(__QDISC_STATE_SCHED,
3021                                           &q->state);
3022                                 qdisc_run(q);
3023                                 spin_unlock(root_lock);
3024                         } else {
3025                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3026                                               &q->state)) {
3027                                         __netif_reschedule(q);
3028                                 } else {
3029                                         smp_mb__before_clear_bit();
3030                                         clear_bit(__QDISC_STATE_SCHED,
3031                                                   &q->state);
3032                                 }
3033                         }
3034                 }
3035         }
3036 }
3037
3038 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3039     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3040 /* This hook is defined here for ATM LANE. It exists only when both bridging and ATM LANE are configured (built-in or as modules); no initializer is given here, so whoever provides the test must assign it. */
3041 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3042                              unsigned char *addr) __read_mostly;
3043 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3044 #endif
3045
3046 #ifdef CONFIG_NET_CLS_ACT
3047 /* TODO: Maybe we should just force sch_ingress to be compiled in
3048  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3049  * a compare and 2 stores extra right now if we dont have it on
3050  * but have CONFIG_NET_CLS_ACT
3051  * NOTE: This doesn't stop any functionality; if you dont have
3052  * the ingress scheduler, you just can't add policies on ingress.
3053  *
3054  */
3055 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3056 {
3057         struct net_device *dev = skb->dev;
3058         u32 ttl = G_TC_RTTL(skb->tc_verd);
3059         int result = TC_ACT_OK;
3060         struct Qdisc *q;
3061
3062         if (unlikely(MAX_RED_LOOP < ttl++)) {
3063                 if (net_ratelimit())
3064                         pr_warn("Redir loop detected Dropping packet (%d->%d)\n",
3065                                 skb->skb_iif, dev->ifindex);
3066                 return TC_ACT_SHOT;
3067         }
3068
3069         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3070         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3071
3072         q = rxq->qdisc;
3073         if (q != &noop_qdisc) {
3074                 spin_lock(qdisc_lock(q));
3075                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3076                         result = qdisc_enqueue_root(skb, q);
3077                 spin_unlock(qdisc_lock(q));
3078         }
3079
3080         return result;
3081 }
3082
3083 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3084                                          struct packet_type **pt_prev,
3085                                          int *ret, struct net_device *orig_dev)
3086 {
3087         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3088
3089         if (!rxq || rxq->qdisc == &noop_qdisc)
3090                 goto out;
3091
3092         if (*pt_prev) {
3093                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3094                 *pt_prev = NULL;
3095         }
3096
3097         switch (ing_filter(skb, rxq)) {
3098         case TC_ACT_SHOT:
3099         case TC_ACT_STOLEN:
3100                 kfree_skb(skb);
3101                 return NULL;
3102         }
3103
3104 out:
3105         skb->tc_verd = 0;
3106         return skb;
3107 }
3108 #endif
3109
3110 /**
3111  *      netdev_rx_handler_register - register receive handler
3112  *      @dev: device to register a handler for
3113  *      @rx_handler: receive handler to register
3114  *      @rx_handler_data: data pointer that is used by rx handler
3115  *
3116  *      Register a receive hander for a device. This handler will then be
3117  *      called from __netif_receive_skb. A negative errno code is returned
3118  *      on a failure.
3119  *
3120  *      The caller must hold the rtnl_mutex.
3121  *
3122  *      For a general description of rx_handler, see enum rx_handler_result.
3123  */
3124 int netdev_rx_handler_register(struct net_device *dev,
3125                                rx_handler_func_t *rx_handler,
3126                                void *rx_handler_data)
3127 {
3128         ASSERT_RTNL();
3129
3130         if (dev->rx_handler)
3131                 return -EBUSY;
3132
3133         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3134         rcu_assign_pointer(dev->rx_handler, rx_handler);
3135
3136         return 0;
3137 }
3138 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3139
3140 /**
3141  *      netdev_rx_handler_unregister - unregister receive handler
3142  *      @dev: device to unregister a handler from
3143  *
3144  *      Unregister a receive hander from a device.
3145  *
3146  *      The caller must hold the rtnl_mutex.
3147  */
3148 void netdev_rx_handler_unregister(struct net_device *dev)
3149 {
3150
3151         ASSERT_RTNL();
3152         RCU_INIT_POINTER(dev->rx_handler, NULL);
3153         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3154 }
3155 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3156
3157 static int __netif_receive_skb(struct sk_buff *skb)
3158 {
3159         struct packet_type *ptype, *pt_prev;
3160         rx_handler_func_t *rx_handler;
3161         struct net_device *orig_dev;
3162         struct net_device *null_or_dev;
3163         bool deliver_exact = false;
3164         int ret = NET_RX_DROP;
3165         __be16 type;
3166
3167         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3168
3169         trace_netif_receive_skb(skb);
3170
3171         /* if we've gotten here through NAPI, check netpoll */
3172         if (netpoll_receive_skb(skb))
3173                 return NET_RX_DROP;
3174
3175         if (!skb->skb_iif)
3176                 skb->skb_iif = skb->dev->ifindex;
3177         orig_dev = skb->dev;
3178
3179         skb_reset_network_header(skb);
3180         skb_reset_transport_header(skb);
3181         skb_reset_mac_len(skb);
3182
3183         pt_prev = NULL;
3184
3185         rcu_read_lock();
3186
3187 another_round:
3188
3189         __this_cpu_inc(softnet_data.processed);
3190
3191         if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3192                 skb = vlan_untag(skb);
3193                 if (unlikely(!skb))
3194                         goto out;
3195         }
3196
3197 #ifdef CONFIG_NET_CLS_ACT
3198         if (skb->tc_verd & TC_NCLS) {
3199                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3200                 goto ncls;
3201         }
3202 #endif
3203
3204         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3205                 if (!ptype->dev || ptype->dev == skb->dev) {
3206                         if (pt_prev)
3207                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3208                         pt_prev = ptype;
3209                 }
3210         }
3211
3212 #ifdef CONFIG_NET_CLS_ACT
3213         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3214         if (!skb)
3215                 goto out;
3216 ncls:
3217 #endif
3218
3219         rx_handler = rcu_dereference(skb->dev->rx_handler);
3220         if (vlan_tx_tag_present(skb)) {
3221                 if (pt_prev) {
3222                         ret = deliver_skb(skb, pt_prev, orig_dev);
3223                         pt_prev = NULL;
3224                 }
3225                 if (vlan_do_receive(&skb, !rx_handler))
3226                         goto another_round;
3227                 else if (unlikely(!skb))
3228                         goto out;
3229         }
3230
3231         if (rx_handler) {
3232                 if (pt_prev) {
3233                         ret = deliver_skb(skb, pt_prev, orig_dev);
3234                         pt_prev = NULL;
3235                 }
3236                 switch (rx_handler(&skb)) {
3237                 case RX_HANDLER_CONSUMED:
3238                         goto out;
3239                 case RX_HANDLER_ANOTHER:
3240                         goto another_round;
3241                 case RX_HANDLER_EXACT:
3242                         deliver_exact = true;
3243                 case RX_HANDLER_PASS:
3244                         break;
3245                 default:
3246                         BUG();
3247                 }
3248         }
3249
3250         /* deliver only exact match when indicated */
3251         null_or_dev = deliver_exact ? skb->dev : NULL;
3252
3253         type = skb->protocol;
3254         list_for_each_entry_rcu(ptype,
3255                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3256                 if (ptype->type == type &&
3257                     (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3258                      ptype->dev == orig_dev)) {
3259                         if (pt_prev)
3260                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3261                         pt_prev = ptype;
3262                 }
3263         }
3264
3265         if (pt_prev) {
3266                 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3267         } else {
3268                 atomic_long_inc(&skb->dev->rx_dropped);
3269                 kfree_skb(skb);
3270                 /* Jamal, now you will not able to escape explaining
3271                  * me how you were going to use this. :-)
3272                  */
3273                 ret = NET_RX_DROP;
3274         }
3275
3276 out:
3277         rcu_read_unlock();
3278         return ret;
3279 }
3280
3281 /**
3282  *      netif_receive_skb - process receive buffer from network
3283  *      @skb: buffer to process
3284  *
3285  *      netif_receive_skb() is the main receive data processing function.
3286  *      It always succeeds. The buffer may be dropped during processing
3287  *      for congestion control or by the protocol layers.
3288  *
3289  *      This function may only be called from softirq context and interrupts
3290  *      should be enabled.
3291  *
3292  *      Return values (usually ignored):
3293  *      NET_RX_SUCCESS: no congestion
3294  *      NET_RX_DROP: packet was dropped
3295  */
3296 int netif_receive_skb(struct sk_buff *skb)
3297 {
3298         net_timestamp_check(netdev_tstamp_prequeue, skb);
3299
3300         if (skb_defer_rx_timestamp(skb))
3301                 return NET_RX_SUCCESS;
3302
3303 #ifdef CONFIG_RPS
3304         if (static_key_false(&rps_needed)) {
3305                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3306                 int cpu, ret;
3307
3308                 rcu_read_lock();
3309
3310                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3311
3312                 if (cpu >= 0) {
3313                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3314                         rcu_read_unlock();
3315                         return ret;
3316                 }
3317                 rcu_read_unlock();
3318         }
3319 #endif
3320         return __netif_receive_skb(skb);
3321 }
3322 EXPORT_SYMBOL(netif_receive_skb);
3323
3324 /* Network device is going away, flush any packets still pending
3325  * Called with irqs disabled.
3326  */
3327 static void flush_backlog(void *arg)
3328 {
3329         struct net_device *dev = arg;
3330         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3331         struct sk_buff *skb, *tmp;
3332
3333         rps_lock(sd);
3334         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3335                 if (skb->dev == dev) {
3336                         __skb_unlink(skb, &sd->input_pkt_queue);
3337                         kfree_skb(skb);
3338                         input_queue_head_incr(sd);
3339                 }
3340         }
3341         rps_unlock(sd);
3342
3343         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3344                 if (skb->dev == dev) {
3345                         __skb_unlink(skb, &sd->process_queue);
3346                         kfree_skb(skb);
3347                         input_queue_head_incr(sd);
3348                 }
3349         }
3350 }
3351
3352 static int napi_gro_complete(struct sk_buff *skb)
3353 {
3354         struct packet_type *ptype;
3355         __be16 type = skb->protocol;
3356         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3357         int err = -ENOENT;
3358
3359         if (NAPI_GRO_CB(skb)->count == 1) {
3360                 skb_shinfo(skb)->gso_size = 0;
3361                 goto out;
3362         }
3363
3364         rcu_read_lock();
3365         list_for_each_entry_rcu(ptype, head, list) {
3366                 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3367                         continue;
3368
3369                 err = ptype->gro_complete(skb);
3370                 break;
3371         }
3372         rcu_read_unlock();
3373
3374         if (err) {
3375                 WARN_ON(&ptype->list == head);
3376                 kfree_skb(skb);
3377                 return NET_RX_SUCCESS;
3378         }
3379
3380 out:
3381         return netif_receive_skb(skb);
3382 }
3383
3384 inline void napi_gro_flush(struct napi_struct *napi)
3385 {
3386         struct sk_buff *skb, *next;
3387
3388         for (skb = napi->gro_list; skb; skb = next) {
3389                 next = skb->next;
3390                 skb->next = NULL;
3391                 napi_gro_complete(skb);
3392         }
3393
3394         napi->gro_count = 0;
3395         napi->gro_list = NULL;
3396 }
3397 EXPORT_SYMBOL(napi_gro_flush);
3398
/*
 * Core GRO receive step for one skb.
 *
 * Looks up the non device-bound packet_type handler for skb->protocol and
 * lets its ->gro_receive() callback try to merge @skb with the skbs held
 * on @napi->gro_list.
 *
 * Returns:
 *   GRO_NORMAL      - GRO not applicable or not possible for this skb
 *   GRO_HELD        - @skb was added to @napi->gro_list
 *   GRO_MERGED      - the handler flagged @skb as belonging to a held flow
 *   GRO_MERGED_FREE - as GRO_MERGED, and the handler set the "free" flag
 */
enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
        struct sk_buff **pp = NULL;
        struct packet_type *ptype;
        __be16 type = skb->protocol;
        struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
        int same_flow;
        int mac_len;
        enum gro_result ret;

        /* GRO disabled on the device, or netpoll is receiving on it. */
        if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
                goto normal;

        /* Already GSO, or carrying a frag list: do not aggregate. */
        if (skb_is_gso(skb) || skb_has_frag_list(skb))
                goto normal;

        rcu_read_lock();
        list_for_each_entry_rcu(ptype, head, list) {
                if (ptype->type != type || ptype->dev || !ptype->gro_receive)
                        continue;

                /* Reset per-skb GRO state before calling the handler. */
                skb_set_network_header(skb, skb_gro_offset(skb));
                mac_len = skb->network_header - skb->mac_header;
                skb->mac_len = mac_len;
                NAPI_GRO_CB(skb)->same_flow = 0;
                NAPI_GRO_CB(skb)->flush = 0;
                NAPI_GRO_CB(skb)->free = 0;

                pp = ptype->gro_receive(&napi->gro_list, skb);
                break;
        }
        rcu_read_unlock();

        /* The walk reached the list head again: no handler was found. */
        if (&ptype->list == head)
                goto normal;

        same_flow = NAPI_GRO_CB(skb)->same_flow;
        ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;

        /*
         * A non-NULL pp designates the list slot of a held skb that has to
         * be completed now: unlink it from the GRO list and deliver it.
         */
        if (pp) {
                struct sk_buff *nskb = *pp;

                *pp = nskb->next;
                nskb->next = NULL;
                napi_gro_complete(nskb);
                napi->gro_count--;
        }

        if (same_flow)
                goto ok;

        /* Handler asked for a flush, or the GRO list is already full. */
        if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
                goto normal;

        /* Start a new held entry at the front of the GRO list. */
        napi->gro_count++;
        NAPI_GRO_CB(skb)->count = 1;
        skb_shinfo(skb)->gso_size = skb_gro_len(skb);
        skb->next = napi->gro_list;
        napi->gro_list = skb;
        ret = GRO_HELD;

pull:
        /*
         * The GRO offset lies beyond the linear area: copy the missing
         * bytes from frag0 to the tail of the linear area and trim them
         * off the first page fragment.
         */
        if (skb_headlen(skb) < skb_gro_offset(skb)) {
                int grow = skb_gro_offset(skb) - skb_headlen(skb);

                BUG_ON(skb->end - skb->tail < grow);

                memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);

                skb->tail += grow;
                skb->data_len -= grow;

                skb_shinfo(skb)->frags[0].page_offset += grow;
                skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);

                /* First fragment fully consumed: drop it and shift the rest down. */
                if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
                        skb_frag_unref(skb, 0);
                        memmove(skb_shinfo(skb)->frags,
                                skb_shinfo(skb)->frags + 1,
                                --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
                }
        }

ok:
        return ret;

normal:
        ret = GRO_NORMAL;
        goto pull;
}
EXPORT_SYMBOL(dev_gro_receive);
3490
3491 static inline gro_result_t
3492 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3493 {
3494         struct sk_buff *p;
3495         unsigned int maclen = skb->dev->hard_header_len;
3496
3497         for (p = napi->gro_list; p; p = p->next) {
3498                 unsigned long diffs;
3499
3500                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3501                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3502                 if (maclen == ETH_HLEN)
3503                         diffs |= compare_ether_header(skb_mac_header(p),
3504                                                       skb_gro_mac_header(skb));
3505                 else if (!diffs)
3506                         diffs = memcmp(skb_mac_header(p),
3507                                        skb_gro_mac_header(skb),
3508                                        maclen);
3509                 NAPI_GRO_CB(p)->same_flow = !diffs;
3510                 NAPI_GRO_CB(p)->flush = 0;
3511         }
3512
3513         return dev_gro_receive(napi, skb);
3514 }
3515
3516 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3517 {
3518         switch (ret) {
3519         case GRO_NORMAL:
3520                 if (netif_receive_skb(skb))
3521                         ret = GRO_DROP;
3522                 break;
3523
3524         case GRO_DROP:
3525         case GRO_MERGED_FREE:
3526                 kfree_skb(skb);
3527                 break;
3528
3529         case GRO_HELD:
3530         case GRO_MERGED:
3531                 break;
3532         }
3533
3534         return ret;
3535 }
3536 EXPORT_SYMBOL(napi_skb_finish);
3537
3538 void skb_gro_reset_offset(struct sk_buff *skb)
3539 {
3540         NAPI_GRO_CB(skb)->data_offset = 0;
3541         NAPI_GRO_CB(skb)->frag0 = NULL;
3542         NAPI_GRO_CB(skb)->frag0_len = 0;
3543
3544         if (skb->mac_header == skb->tail &&
3545             !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) {
3546                 NAPI_GRO_CB(skb)->frag0 =
3547                         skb_frag_address(&skb_shinfo(skb)->frags[0]);
3548                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(&skb_shinfo(skb)->frags[0]);
3549         }
3550 }
3551 EXPORT_SYMBOL(skb_gro_reset_offset);
3552
3553 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3554 {
3555         skb_gro_reset_offset(skb);
3556
3557         return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3558 }
3559 EXPORT_SYMBOL(napi_gro_receive);
3560
3561 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3562 {
3563         __skb_pull(skb, skb_headlen(skb));
3564         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3565         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3566         skb->vlan_tci = 0;
3567         skb->dev = napi->dev;
3568         skb->skb_iif = 0;
3569
3570         napi->skb = skb;
3571 }
3572
3573 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3574 {
3575         struct sk_buff *skb = napi->skb;
3576
3577         if (!skb) {
3578                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3579                 if (skb)
3580                         napi->skb = skb;
3581         }
3582         return skb;
3583 }
3584 EXPORT_SYMBOL(napi_get_frags);
3585
3586 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3587                                gro_result_t ret)
3588 {
3589         switch (ret) {
3590         case GRO_NORMAL:
3591         case GRO_HELD:
3592                 skb->protocol = eth_type_trans(skb, skb->dev);
3593
3594                 if (ret == GRO_HELD)
3595                         skb_gro_pull(skb, -ETH_HLEN);
3596                 else if (netif_receive_skb(skb))
3597                         ret = GRO_DROP;
3598                 break;
3599
3600         case GRO_DROP:
3601         case GRO_MERGED_FREE:
3602                 napi_reuse_skb(napi, skb);
3603                 break;
3604
3605         case GRO_MERGED:
3606                 break;
3607         }
3608
3609         return ret;
3610 }
3611 EXPORT_SYMBOL(napi_frags_finish);
3612
3613 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3614 {
3615         struct sk_buff *skb = napi->skb;
3616         struct ethhdr *eth;
3617         unsigned int hlen;
3618         unsigned int off;
3619
3620         napi->skb = NULL;
3621
3622         skb_reset_mac_header(skb);
3623         skb_gro_reset_offset(skb);
3624
3625         off = skb_gro_offset(skb);
3626         hlen = off + sizeof(*eth);
3627         eth = skb_gro_header_fast(skb, off);
3628         if (skb_gro_header_hard(skb, hlen)) {
3629                 eth = skb_gro_header_slow(skb, hlen, off);
3630                 if (unlikely(!eth)) {
3631                         napi_reuse_skb(napi, skb);
3632                         skb = NULL;
3633                         goto out;
3634                 }
3635         }
3636
3637         skb_gro_pull(skb, sizeof(*eth));
3638
3639         /*
3640          * This works because the only protocols we care about don't require
3641          * special handling.  We'll fix it up properly at the end.
3642          */
3643         skb->protocol = eth->h_proto;
3644
3645 out:
3646         return skb;
3647 }
3648 EXPORT_SYMBOL(napi_frags_skb);
3649
3650 gro_result_t napi_gro_frags(struct napi_struct *napi)
3651 {
3652         struct sk_buff *skb = napi_frags_skb(napi);
3653
3654         if (!skb)
3655                 return GRO_DROP;
3656
3657         return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3658 }
3659 EXPORT_SYMBOL(napi_gro_frags);
3660
/*
 * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
 * Note: called with local irqs disabled, but exits with local irqs enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        struct softnet_data *target = sd->rps_ipi_list;

        /* Detach the pending list while irqs are still off. */
        if (target)
                sd->rps_ipi_list = NULL;
#endif

        local_irq_enable();

#ifdef CONFIG_RPS
        /* Send pending IPIs to kick RPS processing on remote cpus. */
        while (target) {
                struct softnet_data *following = target->rps_ipi_next;

                if (cpu_online(target->cpu))
                        __smp_call_function_single(target->cpu,
                                                   &target->csd, 0);
                target = following;
        }
#endif
}
3688
/*
 * NAPI poll handler for the per-cpu backlog.
 *
 * Delivers skbs from sd->process_queue through __netif_receive_skb(),
 * refilling that queue from sd->input_pkt_queue under rps_lock() whenever
 * it runs dry.  Processes at most @quota skbs and returns the number
 * processed.  When the input queue holds fewer skbs than the remaining
 * quota, the backlog napi is taken off the poll list.
 */
static int process_backlog(struct napi_struct *napi, int quota)
{
        int work = 0;
        struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);

#ifdef CONFIG_RPS
        /* Check if we have pending IPIs; it is better to send them now
         * than to wait for the end of net_rx_action().
         */
        if (sd->rps_ipi_list) {
                local_irq_disable();
                net_rps_action_and_irq_enable(sd);
        }
#endif
        napi->weight = weight_p;
        local_irq_disable();
        while (work < quota) {
                struct sk_buff *skb;
                unsigned int qlen;

                /* Irqs are enabled only around the actual delivery. */
                while ((skb = __skb_dequeue(&sd->process_queue))) {
                        local_irq_enable();
                        __netif_receive_skb(skb);
                        local_irq_disable();
                        input_queue_head_incr(sd);
                        if (++work >= quota) {
                                local_irq_enable();
                                return work;
                        }
                }

                /* process_queue is empty: move over everything queued as input. */
                rps_lock(sd);
                qlen = skb_queue_len(&sd->input_pkt_queue);
                if (qlen)
                        skb_queue_splice_tail_init(&sd->input_pkt_queue,
                                                   &sd->process_queue);

                if (qlen < quota - work) {
                        /*
                         * Inline a custom version of __napi_complete().
                         * Only the current cpu owns and manipulates this napi,
                         * and NAPI_STATE_SCHED is the only possible flag set on
                         * backlog.  We can use a plain write instead of
                         * clear_bit(), and we don't need an smp_mb() memory
                         * barrier.
                         */
                        list_del(&napi->poll_list);
                        napi->state = 0;

                        /* Shrink the quota so the loop ends after these skbs. */
                        quota = work + qlen;
                }
                rps_unlock(sd);
        }
        local_irq_enable();

        return work;
}
3745
3746 /**
3747  * __napi_schedule - schedule for receive
3748  * @n: entry to schedule
3749  *
3750  * The entry's receive function will be scheduled to run
3751  */
3752 void __napi_schedule(struct napi_struct *n)
3753 {
3754         unsigned long flags;
3755
3756         local_irq_save(flags);
3757         ____napi_schedule(&__get_cpu_var(softnet_data), n);
3758         local_irq_restore(flags);
3759 }
3760 EXPORT_SYMBOL(__napi_schedule);
3761
3762 void __napi_complete(struct napi_struct *n)
3763 {
3764         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3765         BUG_ON(n->gro_list);
3766
3767         list_del(&n->poll_list);
3768         smp_mb__before_clear_bit();
3769         clear_bit(NAPI_STATE_SCHED, &n->state);
3770 }
3771 EXPORT_SYMBOL(__napi_complete);
3772
3773 void napi_complete(struct napi_struct *n)
3774 {
3775         unsigned long flags;
3776
3777         /*
3778          * don't let napi dequeue from the cpu poll list
3779          * just in case its running on a different cpu
3780          */
3781         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3782                 return;
3783
3784         napi_gro_flush(n);
3785         local_irq_save(flags);
3786         __napi_complete(n);
3787         local_irq_restore(flags);
3788 }
3789 EXPORT_SYMBOL(napi_complete);
3790
3791 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3792                     int (*poll)(struct napi_struct *, int), int weight)
3793 {
3794         INIT_LIST_HEAD(&napi->poll_list);
3795         napi->gro_count = 0;
3796         napi->gro_list = NULL;
3797         napi->skb = NULL;
3798         napi->poll = poll;
3799         napi->weight = weight;
3800         list_add(&napi->dev_list, &dev->napi_list);
3801         napi->dev = dev;
3802 #ifdef CONFIG_NETPOLL
3803         spin_lock_init(&napi->poll_lock);
3804         napi->poll_owner = -1;
3805 #endif
3806         set_bit(NAPI_STATE_SCHED, &napi->state);
3807 }
3808 EXPORT_SYMBOL(netif_napi_add);
3809
3810 void netif_napi_del(struct napi_struct *napi)
3811 {
3812         struct sk_buff *skb, *next;
3813
3814         list_del_init(&napi->dev_list);
3815         napi_free_frags(napi);
3816
3817         for (skb = napi->gro_list; skb; skb = next) {
3818                 next = skb->next;
3819                 skb->next = NULL;
3820                 kfree_skb(skb);
3821         }
3822
3823         napi->gro_list = NULL;
3824         napi->gro_count = 0;
3825 }
3826 EXPORT_SYMBOL(netif_napi_del);
3827
/*
 * NET_RX softirq handler.
 *
 * Polls the NAPI instances queued on this cpu's softnet_data poll list
 * until the list is empty, the packet budget (netdev_budget) is used up,
 * or the time limit of two jiffies has passed.  In the latter two cases
 * time_squeeze is incremented and the softirq is raised again.
 */
static void net_rx_action(struct softirq_action *h)
{
        struct softnet_data *sd = &__get_cpu_var(softnet_data);
        unsigned long time_limit = jiffies + 2;
        int budget = netdev_budget;
        void *have;

        local_irq_disable();

        while (!list_empty(&sd->poll_list)) {
                struct napi_struct *n;
                int work, weight;

                /* If the softirq window is exhausted then punt.
                 * Allow this to run for 2 jiffies, which will allow
                 * an average latency of 1.5/HZ.
                 */
                if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
                        goto softnet_break;

                local_irq_enable();

                /* Even though interrupts have been re-enabled, this
                 * access is safe because interrupts can only add new
                 * entries to the tail of this list, and only ->poll()
                 * calls can remove this head entry from the list.
                 */
                n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);

                have = netpoll_poll_lock(n);

                weight = n->weight;

                /* This NAPI_STATE_SCHED test is for avoiding a race
                 * with netpoll's poll_napi().  Only the entity which
                 * obtains the lock and sees NAPI_STATE_SCHED set will
                 * actually make the ->poll() call.  Therefore we avoid
                 * accidentally calling ->poll() when NAPI is not scheduled.
                 */
                work = 0;
                if (test_bit(NAPI_STATE_SCHED, &n->state)) {
                        work = n->poll(n, weight);
                        trace_napi_poll(n);
                }

                /* A poll callback must never report more work than its weight. */
                WARN_ON_ONCE(work > weight);

                budget -= work;

                local_irq_disable();

                /* Drivers must not modify the NAPI state if they
                 * consume the entire weight.  In such cases this code
                 * still "owns" the NAPI instance and therefore can
                 * move the instance around on the list at-will.
                 */
                if (unlikely(work == weight)) {
                        if (unlikely(napi_disable_pending(n))) {
                                local_irq_enable();
                                napi_complete(n);
                                local_irq_disable();
                        } else
                                list_move_tail(&n->poll_list, &sd->poll_list);
                }

                netpoll_poll_unlock(have);
        }
out:
        /* Sends pending RPS IPIs (if any) and re-enables local irqs. */
        net_rps_action_and_irq_enable(sd);

#ifdef CONFIG_NET_DMA
        /*
         * There may not be any more sk_buffs coming right now, so push
         * any pending DMA copies to hardware
         */
        dma_issue_pending_all();
#endif

        return;

softnet_break:
        /* Ran out of budget or time: account for it and reschedule. */
        sd->time_squeeze++;
        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
        goto out;
}
3913
3914 static gifconf_func_t *gifconf_list[NPROTO];
3915
3916 /**
3917  *      register_gifconf        -       register a SIOCGIF handler
3918  *      @family: Address family
3919  *      @gifconf: Function handler
3920  *
3921  *      Register protocol dependent address dumping routines. The handler
3922  *      that is passed must not be freed or reused until it has been replaced
3923  *      by another handler.
3924  */
3925 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3926 {
3927         if (family >= NPROTO)
3928                 return -EINVAL;
3929         gifconf_list[family] = gifconf;
3930         return 0;
3931 }
3932 EXPORT_SYMBOL(register_gifconf);
3933
3934
3935 /*
3936  *      Map an interface index to its name (SIOCGIFNAME)
3937  */
3938
3939 /*
3940  *      We need this ioctl for efficient implementation of the
3941  *      if_indextoname() function required by the IPv6 API.  Without
3942  *      it, we would have to search all the interfaces to find a
3943  *      match.  --pb
3944  */
3945
3946 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3947 {
3948         struct net_device *dev;
3949         struct ifreq ifr;
3950
3951         /*
3952          *      Fetch the caller's info block.
3953          */
3954
3955         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3956                 return -EFAULT;
3957
3958         rcu_read_lock();
3959         dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3960         if (!dev) {
3961                 rcu_read_unlock();
3962                 return -ENODEV;
3963         }
3964
3965         strcpy(ifr.ifr_name, dev->name);
3966         rcu_read_unlock();
3967
3968         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3969                 return -EFAULT;
3970         return 0;
3971 }
3972
3973 /*
3974  *      Perform a SIOCGIFCONF call. This structure will change
3975  *      size eventually, and there is nothing I can do about it.
3976  *      Thus we will need a 'compatibility mode'.
3977  */
3978
3979 static int dev_ifconf(struct net *net, char __user *arg)
3980 {
3981         struct ifconf ifc;
3982         struct net_device *dev;
3983         char __user *pos;
3984         int len;
3985         int total;
3986         int i;
3987
3988         /*
3989          *      Fetch the caller's info block.
3990          */
3991
3992         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3993                 return -EFAULT;
3994
3995         pos = ifc.ifc_buf;
3996         len = ifc.ifc_len;
3997
3998         /*
3999          *      Loop over the interfaces, and write an info block for each.
4000          */
4001
4002         total = 0;
4003         for_each_netdev(net, dev) {
4004                 for (i = 0; i < NPROTO; i++) {
4005                         if (gifconf_list[i]) {
4006                                 int done;
4007                                 if (!pos)
4008                                         done = gifconf_list[i](dev, NULL, 0);
4009                                 else
4010                                         done = gifconf_list[i](dev, pos + total,
4011                                                                len - total);
4012                                 if (done < 0)
4013                                         return -EFAULT;
4014                                 total += done;
4015                         }
4016                 }
4017         }
4018
4019         /*
4020          *      All done.  Write the updated control block back to the caller.
4021          */
4022         ifc.ifc_len = total;
4023
4024         /*
4025          *      Both BSD and Solaris return 0 here, so we do too.
4026          */
4027         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4028 }
4029
4030 #ifdef CONFIG_PROC_FS
4031
4032 #define BUCKET_SPACE (32 - NETDEV_HASHBITS)
4033
4034 struct dev_iter_state {
4035         struct seq_net_private p;
4036         unsigned int pos; /* bucket << BUCKET_SPACE + offset */
4037 };
4038
4039 #define get_bucket(x) ((x) >> BUCKET_SPACE)
4040 #define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
4041 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
4042
4043 static inline struct net_device *dev_from_same_bucket(struct seq_file *seq)
4044 {
4045         struct dev_iter_state *state = seq->private;
4046         struct net *net = seq_file_net(seq);
4047         struct net_device *dev;
4048         struct hlist_node *p;
4049         struct hlist_head *h;
4050         unsigned int count, bucket, offset;
4051
4052         bucket = get_bucket(state->pos);
4053         offset = get_offset(state->pos);
4054         h = &net->dev_name_head[bucket];
4055         count = 0;
4056         hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4057                 if (count++ == offset) {
4058                         state->pos = set_bucket_offset(bucket, count);
4059                         return dev;
4060                 }
4061         }
4062
4063         return NULL;
4064 }
4065
4066 static inline struct net_device *dev_from_new_bucket(struct seq_file *seq)
4067 {
4068         struct dev_iter_state *state = seq->private;
4069         struct net_device *dev;
4070         unsigned int bucket;
4071
4072         bucket = get_bucket(state->pos);
4073         do {
4074                 dev = dev_from_same_bucket(seq);
4075                 if (dev)
4076                         return dev;
4077
4078                 bucket++;
4079                 state->pos = set_bucket_offset(bucket, 0);
4080         } while (bucket < NETDEV_HASHENTRIES);
4081
4082         return NULL;
4083 }
4084
4085 /*
4086  *      This is invoked by the /proc filesystem handler to display a device
4087  *      in detail.
4088  */
4089 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4090         __acquires(RCU)
4091 {
4092         struct dev_iter_state *state = seq->private;
4093
4094         rcu_read_lock();
4095         if (!*pos)
4096                 return SEQ_START_TOKEN;
4097
4098         /* check for end of the hash */
4099         if (state->pos == 0 && *pos > 1)
4100                 return NULL;
4101
4102         return dev_from_new_bucket(seq);
4103 }
4104
4105 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4106 {
4107         struct net_device *dev;
4108
4109         ++*pos;
4110
4111         if (v == SEQ_START_TOKEN)
4112                 return dev_from_new_bucket(seq);
4113
4114         dev = dev_from_same_bucket(seq);
4115         if (dev)
4116                 return dev;
4117
4118         return dev_from_new_bucket(seq);
4119 }
4120
4121 void dev_seq_stop(struct seq_file *seq, void *v)
4122         __releases(RCU)
4123 {
4124         rcu_read_unlock();
4125 }
4126
4127 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4128 {
4129         struct rtnl_link_stats64 temp;
4130         const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4131
4132         seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4133                    "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4134                    dev->name, stats->rx_bytes, stats->rx_packets,
4135                    stats->rx_errors,
4136                    stats->rx_dropped + stats->rx_missed_errors,
4137                    stats->rx_fifo_errors,
4138                    stats->rx_length_errors + stats->rx_over_errors +
4139                     stats->rx_crc_errors + stats->rx_frame_errors,
4140                    stats->rx_compressed, stats->multicast,
4141                    stats->tx_bytes, stats->tx_packets,
4142                    stats->tx_errors, stats->tx_dropped,
4143                    stats->tx_fifo_errors, stats->collisions,
4144                    stats->tx_carrier_errors +
4145                     stats->tx_aborted_errors +
4146                     stats->tx_window_errors +
4147                     stats->tx_heartbeat_errors,
4148                    stats->tx_compressed);
4149 }
4150
4151 /*
4152  *      Called from the PROCfs module. This now uses the new arbitrary sized
4153  *      /proc/net interface to create /proc/net/dev
4154  */
4155 static int dev_seq_show(struct seq_file *seq, void *v)
4156 {
4157         if (v == SEQ_START_TOKEN)
4158                 seq_puts(seq, "Inter-|   Receive                            "
4159                               "                    |  Transmit\n"
4160                               " face |bytes    packets errs drop fifo frame "
4161                               "compressed multicast|bytes    packets errs "
4162                               "drop fifo colls carrier compressed\n");
4163         else
4164                 dev_seq_printf_stats(seq, v);
4165         return 0;
4166 }
4167
4168 static struct softnet_data *softnet_get_online(loff_t *pos)
4169 {
4170         struct softnet_data *sd = NULL;
4171
4172         while (*pos < nr_cpu_ids)
4173                 if (cpu_online(*pos)) {
4174                         sd = &per_cpu(softnet_data, *pos);
4175                         break;
4176                 } else
4177                         ++*pos;
4178         return sd;
4179 }
4180
4181 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4182 {
4183         return softnet_get_online(pos);
4184 }
4185
4186 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4187 {
4188         ++*pos;
4189         return softnet_get_online(pos);
4190 }
4191
/* seq_file ->stop: nothing to release for the softnet_stat iterator. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
4195
4196 static int softnet_seq_show(struct seq_file *seq, void *v)
4197 {
4198         struct softnet_data *sd = v;
4199
4200         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4201                    sd->processed, sd->dropped, sd->time_squeeze, 0,
4202                    0, 0, 0, 0, /* was fastroute */
4203                    sd->cpu_collision, sd->received_rps);
4204         return 0;
4205 }
4206
4207 static const struct seq_operations dev_seq_ops = {
4208         .start = dev_seq_start,
4209         .next  = dev_seq_next,
4210         .stop  = dev_seq_stop,
4211         .show  = dev_seq_show,
4212 };
4213
4214 static int dev_seq_open(struct inode *inode, struct file *file)
4215 {
4216         return seq_open_net(inode, file, &dev_seq_ops,
4217                             sizeof(struct dev_iter_state));
4218 }
4219
4220 int dev_seq_open_ops(struct inode *inode, struct file *file,
4221                      const struct seq_operations *ops)
4222 {
4223         return seq_open_net(inode, file, ops, sizeof(struct dev_iter_state));
4224 }
4225
4226 static const struct file_operations dev_seq_fops = {
4227         .owner   = THIS_MODULE,
4228         .open    = dev_seq_open,
4229         .read    = seq_read,
4230         .llseek  = seq_lseek,
4231         .release = seq_release_net,
4232 };
4233
4234 static const struct seq_operations softnet_seq_ops = {
4235         .start = softnet_seq_start,
4236         .next  = softnet_seq_next,
4237         .stop  = softnet_seq_stop,
4238         .show  = softnet_seq_show,
4239 };
4240
4241 static int softnet_seq_open(struct inode *inode, struct file *file)
4242 {
4243         return seq_open(file, &softnet_seq_ops);
4244 }
4245
4246 static const struct file_operations softnet_seq_fops = {
4247         .owner   = THIS_MODULE,
4248         .open    = softnet_seq_open,
4249         .read    = seq_read,
4250         .llseek  = seq_lseek,
4251         .release = seq_release,
4252 };
4253
4254 static void *ptype_get_idx(loff_t pos)
4255 {
4256         struct packet_type *pt = NULL;
4257         loff_t i = 0;
4258         int t;
4259
4260         list_for_each_entry_rcu(pt, &ptype_all, list) {
4261                 if (i == pos)
4262                         return pt;
4263                 ++i;
4264         }
4265
4266         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4267                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4268                         if (i == pos)
4269                                 return pt;
4270                         ++i;
4271                 }
4272         }
4273         return NULL;
4274 }
4275
4276 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4277         __acquires(RCU)
4278 {
4279         rcu_read_lock();
4280         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4281 }
4282
4283 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4284 {
4285         struct packet_type *pt;
4286         struct list_head *nxt;
4287         int hash;
4288
4289         ++*pos;
4290         if (v == SEQ_START_TOKEN)
4291                 return ptype_get_idx(0);
4292
4293         pt = v;
4294         nxt = pt->list.next;
4295         if (pt->type == htons(ETH_P_ALL)) {
4296                 if (nxt != &ptype_all)
4297                         goto found;
4298                 hash = 0;
4299                 nxt = ptype_base[0].next;
4300         } else
4301                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4302
4303         while (nxt == &ptype_base[hash]) {
4304                 if (++hash >= PTYPE_HASH_SIZE)
4305                         return NULL;
4306                 nxt = ptype_base[hash].next;
4307         }
4308 found:
4309         return list_entry(nxt, struct packet_type, list);
4310 }
4311
4312 static void ptype_seq_stop(struct seq_file *seq, void *v)
4313         __releases(RCU)
4314 {
4315         rcu_read_unlock();
4316 }
4317
4318 static int ptype_seq_show(struct seq_file *seq, void *v)
4319 {
4320         struct packet_type *pt = v;
4321
4322         if (v == SEQ_START_TOKEN)
4323                 seq_puts(seq, "Type Device      Function\n");
4324         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4325                 if (pt->type == htons(ETH_P_ALL))
4326                         seq_puts(seq, "ALL ");
4327                 else
4328                         seq_printf(seq, "%04x", ntohs(pt->type));
4329
4330                 seq_printf(seq, " %-8s %pF\n",
4331                            pt->dev ? pt->dev->name : "", pt->func);
4332         }
4333
4334         return 0;
4335 }
4336
4337 static const struct seq_operations ptype_seq_ops = {
4338         .start = ptype_seq_start,
4339         .next  = ptype_seq_next,
4340         .stop  = ptype_seq_stop,
4341         .show  = ptype_seq_show,
4342 };
4343
4344 static int ptype_seq_open(struct inode *inode, struct file *file)
4345 {
4346         return seq_open_net(inode, file, &ptype_seq_ops,
4347                         sizeof(struct seq_net_private));
4348 }
4349
4350 static const struct file_operations ptype_seq_fops = {
4351         .owner   = THIS_MODULE,
4352         .open    = ptype_seq_open,
4353         .read    = seq_read,
4354         .llseek  = seq_lseek,
4355         .release = seq_release_net,
4356 };
4357
4358
4359 static int __net_init dev_proc_net_init(struct net *net)
4360 {
4361         int rc = -ENOMEM;
4362
4363         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4364                 goto out;
4365         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4366                 goto out_dev;
4367         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4368                 goto out_softnet;
4369
4370         if (wext_proc_init(net))
4371                 goto out_ptype;
4372         rc = 0;
4373 out:
4374         return rc;
4375 out_ptype:
4376         proc_net_remove(net, "ptype");
4377 out_softnet:
4378         proc_net_remove(net, "softnet_stat");
4379 out_dev:
4380         proc_net_remove(net, "dev");
4381         goto out;
4382 }
4383
4384 static void __net_exit dev_proc_net_exit(struct net *net)
4385 {
4386         wext_proc_exit(net);
4387
4388         proc_net_remove(net, "ptype");
4389         proc_net_remove(net, "softnet_stat");
4390         proc_net_remove(net, "dev");
4391 }
4392
4393 static struct pernet_operations __net_initdata dev_proc_ops = {
4394         .init = dev_proc_net_init,
4395         .exit = dev_proc_net_exit,
4396 };
4397
4398 static int __init dev_proc_init(void)
4399 {
4400         return register_pernet_subsys(&dev_proc_ops);
4401 }
4402 #else
4403 #define dev_proc_init() 0
4404 #endif  /* CONFIG_PROC_FS */
4405
4406
4407 /**
4408  *      netdev_set_master       -       set up master pointer
4409  *      @slave: slave device
4410  *      @master: new master device
4411  *
4412  *      Changes the master device of the slave. Pass %NULL to break the
4413  *      bonding. The caller must hold the RTNL semaphore. On a failure
4414  *      a negative errno code is returned. On success the reference counts
4415  *      are adjusted and the function returns zero.
4416  */
4417 int netdev_set_master(struct net_device *slave, struct net_device *master)
4418 {
4419         struct net_device *old = slave->master;
4420
4421         ASSERT_RTNL();
4422
4423         if (master) {
4424                 if (old)
4425                         return -EBUSY;
4426                 dev_hold(master);
4427         }
4428
4429         slave->master = master;
4430
4431         if (old)
4432                 dev_put(old);
4433         return 0;
4434 }
4435 EXPORT_SYMBOL(netdev_set_master);
4436
4437 /**
4438  *      netdev_set_bond_master  -       set up bonding master/slave pair
4439  *      @slave: slave device
4440  *      @master: new master device
4441  *
4442  *      Changes the master device of the slave. Pass %NULL to break the
4443  *      bonding. The caller must hold the RTNL semaphore. On a failure
4444  *      a negative errno code is returned. On success %RTM_NEWLINK is sent
4445  *      to the routing socket and the function returns zero.
4446  */
4447 int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4448 {
4449         int err;
4450
4451         ASSERT_RTNL();
4452
4453         err = netdev_set_master(slave, master);
4454         if (err)
4455                 return err;
4456         if (master)
4457                 slave->flags |= IFF_SLAVE;
4458         else
4459                 slave->flags &= ~IFF_SLAVE;
4460
4461         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4462         return 0;
4463 }
4464 EXPORT_SYMBOL(netdev_set_bond_master);
4465
4466 static void dev_change_rx_flags(struct net_device *dev, int flags)
4467 {
4468         const struct net_device_ops *ops = dev->netdev_ops;
4469
4470         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4471                 ops->ndo_change_rx_flags(dev, flags);
4472 }
4473
4474 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4475 {
4476         unsigned int old_flags = dev->flags;
4477         uid_t uid;
4478         gid_t gid;
4479
4480         ASSERT_RTNL();
4481
4482         dev->flags |= IFF_PROMISC;
4483         dev->promiscuity += inc;
4484         if (dev->promiscuity == 0) {
4485                 /*
4486                  * Avoid overflow.
4487                  * If inc causes overflow, untouch promisc and return error.
4488                  */
4489                 if (inc < 0)
4490                         dev->flags &= ~IFF_PROMISC;
4491                 else {
4492                         dev->promiscuity -= inc;
4493                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4494                                 dev->name);
4495                         return -EOVERFLOW;
4496                 }
4497         }
4498         if (dev->flags != old_flags) {
4499                 pr_info("device %s %s promiscuous mode\n",
4500                         dev->name,
4501                         dev->flags & IFF_PROMISC ? "entered" : "left");
4502                 if (audit_enabled) {
4503                         current_uid_gid(&uid, &gid);
4504                         audit_log(current->audit_context, GFP_ATOMIC,
4505                                 AUDIT_ANOM_PROMISCUOUS,
4506                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4507                                 dev->name, (dev->flags & IFF_PROMISC),
4508                                 (old_flags & IFF_PROMISC),
4509                                 audit_get_loginuid(current),
4510                                 uid, gid,
4511                                 audit_get_sessionid(current));
4512                 }
4513
4514                 dev_change_rx_flags(dev, IFF_PROMISC);
4515         }
4516         return 0;
4517 }
4518
4519 /**
4520  *      dev_set_promiscuity     - update promiscuity count on a device
4521  *      @dev: device
4522  *      @inc: modifier
4523  *
4524  *      Add or remove promiscuity from a device. While the count in the device
4525  *      remains above zero the interface remains promiscuous. Once it hits zero
4526  *      the device reverts back to normal filtering operation. A negative inc
4527  *      value is used to drop promiscuity on the device.
4528  *      Return 0 if successful or a negative errno code on error.
4529  */
4530 int dev_set_promiscuity(struct net_device *dev, int inc)
4531 {
4532         unsigned int old_flags = dev->flags;
4533         int err;
4534
4535         err = __dev_set_promiscuity(dev, inc);
4536         if (err < 0)
4537                 return err;
4538         if (dev->flags != old_flags)
4539                 dev_set_rx_mode(dev);
4540         return err;
4541 }
4542 EXPORT_SYMBOL(dev_set_promiscuity);
4543
4544 /**
4545  *      dev_set_allmulti        - update allmulti count on a device
4546  *      @dev: device
4547  *      @inc: modifier
4548  *
4549  *      Add or remove reception of all multicast frames to a device. While the
4550  *      count in the device remains above zero the interface remains listening
4551  *      to all interfaces. Once it hits zero the device reverts back to normal
4552  *      filtering operation. A negative @inc value is used to drop the counter
4553  *      when releasing a resource needing all multicasts.
4554  *      Return 0 if successful or a negative errno code on error.
4555  */
4556
4557 int dev_set_allmulti(struct net_device *dev, int inc)
4558 {
4559         unsigned int old_flags = dev->flags;
4560
4561         ASSERT_RTNL();
4562
4563         dev->flags |= IFF_ALLMULTI;
4564         dev->allmulti += inc;
4565         if (dev->allmulti == 0) {
4566                 /*
4567                  * Avoid overflow.
4568                  * If inc causes overflow, untouch allmulti and return error.
4569                  */
4570                 if (inc < 0)
4571                         dev->flags &= ~IFF_ALLMULTI;
4572                 else {
4573                         dev->allmulti -= inc;
4574                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4575                                 dev->name);
4576                         return -EOVERFLOW;
4577                 }
4578         }
4579         if (dev->flags ^ old_flags) {
4580                 dev_change_rx_flags(dev, IFF_ALLMULTI);
4581                 dev_set_rx_mode(dev);
4582         }
4583         return 0;
4584 }
4585 EXPORT_SYMBOL(dev_set_allmulti);
4586
4587 /*
4588  *      Upload unicast and multicast address lists to device and
4589  *      configure RX filtering. When the device doesn't support unicast
4590  *      filtering it is put in promiscuous mode while unicast addresses
4591  *      are present.
4592  */
4593 void __dev_set_rx_mode(struct net_device *dev)
4594 {
4595         const struct net_device_ops *ops = dev->netdev_ops;
4596
4597         /* dev_open will call this function so the list will stay sane. */
4598         if (!(dev->flags&IFF_UP))
4599                 return;
4600
4601         if (!netif_device_present(dev))
4602                 return;
4603
4604         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4605                 /* Unicast addresses changes may only happen under the rtnl,
4606                  * therefore calling __dev_set_promiscuity here is safe.
4607                  */
4608                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4609                         __dev_set_promiscuity(dev, 1);
4610                         dev->uc_promisc = true;
4611                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4612                         __dev_set_promiscuity(dev, -1);
4613                         dev->uc_promisc = false;
4614                 }
4615         }
4616
4617         if (ops->ndo_set_rx_mode)
4618                 ops->ndo_set_rx_mode(dev);
4619 }
4620
/*
 *	Locked wrapper around __dev_set_rx_mode(): the device address
 *	lock is held (with bottom halves disabled) for the duration of
 *	the call.
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
4627
4628 /**
4629  *      dev_get_flags - get flags reported to userspace
4630  *      @dev: device
4631  *
4632  *      Get the combination of flag bits exported through APIs to userspace.
4633  */
4634 unsigned dev_get_flags(const struct net_device *dev)
4635 {
4636         unsigned flags;
4637
4638         flags = (dev->flags & ~(IFF_PROMISC |
4639                                 IFF_ALLMULTI |
4640                                 IFF_RUNNING |
4641                                 IFF_LOWER_UP |
4642                                 IFF_DORMANT)) |
4643                 (dev->gflags & (IFF_PROMISC |
4644                                 IFF_ALLMULTI));
4645
4646         if (netif_running(dev)) {
4647                 if (netif_oper_up(dev))
4648                         flags |= IFF_RUNNING;
4649                 if (netif_carrier_ok(dev))
4650                         flags |= IFF_LOWER_UP;
4651                 if (netif_dormant(dev))
4652                         flags |= IFF_DORMANT;
4653         }
4654
4655         return flags;
4656 }
4657 EXPORT_SYMBOL(dev_get_flags);
4658
4659 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4660 {
4661         unsigned int old_flags = dev->flags;
4662         int ret;
4663
4664         ASSERT_RTNL();
4665
4666         /*
4667          *      Set the flags on our device.
4668          */
4669
4670         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4671                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4672                                IFF_AUTOMEDIA)) |
4673                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4674                                     IFF_ALLMULTI));
4675
4676         /*
4677          *      Load in the correct multicast list now the flags have changed.
4678          */
4679
4680         if ((old_flags ^ flags) & IFF_MULTICAST)
4681                 dev_change_rx_flags(dev, IFF_MULTICAST);
4682
4683         dev_set_rx_mode(dev);
4684
4685         /*
4686          *      Have we downed the interface. We handle IFF_UP ourselves
4687          *      according to user attempts to set it, rather than blindly
4688          *      setting it.
4689          */
4690
4691         ret = 0;
4692         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4693                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4694
4695                 if (!ret)
4696                         dev_set_rx_mode(dev);
4697         }
4698
4699         if ((flags ^ dev->gflags) & IFF_PROMISC) {
4700                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4701
4702                 dev->gflags ^= IFF_PROMISC;
4703                 dev_set_promiscuity(dev, inc);
4704         }
4705
4706         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4707            is important. Some (broken) drivers set IFF_PROMISC, when
4708            IFF_ALLMULTI is requested not asking us and not reporting.
4709          */
4710         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4711                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4712
4713                 dev->gflags ^= IFF_ALLMULTI;
4714                 dev_set_allmulti(dev, inc);
4715         }
4716
4717         return ret;
4718 }
4719
4720 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4721 {
4722         unsigned int changes = dev->flags ^ old_flags;
4723
4724         if (changes & IFF_UP) {
4725                 if (dev->flags & IFF_UP)
4726                         call_netdevice_notifiers(NETDEV_UP, dev);
4727                 else
4728                         call_netdevice_notifiers(NETDEV_DOWN, dev);
4729         }
4730
4731         if (dev->flags & IFF_UP &&
4732             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4733                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4734 }
4735
4736 /**
4737  *      dev_change_flags - change device settings
4738  *      @dev: device
4739  *      @flags: device state flags
4740  *
4741  *      Change settings on device based state flags. The flags are
4742  *      in the userspace exported format.
4743  */
4744 int dev_change_flags(struct net_device *dev, unsigned int flags)
4745 {
4746         int ret;
4747         unsigned int changes, old_flags = dev->flags;
4748
4749         ret = __dev_change_flags(dev, flags);
4750         if (ret < 0)
4751                 return ret;
4752
4753         changes = old_flags ^ dev->flags;
4754         if (changes)
4755                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4756
4757         __dev_notify_flags(dev, old_flags);
4758         return ret;
4759 }
4760 EXPORT_SYMBOL(dev_change_flags);
4761
4762 /**
4763  *      dev_set_mtu - Change maximum transfer unit
4764  *      @dev: device
4765  *      @new_mtu: new transfer unit
4766  *
4767  *      Change the maximum transfer size of the network device.
4768  */
4769 int dev_set_mtu(struct net_device *dev, int new_mtu)
4770 {
4771         const struct net_device_ops *ops = dev->netdev_ops;
4772         int err;
4773
4774         if (new_mtu == dev->mtu)
4775                 return 0;
4776
4777         /*      MTU must be positive.    */
4778         if (new_mtu < 0)
4779                 return -EINVAL;
4780
4781         if (!netif_device_present(dev))
4782                 return -ENODEV;
4783
4784         err = 0;
4785         if (ops->ndo_change_mtu)
4786                 err = ops->ndo_change_mtu(dev, new_mtu);
4787         else
4788                 dev->mtu = new_mtu;
4789
4790         if (!err && dev->flags & IFF_UP)
4791                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4792         return err;
4793 }
4794 EXPORT_SYMBOL(dev_set_mtu);
4795
4796 /**
4797  *      dev_set_group - Change group this device belongs to
4798  *      @dev: device
4799  *      @new_group: group this device should belong to
4800  */
4801 void dev_set_group(struct net_device *dev, int new_group)
4802 {
4803         dev->group = new_group;
4804 }
4805 EXPORT_SYMBOL(dev_set_group);
4806
4807 /**
4808  *      dev_set_mac_address - Change Media Access Control Address
4809  *      @dev: device
4810  *      @sa: new address
4811  *
4812  *      Change the hardware (MAC) address of the device
4813  */
4814 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4815 {
4816         const struct net_device_ops *ops = dev->netdev_ops;
4817         int err;
4818
4819         if (!ops->ndo_set_mac_address)
4820                 return -EOPNOTSUPP;
4821         if (sa->sa_family != dev->type)
4822                 return -EINVAL;
4823         if (!netif_device_present(dev))
4824                 return -ENODEV;
4825         err = ops->ndo_set_mac_address(dev, sa);
4826         if (!err)
4827                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4828         return err;
4829 }
4830 EXPORT_SYMBOL(dev_set_mac_address);
4831
4832 /*
4833  *      Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4834  */
4835 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4836 {
4837         int err;
4838         struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4839
4840         if (!dev)
4841                 return -ENODEV;
4842
4843         switch (cmd) {
4844         case SIOCGIFFLAGS:      /* Get interface flags */
4845                 ifr->ifr_flags = (short) dev_get_flags(dev);
4846                 return 0;
4847
4848         case SIOCGIFMETRIC:     /* Get the metric on the interface
4849                                    (currently unused) */
4850                 ifr->ifr_metric = 0;
4851                 return 0;
4852
4853         case SIOCGIFMTU:        /* Get the MTU of a device */
4854                 ifr->ifr_mtu = dev->mtu;
4855                 return 0;
4856
4857         case SIOCGIFHWADDR:
4858                 if (!dev->addr_len)
4859                         memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4860                 else
4861                         memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4862                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4863                 ifr->ifr_hwaddr.sa_family = dev->type;
4864                 return 0;
4865
4866         case SIOCGIFSLAVE:
4867                 err = -EINVAL;
4868                 break;
4869
4870         case SIOCGIFMAP:
4871                 ifr->ifr_map.mem_start = dev->mem_start;
4872                 ifr->ifr_map.mem_end   = dev->mem_end;
4873                 ifr->ifr_map.base_addr = dev->base_addr;
4874                 ifr->ifr_map.irq       = dev->irq;
4875                 ifr->ifr_map.dma       = dev->dma;
4876                 ifr->ifr_map.port      = dev->if_port;
4877                 return 0;
4878
4879         case SIOCGIFINDEX:
4880                 ifr->ifr_ifindex = dev->ifindex;
4881                 return 0;
4882
4883         case SIOCGIFTXQLEN:
4884                 ifr->ifr_qlen = dev->tx_queue_len;
4885                 return 0;
4886
4887         default:
4888                 /* dev_ioctl() should ensure this case
4889                  * is never reached
4890                  */
4891                 WARN_ON(1);
4892                 err = -ENOTTY;
4893                 break;
4894
4895         }
4896         return err;
4897 }
4898
4899 /*
4900  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
4901  */
4902 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4903 {
4904         int err;
4905         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4906         const struct net_device_ops *ops;
4907
4908         if (!dev)
4909                 return -ENODEV;
4910
4911         ops = dev->netdev_ops;
4912
4913         switch (cmd) {
4914         case SIOCSIFFLAGS:      /* Set interface flags */
4915                 return dev_change_flags(dev, ifr->ifr_flags);
4916
4917         case SIOCSIFMETRIC:     /* Set the metric on the interface
4918                                    (currently unused) */
4919                 return -EOPNOTSUPP;
4920
4921         case SIOCSIFMTU:        /* Set the MTU of a device */
4922                 return dev_set_mtu(dev, ifr->ifr_mtu);
4923
4924         case SIOCSIFHWADDR:
4925                 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4926
4927         case SIOCSIFHWBROADCAST:
4928                 if (ifr->ifr_hwaddr.sa_family != dev->type)
4929                         return -EINVAL;
4930                 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4931                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4932                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4933                 return 0;
4934
4935         case SIOCSIFMAP:
4936                 if (ops->ndo_set_config) {
4937                         if (!netif_device_present(dev))
4938                                 return -ENODEV;
4939                         return ops->ndo_set_config(dev, &ifr->ifr_map);
4940                 }
4941                 return -EOPNOTSUPP;
4942
4943         case SIOCADDMULTI:
4944                 if (!ops->ndo_set_rx_mode ||
4945                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4946                         return -EINVAL;
4947                 if (!netif_device_present(dev))
4948                         return -ENODEV;
4949                 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4950
4951         case SIOCDELMULTI:
4952                 if (!ops->ndo_set_rx_mode ||
4953                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4954                         return -EINVAL;
4955                 if (!netif_device_present(dev))
4956                         return -ENODEV;
4957                 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4958
4959         case SIOCSIFTXQLEN:
4960                 if (ifr->ifr_qlen < 0)
4961                         return -EINVAL;
4962                 dev->tx_queue_len = ifr->ifr_qlen;
4963                 return 0;
4964
4965         case SIOCSIFNAME:
4966                 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4967                 return dev_change_name(dev, ifr->ifr_newname);
4968
4969         case SIOCSHWTSTAMP:
4970                 err = net_hwtstamp_validate(ifr);
4971                 if (err)
4972                         return err;
4973                 /* fall through */
4974
4975         /*
4976          *      Unknown or private ioctl
4977          */
4978         default:
4979                 if ((cmd >= SIOCDEVPRIVATE &&
4980                     cmd <= SIOCDEVPRIVATE + 15) ||
4981                     cmd == SIOCBONDENSLAVE ||
4982                     cmd == SIOCBONDRELEASE ||
4983                     cmd == SIOCBONDSETHWADDR ||
4984                     cmd == SIOCBONDSLAVEINFOQUERY ||
4985                     cmd == SIOCBONDINFOQUERY ||
4986                     cmd == SIOCBONDCHANGEACTIVE ||
4987                     cmd == SIOCGMIIPHY ||
4988                     cmd == SIOCGMIIREG ||
4989                     cmd == SIOCSMIIREG ||
4990                     cmd == SIOCBRADDIF ||
4991                     cmd == SIOCBRDELIF ||
4992                     cmd == SIOCSHWTSTAMP ||
4993                     cmd == SIOCWANDEV) {
4994                         err = -EOPNOTSUPP;
4995                         if (ops->ndo_do_ioctl) {
4996                                 if (netif_device_present(dev))
4997                                         err = ops->ndo_do_ioctl(dev, ifr, cmd);
4998                                 else
4999                                         err = -ENODEV;
5000                         }
5001                 } else
5002                         err = -EINVAL;
5003
5004         }
5005         return err;
5006 }
5007
5008 /*
5009  *      This function handles all "interface"-type I/O control requests. The actual
5010  *      'doing' part of this is dev_ifsioc above.
5011  */
5012
5013 /**
5014  *      dev_ioctl       -       network device ioctl
5015  *      @net: the applicable net namespace
5016  *      @cmd: command to issue
5017  *      @arg: pointer to a struct ifreq in user space
5018  *
5019  *      Issue ioctl functions to devices. This is normally called by the
5020  *      user space syscall interfaces but can sometimes be useful for
5021  *      other purposes. The return value is the return from the syscall if
5022  *      positive or a negative errno code on error.
5023  */
5024
5025 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
5026 {
5027         struct ifreq ifr;
5028         int ret;
5029         char *colon;
5030
5031         /* One special case: SIOCGIFCONF takes ifconf argument
5032            and requires shared lock, because it sleeps writing
5033            to user space.
5034          */
5035
5036         if (cmd == SIOCGIFCONF) {
5037                 rtnl_lock();
5038                 ret = dev_ifconf(net, (char __user *) arg);
5039                 rtnl_unlock();
5040                 return ret;
5041         }
5042         if (cmd == SIOCGIFNAME)
5043                 return dev_ifname(net, (struct ifreq __user *)arg);
5044
5045         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
5046                 return -EFAULT;
5047
5048         ifr.ifr_name[IFNAMSIZ-1] = 0;
5049
5050         colon = strchr(ifr.ifr_name, ':');
5051         if (colon)
5052                 *colon = 0;
5053
5054         /*
5055          *      See which interface the caller is talking about.
5056          */
5057
5058         switch (cmd) {
5059         /*
5060          *      These ioctl calls:
5061          *      - can be done by all.
5062          *      - atomic and do not require locking.
5063          *      - return a value
5064          */
5065         case SIOCGIFFLAGS:
5066         case SIOCGIFMETRIC:
5067         case SIOCGIFMTU:
5068         case SIOCGIFHWADDR:
5069         case SIOCGIFSLAVE:
5070         case SIOCGIFMAP:
5071         case SIOCGIFINDEX:
5072         case SIOCGIFTXQLEN:
5073                 dev_load(net, ifr.ifr_name);
5074                 rcu_read_lock();
5075                 ret = dev_ifsioc_locked(net, &ifr, cmd);
5076                 rcu_read_unlock();
5077                 if (!ret) {
5078                         if (colon)
5079                                 *colon = ':';
5080                         if (copy_to_user(arg, &ifr,
5081                                          sizeof(struct ifreq)))
5082                                 ret = -EFAULT;
5083                 }
5084                 return ret;
5085
5086         case SIOCETHTOOL:
5087                 dev_load(net, ifr.ifr_name);
5088                 rtnl_lock();
5089                 ret = dev_ethtool(net, &ifr);
5090                 rtnl_unlock();
5091                 if (!ret) {
5092                         if (colon)
5093                                 *colon = ':';
5094                         if (copy_to_user(arg, &ifr,
5095                                          sizeof(struct ifreq)))
5096                                 ret = -EFAULT;
5097                 }
5098                 return ret;
5099
5100         /*
5101          *      These ioctl calls:
5102          *      - require superuser power.
5103          *      - require strict serialization.
5104          *      - return a value
5105          */
5106         case SIOCGMIIPHY:
5107         case SIOCGMIIREG:
5108         case SIOCSIFNAME:
5109                 if (!capable(CAP_NET_ADMIN))
5110                         return -EPERM;
5111                 dev_load(net, ifr.ifr_name);
5112                 rtnl_lock();
5113                 ret = dev_ifsioc(net, &ifr, cmd);
5114                 rtnl_unlock();
5115                 if (!ret) {
5116                         if (colon)
5117                                 *colon = ':';
5118                         if (copy_to_user(arg, &ifr,
5119                                          sizeof(struct ifreq)))
5120                                 ret = -EFAULT;
5121                 }
5122                 return ret;
5123
5124         /*
5125          *      These ioctl calls:
5126          *      - require superuser power.
5127          *      - require strict serialization.
5128          *      - do not return a value
5129          */
5130         case SIOCSIFFLAGS:
5131         case SIOCSIFMETRIC:
5132         case SIOCSIFMTU:
5133         case SIOCSIFMAP:
5134         case SIOCSIFHWADDR:
5135         case SIOCSIFSLAVE:
5136         case SIOCADDMULTI:
5137         case SIOCDELMULTI:
5138         case SIOCSIFHWBROADCAST:
5139         case SIOCSIFTXQLEN:
5140         case SIOCSMIIREG:
5141         case SIOCBONDENSLAVE:
5142         case SIOCBONDRELEASE:
5143         case SIOCBONDSETHWADDR:
5144         case SIOCBONDCHANGEACTIVE:
5145         case SIOCBRADDIF:
5146         case SIOCBRDELIF:
5147         case SIOCSHWTSTAMP:
5148                 if (!capable(CAP_NET_ADMIN))
5149                         return -EPERM;
5150                 /* fall through */
5151         case SIOCBONDSLAVEINFOQUERY:
5152         case SIOCBONDINFOQUERY:
5153                 dev_load(net, ifr.ifr_name);
5154                 rtnl_lock();
5155                 ret = dev_ifsioc(net, &ifr, cmd);
5156                 rtnl_unlock();
5157                 return ret;
5158
5159         case SIOCGIFMEM:
5160                 /* Get the per device memory space. We can add this but
5161                  * currently do not support it */
5162         case SIOCSIFMEM:
5163                 /* Set the per device memory buffer space.
5164                  * Not applicable in our case */
5165         case SIOCSIFLINK:
5166                 return -ENOTTY;
5167
5168         /*
5169          *      Unknown or private ioctl.
5170          */
5171         default:
5172                 if (cmd == SIOCWANDEV ||
5173                     (cmd >= SIOCDEVPRIVATE &&
5174                      cmd <= SIOCDEVPRIVATE + 15)) {
5175                         dev_load(net, ifr.ifr_name);
5176                         rtnl_lock();
5177                         ret = dev_ifsioc(net, &ifr, cmd);
5178                         rtnl_unlock();
5179                         if (!ret && copy_to_user(arg, &ifr,
5180                                                  sizeof(struct ifreq)))
5181                                 ret = -EFAULT;
5182                         return ret;
5183                 }
5184                 /* Take care of Wireless Extensions */
5185                 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5186                         return wext_handle_ioctl(net, &ifr, cmd, arg);
5187                 return -ENOTTY;
5188         }
5189 }
5190
5191
/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Returns a suitable unique value for a new device interface
 *	number.  The caller must hold the rtnl semaphore or the
 *	dev_base_lock to be sure it remains unique.
 *
 *	Candidates come from a single static counter that restarts at 1
 *	whenever the increment leaves it non-positive; values already in
 *	use in @net are skipped.
 */
static int dev_new_index(struct net *net)
{
	static int ifindex;

	do {
		if (++ifindex <= 0)
			ifindex = 1;
	} while (__dev_get_by_index(net, ifindex));

	return ifindex;
}
5210
5211 /* Delayed registration/unregisteration */
5212 static LIST_HEAD(net_todo_list);
5213
5214 static void net_set_todo(struct net_device *dev)
5215 {
5216         list_add_tail(&dev->todo_list, &net_todo_list);
5217 }
5218
5219 static void rollback_registered_many(struct list_head *head)
5220 {
5221         struct net_device *dev, *tmp;
5222
5223         BUG_ON(dev_boot_phase);
5224         ASSERT_RTNL();
5225
5226         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5227                 /* Some devices call without registering
5228                  * for initialization unwind. Remove those
5229                  * devices and proceed with the remaining.
5230                  */
5231                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5232                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5233                                  dev->name, dev);
5234
5235                         WARN_ON(1);
5236                         list_del(&dev->unreg_list);
5237                         continue;
5238                 }
5239                 dev->dismantle = true;
5240                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5241         }
5242
5243         /* If device is running, close it first. */
5244         dev_close_many(head);
5245
5246         list_for_each_entry(dev, head, unreg_list) {
5247                 /* And unlink it from device chain. */
5248                 unlist_netdevice(dev);
5249
5250                 dev->reg_state = NETREG_UNREGISTERING;
5251         }
5252
5253         synchronize_net();
5254
5255         list_for_each_entry(dev, head, unreg_list) {
5256                 /* Shutdown queueing discipline. */
5257                 dev_shutdown(dev);
5258
5259
5260                 /* Notify protocols, that we are about to destroy
5261                    this device. They should clean all the things.
5262                 */
5263                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5264
5265                 if (!dev->rtnl_link_ops ||
5266                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5267                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5268
5269                 /*
5270                  *      Flush the unicast and multicast chains
5271                  */
5272                 dev_uc_flush(dev);
5273                 dev_mc_flush(dev);
5274
5275                 if (dev->netdev_ops->ndo_uninit)
5276                         dev->netdev_ops->ndo_uninit(dev);
5277
5278                 /* Notifier chain MUST detach us from master device. */
5279                 WARN_ON(dev->master);
5280
5281                 /* Remove entries from kobject tree */
5282                 netdev_unregister_kobject(dev);
5283         }
5284
5285         /* Process any work delayed until the end of the batch */
5286         dev = list_first_entry(head, struct net_device, unreg_list);
5287         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5288
5289         synchronize_net();
5290
5291         list_for_each_entry(dev, head, unreg_list)
5292                 dev_put(dev);
5293 }
5294
5295 static void rollback_registered(struct net_device *dev)
5296 {
5297         LIST_HEAD(single);
5298
5299         list_add(&dev->unreg_list, &single);
5300         rollback_registered_many(&single);
5301         list_del(&single);
5302 }
5303
5304 static netdev_features_t netdev_fix_features(struct net_device *dev,
5305         netdev_features_t features)
5306 {
5307         /* Fix illegal checksum combinations */
5308         if ((features & NETIF_F_HW_CSUM) &&
5309             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5310                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5311                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5312         }
5313
5314         /* Fix illegal SG+CSUM combinations. */
5315         if ((features & NETIF_F_SG) &&
5316             !(features & NETIF_F_ALL_CSUM)) {
5317                 netdev_dbg(dev,
5318                         "Dropping NETIF_F_SG since no checksum feature.\n");
5319                 features &= ~NETIF_F_SG;
5320         }
5321
5322         /* TSO requires that SG is present as well. */
5323         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5324                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5325                 features &= ~NETIF_F_ALL_TSO;
5326         }
5327
5328         /* TSO ECN requires that TSO is present as well. */
5329         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5330                 features &= ~NETIF_F_TSO_ECN;
5331
5332         /* Software GSO depends on SG. */
5333         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5334                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5335                 features &= ~NETIF_F_GSO;
5336         }
5337
5338         /* UFO needs SG and checksumming */
5339         if (features & NETIF_F_UFO) {
5340                 /* maybe split UFO into V4 and V6? */
5341                 if (!((features & NETIF_F_GEN_CSUM) ||
5342                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5343                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5344                         netdev_dbg(dev,
5345                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5346                         features &= ~NETIF_F_UFO;
5347                 }
5348
5349                 if (!(features & NETIF_F_SG)) {
5350                         netdev_dbg(dev,
5351                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5352                         features &= ~NETIF_F_UFO;
5353                 }
5354         }
5355
5356         return features;
5357 }
5358
5359 int __netdev_update_features(struct net_device *dev)
5360 {
5361         netdev_features_t features;
5362         int err = 0;
5363
5364         ASSERT_RTNL();
5365
5366         features = netdev_get_wanted_features(dev);
5367
5368         if (dev->netdev_ops->ndo_fix_features)
5369                 features = dev->netdev_ops->ndo_fix_features(dev, features);
5370
5371         /* driver might be less strict about feature dependencies */
5372         features = netdev_fix_features(dev, features);
5373
5374         if (dev->features == features)
5375                 return 0;
5376
5377         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5378                 &dev->features, &features);
5379
5380         if (dev->netdev_ops->ndo_set_features)
5381                 err = dev->netdev_ops->ndo_set_features(dev, features);
5382
5383         if (unlikely(err < 0)) {
5384                 netdev_err(dev,
5385                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
5386                         err, &features, &dev->features);
5387                 return -1;
5388         }
5389
5390         if (!err)
5391                 dev->features = features;
5392
5393         return 1;
5394 }
5395
/**
 *      netdev_update_features - recalculate device features
 *      @dev: the device to check
 *
 *      Recalculates dev->features through __netdev_update_features()
 *      and calls netdev_features_change() whenever that helper returns
 *      a non-zero value. Should be called after driver or hardware
 *      dependent conditions that influence the features might have
 *      changed.
 */
void netdev_update_features(struct net_device *dev)
{
        if (__netdev_update_features(dev) == 0)
                return;

        netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
5410
/**
 *      netdev_change_features - recalculate device features
 *      @dev: the device to check
 *
 *      Recalculates dev->features and sends the change notification
 *      unconditionally, even when nothing changed. Use this instead of
 *      netdev_update_features() when dev->vlan_features might have
 *      changed as well, so that the change can be propagated to
 *      stacked VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
        /* Result ignored on purpose: the notification is always sent. */
        (void)__netdev_update_features(dev);
        netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
5427
5428 /**
5429  *      netif_stacked_transfer_operstate -      transfer operstate
5430  *      @rootdev: the root or lower level device to transfer state from
5431  *      @dev: the device to transfer operstate to
5432  *
5433  *      Transfer operational state from root to device. This is normally
5434  *      called when a stacking relationship exists between the root
5435  *      device and the device(a leaf device).
5436  */
5437 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5438                                         struct net_device *dev)
5439 {
5440         if (rootdev->operstate == IF_OPER_DORMANT)
5441                 netif_dormant_on(dev);
5442         else
5443                 netif_dormant_off(dev);
5444
5445         if (netif_carrier_ok(rootdev)) {
5446                 if (!netif_carrier_ok(dev))
5447                         netif_carrier_on(dev);
5448         } else {
5449                 if (netif_carrier_ok(dev))
5450                         netif_carrier_off(dev);
5451         }
5452 }
5453 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5454
#ifdef CONFIG_RPS
/*
 * Allocate a zeroed array of dev->num_rx_queues RX queue structures,
 * store it in dev->_rx and point every entry back at @dev.
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
        unsigned int nqueues = dev->num_rx_queues;
        struct netdev_rx_queue *queues, *q;

        BUG_ON(nqueues < 1);

        queues = kcalloc(nqueues, sizeof(*queues), GFP_KERNEL);
        if (queues == NULL) {
                pr_err("netdev: Unable to allocate %u rx queues\n", nqueues);
                return -ENOMEM;
        }
        dev->_rx = queues;

        for (q = queues; q < queues + nqueues; q++)
                q->dev = dev;

        return 0;
}
#endif
5475
5476 static void netdev_init_one_queue(struct net_device *dev,
5477                                   struct netdev_queue *queue, void *_unused)
5478 {
5479         /* Initialize queue lock */
5480         spin_lock_init(&queue->_xmit_lock);
5481         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5482         queue->xmit_lock_owner = -1;
5483         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5484         queue->dev = dev;
5485 #ifdef CONFIG_BQL
5486         dql_init(&queue->dql, HZ);
5487 #endif
5488 }
5489
5490 static int netif_alloc_netdev_queues(struct net_device *dev)
5491 {
5492         unsigned int count = dev->num_tx_queues;
5493         struct netdev_queue *tx;
5494
5495         BUG_ON(count < 1);
5496
5497         tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5498         if (!tx) {
5499                 pr_err("netdev: Unable to allocate %u tx queues\n", count);
5500                 return -ENOMEM;
5501         }
5502         dev->_tx = tx;
5503
5504         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5505         spin_lock_init(&dev->tx_global_lock);
5506
5507         return 0;
5508 }
5509
5510 /**
5511  *      register_netdevice      - register a network device
5512  *      @dev: device to register
5513  *
5514  *      Take a completed network device structure and add it to the kernel
5515  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5516  *      chain. 0 is returned on success. A negative errno code is returned
5517  *      on a failure to set up the device, or if the name is a duplicate.
5518  *
5519  *      Callers must hold the rtnl semaphore. You may want
5520  *      register_netdev() instead of this.
5521  *
5522  *      BUGS:
5523  *      The locking appears insufficient to guarantee two parallel registers
5524  *      will not get the same name.
5525  */
5526
5527 int register_netdevice(struct net_device *dev)
5528 {
5529         int ret;
5530         struct net *net = dev_net(dev);
5531
5532         BUG_ON(dev_boot_phase);
5533         ASSERT_RTNL();
5534
5535         might_sleep();
5536
5537         /* When net_device's are persistent, this will be fatal. */
5538         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5539         BUG_ON(!net);
5540
5541         spin_lock_init(&dev->addr_list_lock);
5542         netdev_set_addr_lockdep_class(dev);
5543
5544         dev->iflink = -1;
5545
5546         ret = dev_get_valid_name(dev, dev->name);
5547         if (ret < 0)
5548                 goto out;
5549
5550         /* Init, if this function is available */
5551         if (dev->netdev_ops->ndo_init) {
5552                 ret = dev->netdev_ops->ndo_init(dev);
5553                 if (ret) {
5554                         if (ret > 0)
5555                                 ret = -EIO;
5556                         goto out;
5557                 }
5558         }
5559
5560         dev->ifindex = dev_new_index(net);
5561         if (dev->iflink == -1)
5562                 dev->iflink = dev->ifindex;
5563
5564         /* Transfer changeable features to wanted_features and enable
5565          * software offloads (GSO and GRO).
5566          */
5567         dev->hw_features |= NETIF_F_SOFT_FEATURES;
5568         dev->features |= NETIF_F_SOFT_FEATURES;
5569         dev->wanted_features = dev->features & dev->hw_features;
5570
5571         /* Turn on no cache copy if HW is doing checksum */
5572         if (!(dev->flags & IFF_LOOPBACK)) {
5573                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5574                 if (dev->features & NETIF_F_ALL_CSUM) {
5575                         dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5576                         dev->features |= NETIF_F_NOCACHE_COPY;
5577                 }
5578         }
5579
5580         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5581          */
5582         dev->vlan_features |= NETIF_F_HIGHDMA;
5583
5584         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5585         ret = notifier_to_errno(ret);
5586         if (ret)
5587                 goto err_uninit;
5588
5589         ret = netdev_register_kobject(dev);
5590         if (ret)
5591                 goto err_uninit;
5592         dev->reg_state = NETREG_REGISTERED;
5593
5594         __netdev_update_features(dev);
5595
5596         /*
5597          *      Default initial state at registry is that the
5598          *      device is present.
5599          */
5600
5601         set_bit(__LINK_STATE_PRESENT, &dev->state);
5602
5603         dev_init_scheduler(dev);
5604         dev_hold(dev);
5605         list_netdevice(dev);
5606
5607         /* Notify protocols, that a new device appeared. */
5608         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5609         ret = notifier_to_errno(ret);
5610         if (ret) {
5611                 rollback_registered(dev);
5612                 dev->reg_state = NETREG_UNREGISTERED;
5613         }
5614         /*
5615          *      Prevent userspace races by waiting until the network
5616          *      device is fully setup before sending notifications.
5617          */
5618         if (!dev->rtnl_link_ops ||
5619             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5620                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5621
5622 out:
5623         return ret;
5624
5625 err_uninit:
5626         if (dev->netdev_ops->ndo_uninit)
5627                 dev->netdev_ops->ndo_uninit(dev);
5628         goto out;
5629 }
5630 EXPORT_SYMBOL(register_netdevice);
5631
5632 /**
5633  *      init_dummy_netdev       - init a dummy network device for NAPI
5634  *      @dev: device to init
5635  *
5636  *      This takes a network device structure and initialize the minimum
5637  *      amount of fields so it can be used to schedule NAPI polls without
5638  *      registering a full blown interface. This is to be used by drivers
5639  *      that need to tie several hardware interfaces to a single NAPI
5640  *      poll scheduler due to HW limitations.
5641  */
5642 int init_dummy_netdev(struct net_device *dev)
5643 {
5644         /* Clear everything. Note we don't initialize spinlocks
5645          * are they aren't supposed to be taken by any of the
5646          * NAPI code and this dummy netdev is supposed to be
5647          * only ever used for NAPI polls
5648          */
5649         memset(dev, 0, sizeof(struct net_device));
5650
5651         /* make sure we BUG if trying to hit standard
5652          * register/unregister code path
5653          */
5654         dev->reg_state = NETREG_DUMMY;
5655
5656         /* NAPI wants this */
5657         INIT_LIST_HEAD(&dev->napi_list);
5658
5659         /* a dummy interface is started by default */
5660         set_bit(__LINK_STATE_PRESENT, &dev->state);
5661         set_bit(__LINK_STATE_START, &dev->state);
5662
5663         /* Note : We dont allocate pcpu_refcnt for dummy devices,
5664          * because users of this 'device' dont need to change
5665          * its refcount.
5666          */
5667
5668         return 0;
5669 }
5670 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5671
5672
/**
 *      register_netdev - register a network device
 *      @dev: device to register
 *
 *      Take a completed network device structure and add it to the kernel
 *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *      chain. 0 is returned on success. A negative errno code is returned
 *      on a failure to set up the device, or if the name is a duplicate.
 *
 *      This is register_netdevice() called with the rtnl semaphore taken
 *      around it; the device name is expanded if a format string was
 *      passed to alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
        int ret;

        rtnl_lock();
        ret = register_netdevice(dev);
        rtnl_unlock();

        return ret;
}
EXPORT_SYMBOL(register_netdev);
5696
5697 int netdev_refcnt_read(const struct net_device *dev)
5698 {
5699         int i, refcnt = 0;
5700
5701         for_each_possible_cpu(i)
5702                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5703         return refcnt;
5704 }
5705 EXPORT_SYMBOL(netdev_refcnt_read);
5706
5707 /*
5708  * netdev_wait_allrefs - wait until all references are gone.
5709  *
5710  * This is called when unregistering network devices.
5711  *
5712  * Any protocol or device that holds a reference should register
5713  * for netdevice notification, and cleanup and put back the
5714  * reference if they receive an UNREGISTER event.
5715  * We can get stuck here if buggy protocols don't correctly
5716  * call dev_put.
5717  */
5718 static void netdev_wait_allrefs(struct net_device *dev)
5719 {
5720         unsigned long rebroadcast_time, warning_time;
5721         int refcnt;
5722
5723         linkwatch_forget_dev(dev);
5724
5725         rebroadcast_time = warning_time = jiffies;
5726         refcnt = netdev_refcnt_read(dev);
5727
5728         while (refcnt != 0) {
5729                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5730                         rtnl_lock();
5731
5732                         /* Rebroadcast unregister notification */
5733                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5734                         /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5735                          * should have already handle it the first time */
5736
5737                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5738                                      &dev->state)) {
5739                                 /* We must not have linkwatch events
5740                                  * pending on unregister. If this
5741                                  * happens, we simply run the queue
5742                                  * unscheduled, resulting in a noop
5743                                  * for this device.
5744                                  */
5745                                 linkwatch_run_queue();
5746                         }
5747
5748                         __rtnl_unlock();
5749
5750                         rebroadcast_time = jiffies;
5751                 }
5752
5753                 msleep(250);
5754
5755                 refcnt = netdev_refcnt_read(dev);
5756
5757                 if (time_after(jiffies, warning_time + 10 * HZ)) {
5758                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5759                                  dev->name, refcnt);
5760                         warning_time = jiffies;
5761                 }
5762         }
5763 }
5764
5765 /* The sequence is:
5766  *
5767  *      rtnl_lock();
5768  *      ...
5769  *      register_netdevice(x1);
5770  *      register_netdevice(x2);
5771  *      ...
5772  *      unregister_netdevice(y1);
5773  *      unregister_netdevice(y2);
5774  *      ...
5775  *      rtnl_unlock();
5776  *      free_netdev(y1);
5777  *      free_netdev(y2);
5778  *
5779  * We are invoked by rtnl_unlock().
5780  * This allows us to deal with problems:
5781  * 1) We can delete sysfs objects which invoke hotplug
5782  *    without deadlocking with linkwatch via keventd.
5783  * 2) Since we run with the RTNL semaphore not held, we can sleep
5784  *    safely in order to wait for the netdev refcnt to drop to zero.
5785  *
5786  * We must not return until all unregister events added during
5787  * the interval the lock was held have been completed.
5788  */
5789 void netdev_run_todo(void)
5790 {
5791         struct list_head list;
5792
5793         /* Snapshot list, allow later requests */
5794         list_replace_init(&net_todo_list, &list);
5795
5796         __rtnl_unlock();
5797
5798         /* Wait for rcu callbacks to finish before attempting to drain
5799          * the device list.  This usually avoids a 250ms wait.
5800          */
5801         if (!list_empty(&list))
5802                 rcu_barrier();
5803
5804         while (!list_empty(&list)) {
5805                 struct net_device *dev
5806                         = list_first_entry(&list, struct net_device, todo_list);
5807                 list_del(&dev->todo_list);
5808
5809                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5810                         pr_err("network todo '%s' but state %d\n",
5811                                dev->name, dev->reg_state);
5812                         dump_stack();
5813                         continue;
5814                 }
5815
5816                 dev->reg_state = NETREG_UNREGISTERED;
5817
5818                 on_each_cpu(flush_backlog, dev, 1);
5819
5820                 netdev_wait_allrefs(dev);
5821
5822                 /* paranoia */
5823                 BUG_ON(netdev_refcnt_read(dev));
5824                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
5825                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
5826                 WARN_ON(dev->dn_ptr);
5827
5828                 if (dev->destructor)
5829                         dev->destructor(dev);
5830
5831                 /* Free network device */
5832                 kobject_put(&dev->dev.kobj);
5833         }
5834 }
5835
5836 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5837  * fields in the same order, with only the type differing.
5838  */
5839 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5840                              const struct net_device_stats *netdev_stats)
5841 {
5842 #if BITS_PER_LONG == 64
5843         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5844         memcpy(stats64, netdev_stats, sizeof(*stats64));
5845 #else
5846         size_t i, n = sizeof(*stats64) / sizeof(u64);
5847         const unsigned long *src = (const unsigned long *)netdev_stats;
5848         u64 *dst = (u64 *)stats64;
5849
5850         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5851                      sizeof(*stats64) / sizeof(u64));
5852         for (i = 0; i < n; i++)
5853                 dst[i] = src[i];
5854 #endif
5855 }
5856 EXPORT_SYMBOL(netdev_stats_to_stats64);
5857
5858 /**
5859  *      dev_get_stats   - get network device statistics
5860  *      @dev: device to get statistics from
5861  *      @storage: place to store stats
5862  *
5863  *      Get network statistics from device. Return @storage.
5864  *      The device driver may provide its own method by setting
5865  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5866  *      otherwise the internal statistics structure is used.
5867  */
5868 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5869                                         struct rtnl_link_stats64 *storage)
5870 {
5871         const struct net_device_ops *ops = dev->netdev_ops;
5872
5873         if (ops->ndo_get_stats64) {
5874                 memset(storage, 0, sizeof(*storage));
5875                 ops->ndo_get_stats64(dev, storage);
5876         } else if (ops->ndo_get_stats) {
5877                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5878         } else {
5879                 netdev_stats_to_stats64(storage, &dev->stats);
5880         }
5881         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5882         return storage;
5883 }
5884 EXPORT_SYMBOL(dev_get_stats);
5885
/*
 * Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT, a missing
 * queue is allocated, initialised with noop_qdisc as both qdisc and
 * sleeping qdisc, and published in dev->ingress_queue; NULL is returned
 * if that allocation fails.  Without CONFIG_NET_CLS_ACT the result of
 * dev_ingress_queue() is returned as is.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
        struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
        if (!queue) {
                queue = kzalloc(sizeof(*queue), GFP_KERNEL);
                if (queue) {
                        netdev_init_one_queue(dev, queue, NULL);
                        queue->qdisc = &noop_qdisc;
                        queue->qdisc_sleeping = &noop_qdisc;
                        rcu_assign_pointer(dev->ingress_queue, queue);
                }
        }
#endif
        return queue;
}
5903
5904 /**
5905  *      alloc_netdev_mqs - allocate network device
5906  *      @sizeof_priv:   size of private data to allocate space for
5907  *      @name:          device name format string
5908  *      @setup:         callback to initialize device
5909  *      @txqs:          the number of TX subqueues to allocate
5910  *      @rxqs:          the number of RX subqueues to allocate
5911  *
5912  *      Allocates a struct net_device with private data area for driver use
5913  *      and performs basic initialization.  Also allocates subquue structs
5914  *      for each queue on the device.
5915  */
5916 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5917                 void (*setup)(struct net_device *),
5918                 unsigned int txqs, unsigned int rxqs)
5919 {
5920         struct net_device *dev;
5921         size_t alloc_size;
5922         struct net_device *p;
5923
5924         BUG_ON(strlen(name) >= sizeof(dev->name));
5925
5926         if (txqs < 1) {
5927                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
5928                 return NULL;
5929         }
5930
5931 #ifdef CONFIG_RPS
5932         if (rxqs < 1) {
5933                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
5934                 return NULL;
5935         }
5936 #endif
5937
5938         alloc_size = sizeof(struct net_device);
5939         if (sizeof_priv) {
5940                 /* ensure 32-byte alignment of private area */
5941                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5942                 alloc_size += sizeof_priv;
5943         }
5944         /* ensure 32-byte alignment of whole construct */
5945         alloc_size += NETDEV_ALIGN - 1;
5946
5947         p = kzalloc(alloc_size, GFP_KERNEL);
5948         if (!p) {
5949                 pr_err("alloc_netdev: Unable to allocate device\n");
5950                 return NULL;
5951         }
5952
5953         dev = PTR_ALIGN(p, NETDEV_ALIGN);
5954         dev->padded = (char *)dev - (char *)p;
5955
5956         dev->pcpu_refcnt = alloc_percpu(int);
5957         if (!dev->pcpu_refcnt)
5958                 goto free_p;
5959
5960         if (dev_addr_init(dev))
5961                 goto free_pcpu;
5962
5963         dev_mc_init(dev);
5964         dev_uc_init(dev);
5965
5966         dev_net_set(dev, &init_net);
5967
5968         dev->gso_max_size = GSO_MAX_SIZE;
5969
5970         INIT_LIST_HEAD(&dev->napi_list);
5971         INIT_LIST_HEAD(&dev->unreg_list);
5972         INIT_LIST_HEAD(&dev->link_watch_list);
5973         dev->priv_flags = IFF_XMIT_DST_RELEASE;
5974         setup(dev);
5975
5976         dev->num_tx_queues = txqs;
5977         dev->real_num_tx_queues = txqs;
5978         if (netif_alloc_netdev_queues(dev))
5979                 goto free_all;
5980
5981 #ifdef CONFIG_RPS
5982         dev->num_rx_queues = rxqs;
5983         dev->real_num_rx_queues = rxqs;
5984         if (netif_alloc_rx_queues(dev))
5985                 goto free_all;
5986 #endif
5987
5988         strcpy(dev->name, name);
5989         dev->group = INIT_NETDEV_GROUP;
5990         return dev;
5991
5992 free_all:
5993         free_netdev(dev);
5994         return NULL;
5995
5996 free_pcpu:
5997         free_percpu(dev->pcpu_refcnt);
5998         kfree(dev->_tx);
5999 #ifdef CONFIG_RPS
6000         kfree(dev->_rx);
6001 #endif
6002
6003 free_p:
6004         kfree(p);
6005         return NULL;
6006 }
6007 EXPORT_SYMBOL(alloc_netdev_mqs);
6008
6009 /**
6010  *      free_netdev - free network device
6011  *      @dev: device
6012  *
6013  *      This function does the last stage of destroying an allocated device
6014  *      interface. The reference to the device object is released.
6015  *      If this is the last reference then it will be freed.
6016  */
6017 void free_netdev(struct net_device *dev)
6018 {
6019         struct napi_struct *p, *n;
6020
6021         release_net(dev_net(dev));
6022
6023         kfree(dev->_tx);
6024 #ifdef CONFIG_RPS
6025         kfree(dev->_rx);
6026 #endif
6027
6028         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6029
6030         /* Flush device addresses */
6031         dev_addr_flush(dev);
6032
6033         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6034                 netif_napi_del(p);
6035
6036         free_percpu(dev->pcpu_refcnt);
6037         dev->pcpu_refcnt = NULL;
6038
6039         /*  Compatibility with error handling in drivers */
6040         if (dev->reg_state == NETREG_UNINITIALIZED) {
6041                 kfree((char *)dev - dev->padded);
6042                 return;
6043         }
6044
6045         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6046         dev->reg_state = NETREG_RELEASED;
6047
6048         /* will free via device release */
6049         put_device(&dev->dev);
6050 }
6051 EXPORT_SYMBOL(free_netdev);
6052
/**
 *      synchronize_net -  Synchronize with packet receive processing
 *
 *      Wait for packets currently being received to be done.
 *      Does not block later packets from starting.
 *
 *      May sleep. The expedited RCU synchronization is used while the
 *      RTNL is locked, the regular one otherwise.
 */
void synchronize_net(void)
{
        might_sleep();

        if (!rtnl_is_locked()) {
                synchronize_rcu();
                return;
        }

        synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
6068
6069 /**
6070  *      unregister_netdevice_queue - remove device from the kernel
6071  *      @dev: device
6072  *      @head: list
6073  *
6074  *      This function shuts down a device interface and removes it
6075  *      from the kernel tables.
6076  *      If head not NULL, device is queued to be unregistered later.
6077  *
6078  *      Callers must hold the rtnl semaphore.  You may want
6079  *      unregister_netdev() instead of this.
6080  */
6081
6082 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6083 {
6084         ASSERT_RTNL();
6085
6086         if (head) {
6087                 list_move_tail(&dev->unreg_list, head);
6088         } else {
6089                 rollback_registered(dev);
6090                 /* Finish processing unregister after unlock */
6091                 net_set_todo(dev);
6092         }
6093 }
6094 EXPORT_SYMBOL(unregister_netdevice_queue);
6095
6096 /**
6097  *      unregister_netdevice_many - unregister many devices
6098  *      @head: list of devices
6099  */
6100 void unregister_netdevice_many(struct list_head *head)
6101 {
6102         struct net_device *dev;
6103
6104         if (!list_empty(head)) {
6105                 rollback_registered_many(head);
6106                 list_for_each_entry(dev, head, unreg_list)
6107                         net_set_todo(dev);
6108         }
6109 }
6110 EXPORT_SYMBOL(unregister_netdevice_many);
6111
6112 /**
6113  *      unregister_netdev - remove device from the kernel
6114  *      @dev: device
6115  *
6116  *      This function shuts down a device interface and removes it
6117  *      from the kernel tables.
6118  *
6119  *      This is just a wrapper for unregister_netdevice that takes
6120  *      the rtnl semaphore.  In general you want to use this and not
6121  *      unregister_netdevice.
6122  */
6123 void unregister_netdev(struct net_device *dev)
6124 {
6125         rtnl_lock();
6126         unregister_netdevice(dev);
6127         rtnl_unlock();
6128 }
6129 EXPORT_SYMBOL(unregister_netdev);
6130
6131 /**
6132  *      dev_change_net_namespace - move device to different nethost namespace
6133  *      @dev: device
6134  *      @net: network namespace
6135  *      @pat: If not NULL name pattern to try if the current device name
6136  *            is already taken in the destination network namespace.
6137  *
6138  *      This function shuts down a device interface and moves it
6139  *      to a new network namespace. On success 0 is returned, on
6140  *      a failure a netagive errno code is returned.
6141  *
6142  *      Callers must hold the rtnl semaphore.
6143  */
6144
6145 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6146 {
6147         int err;
6148
6149         ASSERT_RTNL();
6150
6151         /* Don't allow namespace local devices to be moved. */
6152         err = -EINVAL;
6153         if (dev->features & NETIF_F_NETNS_LOCAL)
6154                 goto out;
6155
6156         /* Ensure the device has been registrered */
6157         err = -EINVAL;
6158         if (dev->reg_state != NETREG_REGISTERED)
6159                 goto out;
6160
6161         /* Get out if there is nothing todo */
6162         err = 0;
6163         if (net_eq(dev_net(dev), net))
6164                 goto out;
6165
6166         /* Pick the destination device name, and ensure
6167          * we can use it in the destination network namespace.
6168          */
6169         err = -EEXIST;
6170         if (__dev_get_by_name(net, dev->name)) {
6171                 /* We get here if we can't use the current device name */
6172                 if (!pat)
6173                         goto out;
6174                 if (dev_get_valid_name(dev, pat) < 0)
6175                         goto out;
6176         }
6177
6178         /*
6179          * And now a mini version of register_netdevice unregister_netdevice.
6180          */
6181
6182         /* If device is running close it first. */
6183         dev_close(dev);
6184
6185         /* And unlink it from device chain */
6186         err = -ENODEV;
6187         unlist_netdevice(dev);
6188
6189         synchronize_net();
6190
6191         /* Shutdown queueing discipline. */
6192         dev_shutdown(dev);
6193
6194         /* Notify protocols, that we are about to destroy
6195            this device. They should clean all the things.
6196
6197            Note that dev->reg_state stays at NETREG_REGISTERED.
6198            This is wanted because this way 8021q and macvlan know
6199            the device is just moving and can keep their slaves up.
6200         */
6201         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6202         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6203         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
6204
6205         /*
6206          *      Flush the unicast and multicast chains
6207          */
6208         dev_uc_flush(dev);
6209         dev_mc_flush(dev);
6210
6211         /* Actually switch the network namespace */
6212         dev_net_set(dev, net);
6213
6214         /* If there is an ifindex conflict assign a new one */
6215         if (__dev_get_by_index(net, dev->ifindex)) {
6216                 int iflink = (dev->iflink == dev->ifindex);
6217                 dev->ifindex = dev_new_index(net);
6218                 if (iflink)
6219                         dev->iflink = dev->ifindex;
6220         }
6221
6222         /* Fixup kobjects */
6223         err = device_rename(&dev->dev, dev->name);
6224         WARN_ON(err);
6225
6226         /* Add the device back in the hashes */
6227         list_netdevice(dev);
6228
6229         /* Notify protocols, that a new device appeared. */
6230         call_netdevice_notifiers(NETDEV_REGISTER, dev);
6231
6232         /*
6233          *      Prevent userspace races by waiting until the network
6234          *      device is fully setup before sending notifications.
6235          */
6236         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6237
6238         synchronize_net();
6239         err = 0;
6240 out:
6241         return err;
6242 }
6243 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6244
6245 static int dev_cpu_callback(struct notifier_block *nfb,
6246                             unsigned long action,
6247                             void *ocpu)
6248 {
6249         struct sk_buff **list_skb;
6250         struct sk_buff *skb;
6251         unsigned int cpu, oldcpu = (unsigned long)ocpu;
6252         struct softnet_data *sd, *oldsd;
6253
6254         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6255                 return NOTIFY_OK;
6256
6257         local_irq_disable();
6258         cpu = smp_processor_id();
6259         sd = &per_cpu(softnet_data, cpu);
6260         oldsd = &per_cpu(softnet_data, oldcpu);
6261
6262         /* Find end of our completion_queue. */
6263         list_skb = &sd->completion_queue;
6264         while (*list_skb)
6265                 list_skb = &(*list_skb)->next;
6266         /* Append completion queue from offline CPU. */
6267         *list_skb = oldsd->completion_queue;
6268         oldsd->completion_queue = NULL;
6269
6270         /* Append output queue from offline CPU. */
6271         if (oldsd->output_queue) {
6272                 *sd->output_queue_tailp = oldsd->output_queue;
6273                 sd->output_queue_tailp = oldsd->output_queue_tailp;
6274                 oldsd->output_queue = NULL;
6275                 oldsd->output_queue_tailp = &oldsd->output_queue;
6276         }
6277         /* Append NAPI poll list from offline CPU. */
6278         if (!list_empty(&oldsd->poll_list)) {
6279                 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6280                 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6281         }
6282
6283         raise_softirq_irqoff(NET_TX_SOFTIRQ);
6284         local_irq_enable();
6285
6286         /* Process offline CPU's input_pkt_queue */
6287         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6288                 netif_rx(skb);
6289                 input_queue_head_incr(oldsd);
6290         }
6291         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6292                 netif_rx(skb);
6293                 input_queue_head_incr(oldsd);
6294         }
6295
6296         return NOTIFY_OK;
6297 }
6298
6299
6300 /**
6301  *      netdev_increment_features - increment feature set by one
6302  *      @all: current feature set
6303  *      @one: new feature set
6304  *      @mask: mask feature set
6305  *
6306  *      Computes a new feature set after adding a device with feature set
6307  *      @one to the master device with current feature set @all.  Will not
6308  *      enable anything that is off in @mask. Returns the new feature set.
6309  */
6310 netdev_features_t netdev_increment_features(netdev_features_t all,
6311         netdev_features_t one, netdev_features_t mask)
6312 {
6313         if (mask & NETIF_F_GEN_CSUM)
6314                 mask |= NETIF_F_ALL_CSUM;
6315         mask |= NETIF_F_VLAN_CHALLENGED;
6316
6317         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6318         all &= one | ~NETIF_F_ALL_FOR_ALL;
6319
6320         /* If one device supports hw checksumming, set for all. */
6321         if (all & NETIF_F_GEN_CSUM)
6322                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6323
6324         return all;
6325 }
6326 EXPORT_SYMBOL(netdev_increment_features);
6327
6328 static struct hlist_head *netdev_create_hash(void)
6329 {
6330         int i;
6331         struct hlist_head *hash;
6332
6333         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6334         if (hash != NULL)
6335                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6336                         INIT_HLIST_HEAD(&hash[i]);
6337
6338         return hash;
6339 }
6340
6341 /* Initialize per network namespace state */
6342 static int __net_init netdev_init(struct net *net)
6343 {
6344         INIT_LIST_HEAD(&net->dev_base_head);
6345
6346         net->dev_name_head = netdev_create_hash();
6347         if (net->dev_name_head == NULL)
6348                 goto err_name;
6349
6350         net->dev_index_head = netdev_create_hash();
6351         if (net->dev_index_head == NULL)
6352                 goto err_idx;
6353
6354         return 0;
6355
6356 err_idx:
6357         kfree(net->dev_name_head);
6358 err_name:
6359         return -ENOMEM;
6360 }
6361
6362 /**
6363  *      netdev_drivername - network driver for the device
6364  *      @dev: network device
6365  *
6366  *      Determine network driver for device.
6367  */
6368 const char *netdev_drivername(const struct net_device *dev)
6369 {
6370         const struct device_driver *driver;
6371         const struct device *parent;
6372         const char *empty = "";
6373
6374         parent = dev->dev.parent;
6375         if (!parent)
6376                 return empty;
6377
6378         driver = parent->driver;
6379         if (driver && driver->name)
6380                 return driver->name;
6381         return empty;
6382 }
6383
6384 int __netdev_printk(const char *level, const struct net_device *dev,
6385                            struct va_format *vaf)
6386 {
6387         int r;
6388
6389         if (dev && dev->dev.parent)
6390                 r = dev_printk(level, dev->dev.parent, "%s: %pV",
6391                                netdev_name(dev), vaf);
6392         else if (dev)
6393                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6394         else
6395                 r = printk("%s(NULL net_device): %pV", level, vaf);
6396
6397         return r;
6398 }
6399 EXPORT_SYMBOL(__netdev_printk);
6400
6401 int netdev_printk(const char *level, const struct net_device *dev,
6402                   const char *format, ...)
6403 {
6404         struct va_format vaf;
6405         va_list args;
6406         int r;
6407
6408         va_start(args, format);
6409
6410         vaf.fmt = format;
6411         vaf.va = &args;
6412
6413         r = __netdev_printk(level, dev, &vaf);
6414         va_end(args);
6415
6416         return r;
6417 }
6418 EXPORT_SYMBOL(netdev_printk);
6419
6420 #define define_netdev_printk_level(func, level)                 \
6421 int func(const struct net_device *dev, const char *fmt, ...)    \
6422 {                                                               \
6423         int r;                                                  \
6424         struct va_format vaf;                                   \
6425         va_list args;                                           \
6426                                                                 \
6427         va_start(args, fmt);                                    \
6428                                                                 \
6429         vaf.fmt = fmt;                                          \
6430         vaf.va = &args;                                         \
6431                                                                 \
6432         r = __netdev_printk(level, dev, &vaf);                  \
6433         va_end(args);                                           \
6434                                                                 \
6435         return r;                                               \
6436 }                                                               \
6437 EXPORT_SYMBOL(func);
6438
6439 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6440 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6441 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6442 define_netdev_printk_level(netdev_err, KERN_ERR);
6443 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6444 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6445 define_netdev_printk_level(netdev_info, KERN_INFO);
6446
6447 static void __net_exit netdev_exit(struct net *net)
6448 {
6449         kfree(net->dev_name_head);
6450         kfree(net->dev_index_head);
6451 }
6452
6453 static struct pernet_operations __net_initdata netdev_net_ops = {
6454         .init = netdev_init,
6455         .exit = netdev_exit,
6456 };
6457
6458 static void __net_exit default_device_exit(struct net *net)
6459 {
6460         struct net_device *dev, *aux;
6461         /*
6462          * Push all migratable network devices back to the
6463          * initial network namespace
6464          */
6465         rtnl_lock();
6466         for_each_netdev_safe(net, dev, aux) {
6467                 int err;
6468                 char fb_name[IFNAMSIZ];
6469
6470                 /* Ignore unmoveable devices (i.e. loopback) */
6471                 if (dev->features & NETIF_F_NETNS_LOCAL)
6472                         continue;
6473
6474                 /* Leave virtual devices for the generic cleanup */
6475                 if (dev->rtnl_link_ops)
6476                         continue;
6477
6478                 /* Push remaining network devices to init_net */
6479                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6480                 err = dev_change_net_namespace(dev, &init_net, fb_name);
6481                 if (err) {
6482                         pr_emerg("%s: failed to move %s to init_net: %d\n",
6483                                  __func__, dev->name, err);
6484                         BUG();
6485                 }
6486         }
6487         rtnl_unlock();
6488 }
6489
6490 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6491 {
6492         /* At exit all network devices most be removed from a network
6493          * namespace.  Do this in the reverse order of registration.
6494          * Do this across as many network namespaces as possible to
6495          * improve batching efficiency.
6496          */
6497         struct net_device *dev;
6498         struct net *net;
6499         LIST_HEAD(dev_kill_list);
6500
6501         rtnl_lock();
6502         list_for_each_entry(net, net_list, exit_list) {
6503                 for_each_netdev_reverse(net, dev) {
6504                         if (dev->rtnl_link_ops)
6505                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6506                         else
6507                                 unregister_netdevice_queue(dev, &dev_kill_list);
6508                 }
6509         }
6510         unregister_netdevice_many(&dev_kill_list);
6511         list_del(&dev_kill_list);
6512         rtnl_unlock();
6513 }
6514
6515 static struct pernet_operations __net_initdata default_device_ops = {
6516         .exit = default_device_exit,
6517         .exit_batch = default_device_exit_batch,
6518 };
6519
6520 /*
6521  *      Initialize the DEV module. At boot time this walks the device list and
6522  *      unhooks any devices that fail to initialise (normally hardware not
6523  *      present) and leaves us with a valid list of present and active devices.
6524  *
6525  */
6526
6527 /*
6528  *       This is called single threaded during boot, so no need
6529  *       to take the rtnl semaphore.
6530  */
6531 static int __init net_dev_init(void)
6532 {
6533         int i, rc = -ENOMEM;
6534
6535         BUG_ON(!dev_boot_phase);
6536
6537         if (dev_proc_init())
6538                 goto out;
6539
6540         if (netdev_kobject_init())
6541                 goto out;
6542
6543         INIT_LIST_HEAD(&ptype_all);
6544         for (i = 0; i < PTYPE_HASH_SIZE; i++)
6545                 INIT_LIST_HEAD(&ptype_base[i]);
6546
6547         if (register_pernet_subsys(&netdev_net_ops))
6548                 goto out;
6549
6550         /*
6551          *      Initialise the packet receive queues.
6552          */
6553
6554         for_each_possible_cpu(i) {
6555                 struct softnet_data *sd = &per_cpu(softnet_data, i);
6556
6557                 memset(sd, 0, sizeof(*sd));
6558                 skb_queue_head_init(&sd->input_pkt_queue);
6559                 skb_queue_head_init(&sd->process_queue);
6560                 sd->completion_queue = NULL;
6561                 INIT_LIST_HEAD(&sd->poll_list);
6562                 sd->output_queue = NULL;
6563                 sd->output_queue_tailp = &sd->output_queue;
6564 #ifdef CONFIG_RPS
6565                 sd->csd.func = rps_trigger_softirq;
6566                 sd->csd.info = sd;
6567                 sd->csd.flags = 0;
6568                 sd->cpu = i;
6569 #endif
6570
6571                 sd->backlog.poll = process_backlog;
6572                 sd->backlog.weight = weight_p;
6573                 sd->backlog.gro_list = NULL;
6574                 sd->backlog.gro_count = 0;
6575         }
6576
6577         dev_boot_phase = 0;
6578
6579         /* The loopback device is special if any other network devices
6580          * is present in a network namespace the loopback device must
6581          * be present. Since we now dynamically allocate and free the
6582          * loopback device ensure this invariant is maintained by
6583          * keeping the loopback device as the first device on the
6584          * list of network devices.  Ensuring the loopback devices
6585          * is the first device that appears and the last network device
6586          * that disappears.
6587          */
6588         if (register_pernet_device(&loopback_net_ops))
6589                 goto out;
6590
6591         if (register_pernet_device(&default_device_ops))
6592                 goto out;
6593
6594         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6595         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6596
6597         hotcpu_notifier(dev_cpu_callback, 0);
6598         dst_init();
6599         dev_mcast_init();
6600         rc = 0;
6601 out:
6602         return rc;
6603 }
6604
6605 subsys_initcall(net_dev_init);
6606
6607 static int __init initialize_hashrnd(void)
6608 {
6609         get_random_bytes(&hashrnd, sizeof(hashrnd));
6610         return 0;
6611 }
6612
6613 late_initcall_sync(initialize_hashrnd);
6614