net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <asm/system.h>
  77 #include <linux/bitops.h>
  78 #include <linux/capability.h>
  79 #include <linux/cpu.h>
  80 #include <linux/types.h>
  81 #include <linux/kernel.h>
  82 #include <linux/hash.h>
  83 #include <linux/slab.h>
  84 #include <linux/sched.h>
  85 #include <linux/mutex.h>
  86 #include <linux/string.h>
  87 #include <linux/mm.h>
  88 #include <linux/socket.h>
  89 #include <linux/sockios.h>
  90 #include <linux/errno.h>
  91 #include <linux/interrupt.h>
  92 #include <linux/if_ether.h>
  93 #include <linux/netdevice.h>
  94 #include <linux/etherdevice.h>
  95 #include <linux/ethtool.h>
  96 #include <linux/notifier.h>
  97 #include <linux/skbuff.h>
  98 #include <net/net_namespace.h>
  99 #include <net/sock.h>
 100 #include <linux/rtnetlink.h>
 101 #include <linux/proc_fs.h>
 102 #include <linux/seq_file.h>
 103 #include <linux/stat.h>
 104 #include <net/dst.h>
 105 #include <net/pkt_sched.h>
 106 #include <net/checksum.h>
 107 #include <net/xfrm.h>
 108 #include <linux/highmem.h>
 109 #include <linux/init.h>
 110 #include <linux/kmod.h>
 111 #include <linux/module.h>
 112 #include <linux/netpoll.h>
 113 #include <linux/rcupdate.h>
 114 #include <linux/delay.h>
 115 #include <net/wext.h>
 116 #include <net/iw_handler.h>
 117 #include <asm/current.h>
 118 #include <linux/audit.h>
 119 #include <linux/dmaengine.h>
 120 #include <linux/err.h>
 121 #include <linux/ctype.h>
 122 #include <linux/if_arp.h>
 123 #include <linux/if_vlan.h>
 124 #include <linux/ip.h>
 125 #include <net/ip.h>
 126 #include <linux/ipv6.h>
 127 #include <linux/in.h>
 128 #include <linux/jhash.h>
 129 #include <linux/random.h>
 130 #include <trace/events/napi.h>
 131 #include <trace/events/net.h>
 132 #include <trace/events/skb.h>
 133 #include <linux/pci.h>
 134 #include <linux/inetdevice.h>
 135 #include <linux/cpu_rmap.h>
 136 #include <linux/if_tunnel.h>
 137 #include <linux/if_pppox.h>
 138 #include <linux/ppp_defs.h>
 139 #include <linux/net_tstamp.h>
 140 #include <linux/jump_label.h>
 141
 142 #include "net-sysfs.h"
 143
 144 /* Instead of increasing this, you should create a hash table. */
 145 #define MAX_GRO_SKBS 8
 146
 147 /* This should be increased if a protocol with a bigger head is added. */
 148 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 149
 150 /*
 151  *      The list of packet types we will receive (as opposed to discard)
 152  *      and the routines to invoke.
 153  *
 154  *      Why 16. Because with 16 the only overlap we get on a hash of the
 155  *      low nibble of the protocol value is RARP/SNAP/X.25.
 156  *
 157  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 158  *             sure which should go first, but I bet it won't make much
 159  *             difference if we are running VLANs.  The good news is that
 160  *             this protocol won't be in the list unless compiled in, so
 161  *             the average user (w/out VLANs) will not be adversely affected.
 162  *             --BLG
 163  *
 164  *              0800    IP
 165  *              8100    802.1Q VLAN
 166  *              0001    802.3
 167  *              0002    AX.25
 168  *              0004    802.2
 169  *              8035    RARP
 170  *              0005    SNAP
 171  *              0805    X.25
 172  *              0806    ARP
 173  *              8137    IPX
 174  *              0009    Localtalk
 175  *              86DD    IPv6
 176  */
 177
 178 #define PTYPE_HASH_SIZE (16)
 179 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
 180
 181 static DEFINE_SPINLOCK(ptype_lock);
 182 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 183 static struct list_head ptype_all __read_mostly;        /* Taps */
 184
 185 /*
 186  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 187  * semaphore.
 188  *
 189  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 190  *
 191  * Writers must hold the rtnl semaphore while they loop through the
 192  * dev_base_head list, and hold dev_base_lock for writing when they do the
 193  * actual updates.  This allows pure readers to access the list even
 194  * while a writer is preparing to update it.
 195  *
 196  * To put it another way, dev_base_lock is held for writing only to
 197  * protect against pure readers; the rtnl semaphore provides the
 198  * protection against other writers.
 199  *
 200  * See, for example usages, register_netdevice() and
 201  * unregister_netdevice(), which must be called with the rtnl
 202  * semaphore held.
 203  */
 204 DEFINE_RWLOCK(dev_base_lock);
 205 EXPORT_SYMBOL(dev_base_lock);
 206
 207 static inline void dev_base_seq_inc(struct net *net)
 208 {
 209         while (++net->dev_base_seq == 0);
 210 }
 211
 212 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 213 {
 214         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 215         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 216 }
 217
 218 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 219 {
 220         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 221 }
 222
 223 static inline void rps_lock(struct softnet_data *sd)
 224 {
 225 #ifdef CONFIG_RPS
 226         spin_lock(&sd->input_pkt_queue.lock);
 227 #endif
 228 }
 229
 230 static inline void rps_unlock(struct softnet_data *sd)
 231 {
 232 #ifdef CONFIG_RPS
 233         spin_unlock(&sd->input_pkt_queue.lock);
 234 #endif
 235 }
 236
 237 /* Device list insertion */
 238 static int list_netdevice(struct net_device *dev)
 239 {
 240         struct net *net = dev_net(dev);
 241
 242         ASSERT_RTNL();
 243
 244         write_lock_bh(&dev_base_lock);
 245         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 246         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 247         hlist_add_head_rcu(&dev->index_hlist,
 248                            dev_index_hash(net, dev->ifindex));
 249         write_unlock_bh(&dev_base_lock);
 250
 251         dev_base_seq_inc(net);
 252
 253         return 0;
 254 }
 255
 256 /* Device list removal
 257  * caller must respect a RCU grace period before freeing/reusing dev
 258  */
 259 static void unlist_netdevice(struct net_device *dev)
 260 {
 261         ASSERT_RTNL();
 262
 263         /* Unlink dev from the device chain */
 264         write_lock_bh(&dev_base_lock);
 265         list_del_rcu(&dev->dev_list);
 266         hlist_del_rcu(&dev->name_hlist);
 267         hlist_del_rcu(&dev->index_hlist);
 268         write_unlock_bh(&dev_base_lock);
 269
 270         dev_base_seq_inc(dev_net(dev));
 271 }
 272
 273 /*
 274  *      Our notifier list
 275  */
 276
 277 static RAW_NOTIFIER_HEAD(netdev_chain);
 278
 279 /*
 280  *      Device drivers call our routines to queue packets here. We empty the
 281  *      queue in the local softnet handler.
 282  */
 283
 284 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 285 EXPORT_PER_CPU_SYMBOL(softnet_data);
 286
 287 #ifdef CONFIG_LOCKDEP
 288 /*
 289  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 290  * according to dev->type
 291  */
 292 static const unsigned short netdev_lock_type[] =
 293         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 294          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 295          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 296          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 297          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 298          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 299          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 300          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 301          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 302          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 303          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 304          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 305          ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 306          ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
 307          ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
 308          ARPHRD_VOID, ARPHRD_NONE};
 309
 310 static const char *const netdev_lock_name[] =
 311         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 312          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 313          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 314          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 315          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 316          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 317          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 318          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 319          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 320          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 321          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 322          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 323          "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 324          "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
 325          "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
 326          "_xmit_VOID", "_xmit_NONE"};
 327
 328 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 329 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 330
 331 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 332 {
 333         int i;
 334
 335         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 336                 if (netdev_lock_type[i] == dev_type)
 337                         return i;
 338         /* the last key is used by default */
 339         return ARRAY_SIZE(netdev_lock_type) - 1;
 340 }
 341
 342 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 343                                                  unsigned short dev_type)
 344 {
 345         int i;
 346
 347         i = netdev_lock_pos(dev_type);
 348         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 349                                    netdev_lock_name[i]);
 350 }
 351
 352 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 353 {
 354         int i;
 355
 356         i = netdev_lock_pos(dev->type);
 357         lockdep_set_class_and_name(&dev->addr_list_lock,
 358                                    &netdev_addr_lock_key[i],
 359                                    netdev_lock_name[i]);
 360 }
 361 #else
 362 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 363                                                  unsigned short dev_type)
 364 {
 365 }
 366 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 367 {
 368 }
 369 #endif
 370
 371 /*******************************************************************************
 372
 373                 Protocol management and registration routines
 374
 375 *******************************************************************************/
 376
 377 /*
 378  *      Add a protocol ID to the list. Now that the input handler is
 379  *      smarter we can dispense with all the messy stuff that used to be
 380  *      here.
 381  *
 382  *      BEWARE!!! Protocol handlers, mangling input packets,
 383  *      MUST BE last in hash buckets and checking protocol handlers
 384  *      MUST start from promiscuous ptype_all chain in net_bh.
 385  *      It is true now, do not change it.
 386  *      Explanation follows: if protocol handler, mangling packet, will
 387  *      be the first on list, it is not able to sense, that packet
 388  *      is cloned and should be copied-on-write, so that it will
 389  *      change it and subsequent readers will get broken packet.
 390  *                                                      --ANK (980803)
 391  */
 392
 393 static inline struct list_head *ptype_head(const struct packet_type *pt)
 394 {
 395         if (pt->type == htons(ETH_P_ALL))
 396                 return &ptype_all;
 397         else
 398                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 399 }
 400
 401 /**
 402  *      dev_add_pack - add packet handler
 403  *      @pt: packet type declaration
 404  *
 405  *      Add a protocol handler to the networking stack. The passed &packet_type
 406  *      is linked into kernel lists and may not be freed until it has been
 407  *      removed from the kernel lists.
 408  *
 409  *      This call does not sleep therefore it can not
 410  *      guarantee all CPU's that are in middle of receiving packets
 411  *      will see the new packet type (until the next received packet).
 412  */
 413
 414 void dev_add_pack(struct packet_type *pt)
 415 {
 416         struct list_head *head = ptype_head(pt);
 417
 418         spin_lock(&ptype_lock);
 419         list_add_rcu(&pt->list, head);
 420         spin_unlock(&ptype_lock);
 421 }
 422 EXPORT_SYMBOL(dev_add_pack);
 423
 424 /**
 425  *      __dev_remove_pack        - remove packet handler
 426  *      @pt: packet type declaration
 427  *
 428  *      Remove a protocol handler that was previously added to the kernel
 429  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 430  *      from the kernel lists and can be freed or reused once this function
 431  *      returns.
 432  *
 433  *      The packet type might still be in use by receivers
 434  *      and must not be freed until after all the CPU's have gone
 435  *      through a quiescent state.
 436  */
 437 void __dev_remove_pack(struct packet_type *pt)
 438 {
 439         struct list_head *head = ptype_head(pt);
 440         struct packet_type *pt1;
 441
 442         spin_lock(&ptype_lock);
 443
 444         list_for_each_entry(pt1, head, list) {
 445                 if (pt == pt1) {
 446                         list_del_rcu(&pt->list);
 447                         goto out;
 448                 }
 449         }
 450
 451         printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 452 out:
 453         spin_unlock(&ptype_lock);
 454 }
 455 EXPORT_SYMBOL(__dev_remove_pack);
 456
 457 /**
 458  *      dev_remove_pack  - remove packet handler
 459  *      @pt: packet type declaration
 460  *
 461  *      Remove a protocol handler that was previously added to the kernel
 462  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 463  *      from the kernel lists and can be freed or reused once this function
 464  *      returns.
 465  *
 466  *      This call sleeps to guarantee that no CPU is looking at the packet
 467  *      type after return.
 468  */
 469 void dev_remove_pack(struct packet_type *pt)
 470 {
 471         __dev_remove_pack(pt);
 472
 473         synchronize_net();
 474 }
 475 EXPORT_SYMBOL(dev_remove_pack);
 476
 477 /******************************************************************************
 478
 479                       Device Boot-time Settings Routines
 480
 481 *******************************************************************************/
 482
 483 /* Boot time configuration table */
 484 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 485
 486 /**
 487  *      netdev_boot_setup_add   - add new setup entry
 488  *      @name: name of the device
 489  *      @map: configured settings for the device
 490  *
 491  *      Adds new setup entry to the dev_boot_setup list.  The function
 492  *      returns 0 on error and 1 on success.  This is a generic routine to
 493  *      all netdevices.
 494  */
 495 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 496 {
 497         struct netdev_boot_setup *s;
 498         int i;
 499
 500         s = dev_boot_setup;
 501         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 502                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 503                         memset(s[i].name, 0, sizeof(s[i].name));
 504                         strlcpy(s[i].name, name, IFNAMSIZ);
 505                         memcpy(&s[i].map, map, sizeof(s[i].map));
 506                         break;
 507                 }
 508         }
 509
 510         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 511 }
 512
 513 /**
 514  *      netdev_boot_setup_check - check boot time settings
 515  *      @dev: the netdevice
 516  *
 517  *      Check boot time settings for the device.
 518  *      The found settings are set for the device to be used
 519  *      later in the device probing.
 520  *      Returns 0 if no settings found, 1 if they are.
 521  */
 522 int netdev_boot_setup_check(struct net_device *dev)
 523 {
 524         struct netdev_boot_setup *s = dev_boot_setup;
 525         int i;
 526
 527         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 528                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 529                     !strcmp(dev->name, s[i].name)) {
 530                         dev->irq        = s[i].map.irq;
 531                         dev->base_addr  = s[i].map.base_addr;
 532                         dev->mem_start  = s[i].map.mem_start;
 533                         dev->mem_end    = s[i].map.mem_end;
 534                         return 1;
 535                 }
 536         }
 537         return 0;
 538 }
 539 EXPORT_SYMBOL(netdev_boot_setup_check);
 540
 541
 542 /**
 543  *      netdev_boot_base        - get address from boot time settings
 544  *      @prefix: prefix for network device
 545  *      @unit: id for network device
 546  *
 547  *      Check boot time settings for the base address of device.
 548  *      The found settings are set for the device to be used
 549  *      later in the device probing.
 550  *      Returns 0 if no settings found.
 551  */
 552 unsigned long netdev_boot_base(const char *prefix, int unit)
 553 {
 554         const struct netdev_boot_setup *s = dev_boot_setup;
 555         char name[IFNAMSIZ];
 556         int i;
 557
 558         sprintf(name, "%s%d", prefix, unit);
 559
 560         /*
 561          * If device already registered then return base of 1
 562          * to indicate not to probe for this interface
 563          */
 564         if (__dev_get_by_name(&init_net, name))
 565                 return 1;
 566
 567         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 568                 if (!strcmp(name, s[i].name))
 569                         return s[i].map.base_addr;
 570         return 0;
 571 }
 572
 573 /*
 574  * Saves at boot time configured settings for any netdevice.
 575  */
 576 int __init netdev_boot_setup(char *str)
 577 {
 578         int ints[5];
 579         struct ifmap map;
 580
 581         str = get_options(str, ARRAY_SIZE(ints), ints);
 582         if (!str || !*str)
 583                 return 0;
 584
 585         /* Save settings */
 586         memset(&map, 0, sizeof(map));
 587         if (ints[0] > 0)
 588                 map.irq = ints[1];
 589         if (ints[0] > 1)
 590                 map.base_addr = ints[2];
 591         if (ints[0] > 2)
 592                 map.mem_start = ints[3];
 593         if (ints[0] > 3)
 594                 map.mem_end = ints[4];
 595
 596         /* Add new entry to the list */
 597         return netdev_boot_setup_add(str, &map);
 598 }
 599
 600 __setup("netdev=", netdev_boot_setup);
 601
 602 /*******************************************************************************
 603
 604                             Device Interface Subroutines
 605
 606 *******************************************************************************/
 607
 608 /**
 609  *      __dev_get_by_name       - find a device by its name
 610  *      @net: the applicable net namespace
 611  *      @name: name to find
 612  *
 613  *      Find an interface by name. Must be called under RTNL semaphore
 614  *      or @dev_base_lock. If the name is found a pointer to the device
 615  *      is returned. If the name is not found then %NULL is returned. The
 616  *      reference counters are not incremented so the caller must be
 617  *      careful with locks.
 618  */
 619
 620 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 621 {
 622         struct hlist_node *p;
 623         struct net_device *dev;
 624         struct hlist_head *head = dev_name_hash(net, name);
 625
 626         hlist_for_each_entry(dev, p, head, name_hlist)
 627                 if (!strncmp(dev->name, name, IFNAMSIZ))
 628                         return dev;
 629
 630         return NULL;
 631 }
 632 EXPORT_SYMBOL(__dev_get_by_name);
 633
 634 /**
 635  *      dev_get_by_name_rcu     - find a device by its name
 636  *      @net: the applicable net namespace
 637  *      @name: name to find
 638  *
 639  *      Find an interface by name.
 640  *      If the name is found a pointer to the device is returned.
 641  *      If the name is not found then %NULL is returned.
 642  *      The reference counters are not incremented so the caller must be
 643  *      careful with locks. The caller must hold RCU lock.
 644  */
 645
 646 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 647 {
 648         struct hlist_node *p;
 649         struct net_device *dev;
 650         struct hlist_head *head = dev_name_hash(net, name);
 651
 652         hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 653                 if (!strncmp(dev->name, name, IFNAMSIZ))
 654                         return dev;
 655
 656         return NULL;
 657 }
 658 EXPORT_SYMBOL(dev_get_by_name_rcu);
 659
 660 /**
 661  *      dev_get_by_name         - find a device by its name
 662  *      @net: the applicable net namespace
 663  *      @name: name to find
 664  *
 665  *      Find an interface by name. This can be called from any
 666  *      context and does its own locking. The returned handle has
 667  *      the usage count incremented and the caller must use dev_put() to
 668  *      release it when it is no longer needed. %NULL is returned if no
 669  *      matching device is found.
 670  */
 671
 672 struct net_device *dev_get_by_name(struct net *net, const char *name)
 673 {
 674         struct net_device *dev;
 675
 676         rcu_read_lock();
 677         dev = dev_get_by_name_rcu(net, name);
 678         if (dev)
 679                 dev_hold(dev);
 680         rcu_read_unlock();
 681         return dev;
 682 }
 683 EXPORT_SYMBOL(dev_get_by_name);
 684
 685 /**
 686  *      __dev_get_by_index - find a device by its ifindex
 687  *      @net: the applicable net namespace
 688  *      @ifindex: index of device
 689  *
 690  *      Search for an interface by index. Returns %NULL if the device
 691  *      is not found or a pointer to the device. The device has not
 692  *      had its reference counter increased so the caller must be careful
 693  *      about locking. The caller must hold either the RTNL semaphore
 694  *      or @dev_base_lock.
 695  */
 696
 697 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 698 {
 699         struct hlist_node *p;
 700         struct net_device *dev;
 701         struct hlist_head *head = dev_index_hash(net, ifindex);
 702
 703         hlist_for_each_entry(dev, p, head, index_hlist)
 704                 if (dev->ifindex == ifindex)
 705                         return dev;
 706
 707         return NULL;
 708 }
 709 EXPORT_SYMBOL(__dev_get_by_index);
 710
 711 /**
 712  *      dev_get_by_index_rcu - find a device by its ifindex
 713  *      @net: the applicable net namespace
 714  *      @ifindex: index of device
 715  *
 716  *      Search for an interface by index. Returns %NULL if the device
 717  *      is not found or a pointer to the device. The device has not
 718  *      had its reference counter increased so the caller must be careful
 719  *      about locking. The caller must hold RCU lock.
 720  */
 721
 722 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 723 {
 724         struct hlist_node *p;
 725         struct net_device *dev;
 726         struct hlist_head *head = dev_index_hash(net, ifindex);
 727
 728         hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 729                 if (dev->ifindex == ifindex)
 730                         return dev;
 731
 732         return NULL;
 733 }
 734 EXPORT_SYMBOL(dev_get_by_index_rcu);
 735
 736
 737 /**
 738  *      dev_get_by_index - find a device by its ifindex
 739  *      @net: the applicable net namespace
 740  *      @ifindex: index of device
 741  *
 742  *      Search for an interface by index. Returns NULL if the device
 743  *      is not found or a pointer to the device. The device returned has
 744  *      had a reference added and the pointer is safe until the user calls
 745  *      dev_put to indicate they have finished with it.
 746  */
 747
 748 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 749 {
 750         struct net_device *dev;
 751
 752         rcu_read_lock();
 753         dev = dev_get_by_index_rcu(net, ifindex);
 754         if (dev)
 755                 dev_hold(dev);
 756         rcu_read_unlock();
 757         return dev;
 758 }
 759 EXPORT_SYMBOL(dev_get_by_index);
 760
 761 /**
 762  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 763  *      @net: the applicable net namespace
 764  *      @type: media type of device
 765  *      @ha: hardware address
 766  *
 767  *      Search for an interface by MAC address. Returns NULL if the device
 768  *      is not found or a pointer to the device.
 769  *      The caller must hold RCU or RTNL.
 770  *      The returned device has not had its ref count increased
 771  *      and the caller must therefore be careful about locking
 772  *
 773  */
 774
 775 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 776                                        const char *ha)
 777 {
 778         struct net_device *dev;
 779
 780         for_each_netdev_rcu(net, dev)
 781                 if (dev->type == type &&
 782                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 783                         return dev;
 784
 785         return NULL;
 786 }
 787 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 788
 789 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 790 {
 791         struct net_device *dev;
 792
 793         ASSERT_RTNL();
 794         for_each_netdev(net, dev)
 795                 if (dev->type == type)
 796                         return dev;
 797
 798         return NULL;
 799 }
 800 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 801
 802 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 803 {
 804         struct net_device *dev, *ret = NULL;
 805
 806         rcu_read_lock();
 807         for_each_netdev_rcu(net, dev)
 808                 if (dev->type == type) {
 809                         dev_hold(dev);
 810                         ret = dev;
 811                         break;
 812                 }
 813         rcu_read_unlock();
 814         return ret;
 815 }
 816 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 817
 818 /**
 819  *      dev_get_by_flags_rcu - find any device with given flags
 820  *      @net: the applicable net namespace
 821  *      @if_flags: IFF_* values
 822  *      @mask: bitmask of bits in if_flags to check
 823  *
 824  *      Search for any interface with the given flags. Returns NULL if a device
 825  *      is not found or a pointer to the device. Must be called inside
 826  *      rcu_read_lock(), and result refcount is unchanged.
 827  */
 828
 829 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 830                                     unsigned short mask)
 831 {
 832         struct net_device *dev, *ret;
 833
 834         ret = NULL;
 835         for_each_netdev_rcu(net, dev) {
 836                 if (((dev->flags ^ if_flags) & mask) == 0) {
 837                         ret = dev;
 838                         break;
 839                 }
 840         }
 841         return ret;
 842 }
 843 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 844
 845 /**
 846  *      dev_valid_name - check if name is okay for network device
 847  *      @name: name string
 848  *
 849  *      Network device names need to be valid file names to
 850  *      to allow sysfs to work.  We also disallow any kind of
 851  *      whitespace.
 852  */
 853 int dev_valid_name(const char *name)
 854 {
 855         if (*name == '\0')
 856                 return 0;
 857         if (strlen(name) >= IFNAMSIZ)
 858                 return 0;
 859         if (!strcmp(name, ".") || !strcmp(name, ".."))
 860                 return 0;
 861
 862         while (*name) {
 863                 if (*name == '/' || isspace(*name))
 864                         return 0;
 865                 name++;
 866         }
 867         return 1;
 868 }
 869 EXPORT_SYMBOL(dev_valid_name);
 870
 871 /**
 872  *      __dev_alloc_name - allocate a name for a device
 873  *      @net: network namespace to allocate the device name in
 874  *      @name: name format string
 875  *      @buf:  scratch buffer and result name string
 876  *
 877  *      Passed a format string - eg "lt%d" it will try and find a suitable
 878  *      id. It scans list of devices to build up a free map, then chooses
 879  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 880  *      while allocating the name and adding the device in order to avoid
 881  *      duplicates.
 882  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 883  *      Returns the number of the unit assigned or a negative errno code.
 884  */
 885
 886 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 887 {
 888         int i = 0;
 889         const char *p;
 890         const int max_netdevices = 8*PAGE_SIZE;
 891         unsigned long *inuse;
 892         struct net_device *d;
 893
 894         p = strnchr(name, IFNAMSIZ-1, '%');
 895         if (p) {
 896                 /*
 897                  * Verify the string as this thing may have come from
 898                  * the user.  There must be either one "%d" and no other "%"
 899                  * characters.
 900                  */
 901                 if (p[1] != 'd' || strchr(p + 2, '%'))
 902                         return -EINVAL;
 903
 904                 /* Use one page as a bit array of possible slots */
 905                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 906                 if (!inuse)
 907                         return -ENOMEM;
 908
 909                 for_each_netdev(net, d) {
 910                         if (!sscanf(d->name, name, &i))
 911                                 continue;
 912                         if (i < 0 || i >= max_netdevices)
 913                                 continue;
 914
 915                         /*  avoid cases where sscanf is not exact inverse of printf */
 916                         snprintf(buf, IFNAMSIZ, name, i);
 917                         if (!strncmp(buf, d->name, IFNAMSIZ))
 918                                 set_bit(i, inuse);
 919                 }
 920
 921                 i = find_first_zero_bit(inuse, max_netdevices);
 922                 free_page((unsigned long) inuse);
 923         }
 924
 925         if (buf != name)
 926                 snprintf(buf, IFNAMSIZ, name, i);
 927         if (!__dev_get_by_name(net, buf))
 928                 return i;
 929
 930         /* It is possible to run out of possible slots
 931          * when the name is long and there isn't enough space left
 932          * for the digits, or if all bits are used.
 933          */
 934         return -ENFILE;
 935 }
 936
 937 /**
 938  *      dev_alloc_name - allocate a name for a device
 939  *      @dev: device
 940  *      @name: name format string
 941  *
 942  *      Passed a format string - eg "lt%d" it will try and find a suitable
 943  *      id. It scans list of devices to build up a free map, then chooses
 944  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 945  *      while allocating the name and adding the device in order to avoid
 946  *      duplicates.
 947  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 948  *      Returns the number of the unit assigned or a negative errno code.
 949  */
 950
 951 int dev_alloc_name(struct net_device *dev, const char *name)
 952 {
 953         char buf[IFNAMSIZ];
 954         struct net *net;
 955         int ret;
 956
 957         BUG_ON(!dev_net(dev));
 958         net = dev_net(dev);
 959         ret = __dev_alloc_name(net, name, buf);
 960         if (ret >= 0)
 961                 strlcpy(dev->name, buf, IFNAMSIZ);
 962         return ret;
 963 }
 964 EXPORT_SYMBOL(dev_alloc_name);
 965
 966 static int dev_get_valid_name(struct net_device *dev, const char *name)
 967 {
 968         struct net *net;
 969
 970         BUG_ON(!dev_net(dev));
 971         net = dev_net(dev);
 972
 973         if (!dev_valid_name(name))
 974                 return -EINVAL;
 975
 976         if (strchr(name, '%'))
 977                 return dev_alloc_name(dev, name);
 978         else if (__dev_get_by_name(net, name))
 979                 return -EEXIST;
 980         else if (dev->name != name)
 981                 strlcpy(dev->name, name, IFNAMSIZ);
 982
 983         return 0;
 984 }
 985
 986 /**
 987  *      dev_change_name - change name of a device
 988  *      @dev: device
 989  *      @newname: name (or format string) must be at least IFNAMSIZ
 990  *
 991  *      Change name of a device, can pass format strings "eth%d".
 992  *      for wildcarding.
 993  */
 994 int dev_change_name(struct net_device *dev, const char *newname)
 995 {
 996         char oldname[IFNAMSIZ];
 997         int err = 0;
 998         int ret;
 999         struct net *net;
1000
1001         ASSERT_RTNL();
1002         BUG_ON(!dev_net(dev));
1003
1004         net = dev_net(dev);
1005         if (dev->flags & IFF_UP)
1006                 return -EBUSY;
1007
1008         if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
1009                 return 0;
1010
1011         memcpy(oldname, dev->name, IFNAMSIZ);
1012
1013         err = dev_get_valid_name(dev, newname);
1014         if (err < 0)
1015                 return err;
1016
1017 rollback:
1018         ret = device_rename(&dev->dev, dev->name);
1019         if (ret) {
1020                 memcpy(dev->name, oldname, IFNAMSIZ);
1021                 return ret;
1022         }
1023
1024         write_lock_bh(&dev_base_lock);
1025         hlist_del_rcu(&dev->name_hlist);
1026         write_unlock_bh(&dev_base_lock);
1027
1028         synchronize_rcu();
1029
1030         write_lock_bh(&dev_base_lock);
1031         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1032         write_unlock_bh(&dev_base_lock);
1033
1034         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1035         ret = notifier_to_errno(ret);
1036
1037         if (ret) {
1038                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1039                 if (err >= 0) {
1040                         err = ret;
1041                         memcpy(dev->name, oldname, IFNAMSIZ);
1042                         goto rollback;
1043                 } else {
1044                         printk(KERN_ERR
1045                                "%s: name change rollback failed: %d.\n",
1046                                dev->name, ret);
1047                 }
1048         }
1049
1050         return err;
1051 }
1052
1053 /**
1054  *      dev_set_alias - change ifalias of a device
1055  *      @dev: device
1056  *      @alias: name up to IFALIASZ
1057  *      @len: limit of bytes to copy from info
1058  *
1059  *      Set ifalias for a device,
1060  */
1061 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1062 {
1063         ASSERT_RTNL();
1064
1065         if (len >= IFALIASZ)
1066                 return -EINVAL;
1067
1068         if (!len) {
1069                 if (dev->ifalias) {
1070                         kfree(dev->ifalias);
1071                         dev->ifalias = NULL;
1072                 }
1073                 return 0;
1074         }
1075
1076         dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1077         if (!dev->ifalias)
1078                 return -ENOMEM;
1079
1080         strlcpy(dev->ifalias, alias, len+1);
1081         return len;
1082 }
1083
1084
1085 /**
1086  *      netdev_features_change - device changes features
1087  *      @dev: device to cause notification
1088  *
1089  *      Called to indicate a device has changed features.
1090  */
1091 void netdev_features_change(struct net_device *dev)
1092 {
1093         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1094 }
1095 EXPORT_SYMBOL(netdev_features_change);
1096
1097 /**
1098  *      netdev_state_change - device changes state
1099  *      @dev: device to cause notification
1100  *
1101  *      Called to indicate a device has changed state. This function calls
1102  *      the notifier chains for netdev_chain and sends a NEWLINK message
1103  *      to the routing socket.
1104  */
1105 void netdev_state_change(struct net_device *dev)
1106 {
1107         if (dev->flags & IFF_UP) {
1108                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1109                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1110         }
1111 }
1112 EXPORT_SYMBOL(netdev_state_change);
1113
1114 int netdev_bonding_change(struct net_device *dev, unsigned long event)
1115 {
1116         return call_netdevice_notifiers(event, dev);
1117 }
1118 EXPORT_SYMBOL(netdev_bonding_change);
1119
1120 /**
1121  *      dev_load        - load a network module
1122  *      @net: the applicable net namespace
1123  *      @name: name of interface
1124  *
1125  *      If a network interface is not present and the process has suitable
1126  *      privileges this function loads the module. If module loading is not
1127  *      available in this kernel then it becomes a nop.
1128  */
1129
1130 void dev_load(struct net *net, const char *name)
1131 {
1132         struct net_device *dev;
1133         int no_module;
1134
1135         rcu_read_lock();
1136         dev = dev_get_by_name_rcu(net, name);
1137         rcu_read_unlock();
1138
1139         no_module = !dev;
1140         if (no_module && capable(CAP_NET_ADMIN))
1141                 no_module = request_module("netdev-%s", name);
1142         if (no_module && capable(CAP_SYS_MODULE)) {
1143                 if (!request_module("%s", name))
1144                         pr_err("Loading kernel module for a network device "
1145 "with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s "
1146 "instead\n", name);
1147         }
1148 }
1149 EXPORT_SYMBOL(dev_load);
1150
1151 static int __dev_open(struct net_device *dev)
1152 {
1153         const struct net_device_ops *ops = dev->netdev_ops;
1154         int ret;
1155
1156         ASSERT_RTNL();
1157
1158         if (!netif_device_present(dev))
1159                 return -ENODEV;
1160
1161         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1162         ret = notifier_to_errno(ret);
1163         if (ret)
1164                 return ret;
1165
1166         set_bit(__LINK_STATE_START, &dev->state);
1167
1168         if (ops->ndo_validate_addr)
1169                 ret = ops->ndo_validate_addr(dev);
1170
1171         if (!ret && ops->ndo_open)
1172                 ret = ops->ndo_open(dev);
1173
1174         if (ret)
1175                 clear_bit(__LINK_STATE_START, &dev->state);
1176         else {
1177                 dev->flags |= IFF_UP;
1178                 net_dmaengine_get();
1179                 dev_set_rx_mode(dev);
1180                 dev_activate(dev);
1181         }
1182
1183         return ret;
1184 }
1185
1186 /**
1187  *      dev_open        - prepare an interface for use.
1188  *      @dev:   device to open
1189  *
1190  *      Takes a device from down to up state. The device's private open
1191  *      function is invoked and then the multicast lists are loaded. Finally
1192  *      the device is moved into the up state and a %NETDEV_UP message is
1193  *      sent to the netdev notifier chain.
1194  *
1195  *      Calling this function on an active interface is a nop. On a failure
1196  *      a negative errno code is returned.
1197  */
1198 int dev_open(struct net_device *dev)
1199 {
1200         int ret;
1201
1202         if (dev->flags & IFF_UP)
1203                 return 0;
1204
1205         ret = __dev_open(dev);
1206         if (ret < 0)
1207                 return ret;
1208
1209         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1210         call_netdevice_notifiers(NETDEV_UP, dev);
1211
1212         return ret;
1213 }
1214 EXPORT_SYMBOL(dev_open);
1215
1216 static int __dev_close_many(struct list_head *head)
1217 {
1218         struct net_device *dev;
1219
1220         ASSERT_RTNL();
1221         might_sleep();
1222
1223         list_for_each_entry(dev, head, unreg_list) {
1224                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1225
1226                 clear_bit(__LINK_STATE_START, &dev->state);
1227
1228                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1229                  * can be even on different cpu. So just clear netif_running().
1230                  *
1231                  * dev->stop() will invoke napi_disable() on all of it's
1232                  * napi_struct instances on this device.
1233                  */
1234                 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1235         }
1236
1237         dev_deactivate_many(head);
1238
1239         list_for_each_entry(dev, head, unreg_list) {
1240                 const struct net_device_ops *ops = dev->netdev_ops;
1241
1242                 /*
1243                  *      Call the device specific close. This cannot fail.
1244                  *      Only if device is UP
1245                  *
1246                  *      We allow it to be called even after a DETACH hot-plug
1247                  *      event.
1248                  */
1249                 if (ops->ndo_stop)
1250                         ops->ndo_stop(dev);
1251
1252                 dev->flags &= ~IFF_UP;
1253                 net_dmaengine_put();
1254         }
1255
1256         return 0;
1257 }
1258
1259 static int __dev_close(struct net_device *dev)
1260 {
1261         int retval;
1262         LIST_HEAD(single);
1263
1264         list_add(&dev->unreg_list, &single);
1265         retval = __dev_close_many(&single);
1266         list_del(&single);
1267         return retval;
1268 }
1269
1270 static int dev_close_many(struct list_head *head)
1271 {
1272         struct net_device *dev, *tmp;
1273         LIST_HEAD(tmp_list);
1274
1275         list_for_each_entry_safe(dev, tmp, head, unreg_list)
1276                 if (!(dev->flags & IFF_UP))
1277                         list_move(&dev->unreg_list, &tmp_list);
1278
1279         __dev_close_many(head);
1280
1281         list_for_each_entry(dev, head, unreg_list) {
1282                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1283                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1284         }
1285
1286         /* rollback_registered_many needs the complete original list */
1287         list_splice(&tmp_list, head);
1288         return 0;
1289 }
1290
1291 /**
1292  *      dev_close - shutdown an interface.
1293  *      @dev: device to shutdown
1294  *
1295  *      This function moves an active device into down state. A
1296  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1297  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1298  *      chain.
1299  */
1300 int dev_close(struct net_device *dev)
1301 {
1302         if (dev->flags & IFF_UP) {
1303                 LIST_HEAD(single);
1304
1305                 list_add(&dev->unreg_list, &single);
1306                 dev_close_many(&single);
1307                 list_del(&single);
1308         }
1309         return 0;
1310 }
1311 EXPORT_SYMBOL(dev_close);
1312
1313
1314 /**
1315  *      dev_disable_lro - disable Large Receive Offload on a device
1316  *      @dev: device
1317  *
1318  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1319  *      called under RTNL.  This is needed if received packets may be
1320  *      forwarded to another interface.
1321  */
1322 void dev_disable_lro(struct net_device *dev)
1323 {
1324         /*
1325          * If we're trying to disable lro on a vlan device
1326          * use the underlying physical device instead
1327          */
1328         if (is_vlan_dev(dev))
1329                 dev = vlan_dev_real_dev(dev);
1330
1331         dev->wanted_features &= ~NETIF_F_LRO;
1332         netdev_update_features(dev);
1333
1334         if (unlikely(dev->features & NETIF_F_LRO))
1335                 netdev_WARN(dev, "failed to disable LRO!\n");
1336 }
1337 EXPORT_SYMBOL(dev_disable_lro);
1338
1339
1340 static int dev_boot_phase = 1;
1341
1342 /**
1343  *      register_netdevice_notifier - register a network notifier block
1344  *      @nb: notifier
1345  *
1346  *      Register a notifier to be called when network device events occur.
1347  *      The notifier passed is linked into the kernel structures and must
1348  *      not be reused until it has been unregistered. A negative errno code
1349  *      is returned on a failure.
1350  *
1351  *      When registered all registration and up events are replayed
1352  *      to the new notifier to allow device to have a race free
1353  *      view of the network device list.
1354  */
1355
1356 int register_netdevice_notifier(struct notifier_block *nb)
1357 {
1358         struct net_device *dev;
1359         struct net_device *last;
1360         struct net *net;
1361         int err;
1362
1363         rtnl_lock();
1364         err = raw_notifier_chain_register(&netdev_chain, nb);
1365         if (err)
1366                 goto unlock;
1367         if (dev_boot_phase)
1368                 goto unlock;
1369         for_each_net(net) {
1370                 for_each_netdev(net, dev) {
1371                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1372                         err = notifier_to_errno(err);
1373                         if (err)
1374                                 goto rollback;
1375
1376                         if (!(dev->flags & IFF_UP))
1377                                 continue;
1378
1379                         nb->notifier_call(nb, NETDEV_UP, dev);
1380                 }
1381         }
1382
1383 unlock:
1384         rtnl_unlock();
1385         return err;
1386
1387 rollback:
1388         last = dev;
1389         for_each_net(net) {
1390                 for_each_netdev(net, dev) {
1391                         if (dev == last)
1392                                 break;
1393
1394                         if (dev->flags & IFF_UP) {
1395                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1396                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1397                         }
1398                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1399                         nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1400                 }
1401         }
1402
1403         raw_notifier_chain_unregister(&netdev_chain, nb);
1404         goto unlock;
1405 }
1406 EXPORT_SYMBOL(register_netdevice_notifier);
1407
1408 /**
1409  *      unregister_netdevice_notifier - unregister a network notifier block
1410  *      @nb: notifier
1411  *
1412  *      Unregister a notifier previously registered by
1413  *      register_netdevice_notifier(). The notifier is unlinked into the
1414  *      kernel structures and may then be reused. A negative errno code
1415  *      is returned on a failure.
1416  */
1417
1418 int unregister_netdevice_notifier(struct notifier_block *nb)
1419 {
1420         int err;
1421
1422         rtnl_lock();
1423         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1424         rtnl_unlock();
1425         return err;
1426 }
1427 EXPORT_SYMBOL(unregister_netdevice_notifier);
1428
1429 /**
1430  *      call_netdevice_notifiers - call all network notifier blocks
1431  *      @val: value passed unmodified to notifier function
1432  *      @dev: net_device pointer passed unmodified to notifier function
1433  *
1434  *      Call all network notifier blocks.  Parameters and return value
1435  *      are as for raw_notifier_call_chain().
1436  */
1437
1438 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1439 {
1440         ASSERT_RTNL();
1441         return raw_notifier_call_chain(&netdev_chain, val, dev);
1442 }
1443 EXPORT_SYMBOL(call_netdevice_notifiers);
1444
1445 static struct jump_label_key netstamp_needed __read_mostly;
1446
1447 void net_enable_timestamp(void)
1448 {
1449         jump_label_inc(&netstamp_needed);
1450 }
1451 EXPORT_SYMBOL(net_enable_timestamp);
1452
1453 void net_disable_timestamp(void)
1454 {
1455         jump_label_dec(&netstamp_needed);
1456 }
1457 EXPORT_SYMBOL(net_disable_timestamp);
1458
1459 static inline void net_timestamp_set(struct sk_buff *skb)
1460 {
1461         skb->tstamp.tv64 = 0;
1462         if (static_branch(&netstamp_needed))
1463                 __net_timestamp(skb);
1464 }
1465
1466 #define net_timestamp_check(COND, SKB)                  \
1467         if (static_branch(&netstamp_needed)) {          \
1468                 if ((COND) && !(SKB)->tstamp.tv64)      \
1469                         __net_timestamp(SKB);           \
1470         }                                               \
1471
1472 static int net_hwtstamp_validate(struct ifreq *ifr)
1473 {
1474         struct hwtstamp_config cfg;
1475         enum hwtstamp_tx_types tx_type;
1476         enum hwtstamp_rx_filters rx_filter;
1477         int tx_type_valid = 0;
1478         int rx_filter_valid = 0;
1479
1480         if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1481                 return -EFAULT;
1482
1483         if (cfg.flags) /* reserved for future extensions */
1484                 return -EINVAL;
1485
1486         tx_type = cfg.tx_type;
1487         rx_filter = cfg.rx_filter;
1488
1489         switch (tx_type) {
1490         case HWTSTAMP_TX_OFF:
1491         case HWTSTAMP_TX_ON:
1492         case HWTSTAMP_TX_ONESTEP_SYNC:
1493                 tx_type_valid = 1;
1494                 break;
1495         }
1496
1497         switch (rx_filter) {
1498         case HWTSTAMP_FILTER_NONE:
1499         case HWTSTAMP_FILTER_ALL:
1500         case HWTSTAMP_FILTER_SOME:
1501         case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1502         case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1503         case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1504         case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1505         case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1506         case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1507         case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1508         case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1509         case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1510         case HWTSTAMP_FILTER_PTP_V2_EVENT:
1511         case HWTSTAMP_FILTER_PTP_V2_SYNC:
1512         case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1513                 rx_filter_valid = 1;
1514                 break;
1515         }
1516
1517         if (!tx_type_valid || !rx_filter_valid)
1518                 return -ERANGE;
1519
1520         return 0;
1521 }
1522
1523 static inline bool is_skb_forwardable(struct net_device *dev,
1524                                       struct sk_buff *skb)
1525 {
1526         unsigned int len;
1527
1528         if (!(dev->flags & IFF_UP))
1529                 return false;
1530
1531         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1532         if (skb->len <= len)
1533                 return true;
1534
1535         /* if TSO is enabled, we don't care about the length as the packet
1536          * could be forwarded without being segmented before
1537          */
1538         if (skb_is_gso(skb))
1539                 return true;
1540
1541         return false;
1542 }
1543
1544 /**
1545  * dev_forward_skb - loopback an skb to another netif
1546  *
1547  * @dev: destination network device
1548  * @skb: buffer to forward
1549  *
1550  * return values:
1551  *      NET_RX_SUCCESS  (no congestion)
1552  *      NET_RX_DROP     (packet was dropped, but freed)
1553  *
1554  * dev_forward_skb can be used for injecting an skb from the
1555  * start_xmit function of one device into the receive queue
1556  * of another device.
1557  *
1558  * The receiving device may be in another namespace, so
1559  * we have to clear all information in the skb that could
1560  * impact namespace isolation.
1561  */
1562 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1563 {
1564         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1565                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1566                         atomic_long_inc(&dev->rx_dropped);
1567                         kfree_skb(skb);
1568                         return NET_RX_DROP;
1569                 }
1570         }
1571
1572         skb_orphan(skb);
1573         nf_reset(skb);
1574
1575         if (unlikely(!is_skb_forwardable(dev, skb))) {
1576                 atomic_long_inc(&dev->rx_dropped);
1577                 kfree_skb(skb);
1578                 return NET_RX_DROP;
1579         }
1580         skb_set_dev(skb, dev);
1581         skb->tstamp.tv64 = 0;
1582         skb->pkt_type = PACKET_HOST;
1583         skb->protocol = eth_type_trans(skb, dev);
1584         return netif_rx(skb);
1585 }
1586 EXPORT_SYMBOL_GPL(dev_forward_skb);
1587
1588 static inline int deliver_skb(struct sk_buff *skb,
1589                               struct packet_type *pt_prev,
1590                               struct net_device *orig_dev)
1591 {
1592         atomic_inc(&skb->users);
1593         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1594 }
1595
1596 /*
1597  *      Support routine. Sends outgoing frames to any network
1598  *      taps currently in use.
1599  */
1600
1601 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1602 {
1603         struct packet_type *ptype;
1604         struct sk_buff *skb2 = NULL;
1605         struct packet_type *pt_prev = NULL;
1606
1607         rcu_read_lock();
1608         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1609                 /* Never send packets back to the socket
1610                  * they originated from - MvS (miquels@drinkel.ow.org)
1611                  */
1612                 if ((ptype->dev == dev || !ptype->dev) &&
1613                     (ptype->af_packet_priv == NULL ||
1614                      (struct sock *)ptype->af_packet_priv != skb->sk)) {
1615                         if (pt_prev) {
1616                                 deliver_skb(skb2, pt_prev, skb->dev);
1617                                 pt_prev = ptype;
1618                                 continue;
1619                         }
1620
1621                         skb2 = skb_clone(skb, GFP_ATOMIC);
1622                         if (!skb2)
1623                                 break;
1624
1625                         net_timestamp_set(skb2);
1626
1627                         /* skb->nh should be correctly
1628                            set by sender, so that the second statement is
1629                            just protection against buggy protocols.
1630                          */
1631                         skb_reset_mac_header(skb2);
1632
1633                         if (skb_network_header(skb2) < skb2->data ||
1634                             skb2->network_header > skb2->tail) {
1635                                 if (net_ratelimit())
1636                                         printk(KERN_CRIT "protocol %04x is "
1637                                                "buggy, dev %s\n",
1638                                                ntohs(skb2->protocol),
1639                                                dev->name);
1640                                 skb_reset_network_header(skb2);
1641                         }
1642
1643                         skb2->transport_header = skb2->network_header;
1644                         skb2->pkt_type = PACKET_OUTGOING;
1645                         pt_prev = ptype;
1646                 }
1647         }
1648         if (pt_prev)
1649                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1650         rcu_read_unlock();
1651 }
1652
1653 /* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1654  * @dev: Network device
1655  * @txq: number of queues available
1656  *
1657  * If real_num_tx_queues is changed the tc mappings may no longer be
1658  * valid. To resolve this verify the tc mapping remains valid and if
1659  * not NULL the mapping. With no priorities mapping to this
1660  * offset/count pair it will no longer be used. In the worst case TC0
1661  * is invalid nothing can be done so disable priority mappings. If is
1662  * expected that drivers will fix this mapping if they can before
1663  * calling netif_set_real_num_tx_queues.
1664  */
1665 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1666 {
1667         int i;
1668         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1669
1670         /* If TC0 is invalidated disable TC mapping */
1671         if (tc->offset + tc->count > txq) {
1672                 pr_warning("Number of in use tx queues changed "
1673                            "invalidating tc mappings. Priority "
1674                            "traffic classification disabled!\n");
1675                 dev->num_tc = 0;
1676                 return;
1677         }
1678
1679         /* Invalidated prio to tc mappings set to TC0 */
1680         for (i = 1; i < TC_BITMASK + 1; i++) {
1681                 int q = netdev_get_prio_tc_map(dev, i);
1682
1683                 tc = &dev->tc_to_txq[q];
1684                 if (tc->offset + tc->count > txq) {
1685                         pr_warning("Number of in use tx queues "
1686                                    "changed. Priority %i to tc "
1687                                    "mapping %i is no longer valid "
1688                                    "setting map to 0\n",
1689                                    i, q);
1690                         netdev_set_prio_tc_map(dev, i, 0);
1691                 }
1692         }
1693 }
1694
1695 /*
1696  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1697  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1698  */
1699 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1700 {
1701         int rc;
1702
1703         if (txq < 1 || txq > dev->num_tx_queues)
1704                 return -EINVAL;
1705
1706         if (dev->reg_state == NETREG_REGISTERED ||
1707             dev->reg_state == NETREG_UNREGISTERING) {
1708                 ASSERT_RTNL();
1709
1710                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1711                                                   txq);
1712                 if (rc)
1713                         return rc;
1714
1715                 if (dev->num_tc)
1716                         netif_setup_tc(dev, txq);
1717
1718                 if (txq < dev->real_num_tx_queues)
1719                         qdisc_reset_all_tx_gt(dev, txq);
1720         }
1721
1722         dev->real_num_tx_queues = txq;
1723         return 0;
1724 }
1725 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1726
1727 #ifdef CONFIG_RPS
1728 /**
1729  *      netif_set_real_num_rx_queues - set actual number of RX queues used
1730  *      @dev: Network device
1731  *      @rxq: Actual number of RX queues
1732  *
1733  *      This must be called either with the rtnl_lock held or before
1734  *      registration of the net device.  Returns 0 on success, or a
1735  *      negative error code.  If called before registration, it always
1736  *      succeeds.
1737  */
1738 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1739 {
1740         int rc;
1741
1742         if (rxq < 1 || rxq > dev->num_rx_queues)
1743                 return -EINVAL;
1744
1745         if (dev->reg_state == NETREG_REGISTERED) {
1746                 ASSERT_RTNL();
1747
1748                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1749                                                   rxq);
1750                 if (rc)
1751                         return rc;
1752         }
1753
1754         dev->real_num_rx_queues = rxq;
1755         return 0;
1756 }
1757 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1758 #endif
1759
1760 static inline void __netif_reschedule(struct Qdisc *q)
1761 {
1762         struct softnet_data *sd;
1763         unsigned long flags;
1764
1765         local_irq_save(flags);
1766         sd = &__get_cpu_var(softnet_data);
1767         q->next_sched = NULL;
1768         *sd->output_queue_tailp = q;
1769         sd->output_queue_tailp = &q->next_sched;
1770         raise_softirq_irqoff(NET_TX_SOFTIRQ);
1771         local_irq_restore(flags);
1772 }
1773
1774 void __netif_schedule(struct Qdisc *q)
1775 {
1776         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1777                 __netif_reschedule(q);
1778 }
1779 EXPORT_SYMBOL(__netif_schedule);
1780
1781 void dev_kfree_skb_irq(struct sk_buff *skb)
1782 {
1783         if (atomic_dec_and_test(&skb->users)) {
1784                 struct softnet_data *sd;
1785                 unsigned long flags;
1786
1787                 local_irq_save(flags);
1788                 sd = &__get_cpu_var(softnet_data);
1789                 skb->next = sd->completion_queue;
1790                 sd->completion_queue = skb;
1791                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1792                 local_irq_restore(flags);
1793         }
1794 }
1795 EXPORT_SYMBOL(dev_kfree_skb_irq);
1796
1797 void dev_kfree_skb_any(struct sk_buff *skb)
1798 {
1799         if (in_irq() || irqs_disabled())
1800                 dev_kfree_skb_irq(skb);
1801         else
1802                 dev_kfree_skb(skb);
1803 }
1804 EXPORT_SYMBOL(dev_kfree_skb_any);
1805
1806
1807 /**
1808  * netif_device_detach - mark device as removed
1809  * @dev: network device
1810  *
1811  * Mark device as removed from system and therefore no longer available.
1812  */
1813 void netif_device_detach(struct net_device *dev)
1814 {
1815         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1816             netif_running(dev)) {
1817                 netif_tx_stop_all_queues(dev);
1818         }
1819 }
1820 EXPORT_SYMBOL(netif_device_detach);
1821
1822 /**
1823  * netif_device_attach - mark device as attached
1824  * @dev: network device
1825  *
1826  * Mark device as attached from system and restart if needed.
1827  */
1828 void netif_device_attach(struct net_device *dev)
1829 {
1830         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1831             netif_running(dev)) {
1832                 netif_tx_wake_all_queues(dev);
1833                 __netdev_watchdog_up(dev);
1834         }
1835 }
1836 EXPORT_SYMBOL(netif_device_attach);
1837
1838 /**
1839  * skb_dev_set -- assign a new device to a buffer
1840  * @skb: buffer for the new device
1841  * @dev: network device
1842  *
1843  * If an skb is owned by a device already, we have to reset
1844  * all data private to the namespace a device belongs to
1845  * before assigning it a new device.
1846  */
1847 #ifdef CONFIG_NET_NS
1848 void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1849 {
1850         skb_dst_drop(skb);
1851         if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1852                 secpath_reset(skb);
1853                 nf_reset(skb);
1854                 skb_init_secmark(skb);
1855                 skb->mark = 0;
1856                 skb->priority = 0;
1857                 skb->nf_trace = 0;
1858                 skb->ipvs_property = 0;
1859 #ifdef CONFIG_NET_SCHED
1860                 skb->tc_index = 0;
1861 #endif
1862         }
1863         skb->dev = dev;
1864 }
1865 EXPORT_SYMBOL(skb_set_dev);
1866 #endif /* CONFIG_NET_NS */
1867
1868 /*
1869  * Invalidate hardware checksum when packet is to be mangled, and
1870  * complete checksum manually on outgoing path.
1871  */
1872 int skb_checksum_help(struct sk_buff *skb)
1873 {
1874         __wsum csum;
1875         int ret = 0, offset;
1876
1877         if (skb->ip_summed == CHECKSUM_COMPLETE)
1878                 goto out_set_summed;
1879
1880         if (unlikely(skb_shinfo(skb)->gso_size)) {
1881                 /* Let GSO fix up the checksum. */
1882                 goto out_set_summed;
1883         }
1884
1885         offset = skb_checksum_start_offset(skb);
1886         BUG_ON(offset >= skb_headlen(skb));
1887         csum = skb_checksum(skb, offset, skb->len - offset, 0);
1888
1889         offset += skb->csum_offset;
1890         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1891
1892         if (skb_cloned(skb) &&
1893             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1894                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1895                 if (ret)
1896                         goto out;
1897         }
1898
1899         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1900 out_set_summed:
1901         skb->ip_summed = CHECKSUM_NONE;
1902 out:
1903         return ret;
1904 }
1905 EXPORT_SYMBOL(skb_checksum_help);
1906
1907 /**
1908  *      skb_gso_segment - Perform segmentation on skb.
1909  *      @skb: buffer to segment
1910  *      @features: features for the output path (see dev->features)
1911  *
1912  *      This function segments the given skb and returns a list of segments.
1913  *
1914  *      It may return NULL if the skb requires no segmentation.  This is
1915  *      only possible when GSO is used for verifying header integrity.
1916  */
1917 struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
1918 {
1919         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1920         struct packet_type *ptype;
1921         __be16 type = skb->protocol;
1922         int vlan_depth = ETH_HLEN;
1923         int err;
1924
1925         while (type == htons(ETH_P_8021Q)) {
1926                 struct vlan_hdr *vh;
1927
1928                 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1929                         return ERR_PTR(-EINVAL);
1930
1931                 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1932                 type = vh->h_vlan_encapsulated_proto;
1933                 vlan_depth += VLAN_HLEN;
1934         }
1935
1936         skb_reset_mac_header(skb);
1937         skb->mac_len = skb->network_header - skb->mac_header;
1938         __skb_pull(skb, skb->mac_len);
1939
1940         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1941                 struct net_device *dev = skb->dev;
1942                 struct ethtool_drvinfo info = {};
1943
1944                 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1945                         dev->ethtool_ops->get_drvinfo(dev, &info);
1946
1947                 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1948                      info.driver, dev ? dev->features : 0L,
1949                      skb->sk ? skb->sk->sk_route_caps : 0L,
1950                      skb->len, skb->data_len, skb->ip_summed);
1951
1952                 if (skb_header_cloned(skb) &&
1953                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1954                         return ERR_PTR(err);
1955         }
1956
1957         rcu_read_lock();
1958         list_for_each_entry_rcu(ptype,
1959                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1960                 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1961                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1962                                 err = ptype->gso_send_check(skb);
1963                                 segs = ERR_PTR(err);
1964                                 if (err || skb_gso_ok(skb, features))
1965                                         break;
1966                                 __skb_push(skb, (skb->data -
1967                                                  skb_network_header(skb)));
1968                         }
1969                         segs = ptype->gso_segment(skb, features);
1970                         break;
1971                 }
1972         }
1973         rcu_read_unlock();
1974
1975         __skb_push(skb, skb->data - skb_mac_header(skb));
1976
1977         return segs;
1978 }
1979 EXPORT_SYMBOL(skb_gso_segment);
1980
1981 /* Take action when hardware reception checksum errors are detected. */
1982 #ifdef CONFIG_BUG
1983 void netdev_rx_csum_fault(struct net_device *dev)
1984 {
1985         if (net_ratelimit()) {
1986                 printk(KERN_ERR "%s: hw csum failure.\n",
1987                         dev ? dev->name : "<unknown>");
1988                 dump_stack();
1989         }
1990 }
1991 EXPORT_SYMBOL(netdev_rx_csum_fault);
1992 #endif
1993
1994 /* Actually, we should eliminate this check as soon as we know, that:
1995  * 1. IOMMU is present and allows to map all the memory.
1996  * 2. No high memory really exists on this machine.
1997  */
1998
1999 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2000 {
2001 #ifdef CONFIG_HIGHMEM
2002         int i;
2003         if (!(dev->features & NETIF_F_HIGHDMA)) {
2004                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2005                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2006                         if (PageHighMem(skb_frag_page(frag)))
2007                                 return 1;
2008                 }
2009         }
2010
2011         if (PCI_DMA_BUS_IS_PHYS) {
2012                 struct device *pdev = dev->dev.parent;
2013
2014                 if (!pdev)
2015                         return 0;
2016                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2017                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2018                         dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2019                         if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2020                                 return 1;
2021                 }
2022         }
2023 #endif
2024         return 0;
2025 }
2026
2027 struct dev_gso_cb {
2028         void (*destructor)(struct sk_buff *skb);
2029 };
2030
2031 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2032
2033 static void dev_gso_skb_destructor(struct sk_buff *skb)
2034 {
2035         struct dev_gso_cb *cb;
2036
2037         do {
2038                 struct sk_buff *nskb = skb->next;
2039
2040                 skb->next = nskb->next;
2041                 nskb->next = NULL;
2042                 kfree_skb(nskb);
2043         } while (skb->next);
2044
2045         cb = DEV_GSO_CB(skb);
2046         if (cb->destructor)
2047                 cb->destructor(skb);
2048 }
2049
2050 /**
2051  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
2052  *      @skb: buffer to segment
2053  *      @features: device features as applicable to this skb
2054  *
2055  *      This function segments the given skb and stores the list of segments
2056  *      in skb->next.
2057  */
2058 static int dev_gso_segment(struct sk_buff *skb, int features)
2059 {
2060         struct sk_buff *segs;
2061
2062         segs = skb_gso_segment(skb, features);
2063
2064         /* Verifying header integrity only. */
2065         if (!segs)
2066                 return 0;
2067
2068         if (IS_ERR(segs))
2069                 return PTR_ERR(segs);
2070
2071         skb->next = segs;
2072         DEV_GSO_CB(skb)->destructor = skb->destructor;
2073         skb->destructor = dev_gso_skb_destructor;
2074
2075         return 0;
2076 }
2077
2078 /*
2079  * Try to orphan skb early, right before transmission by the device.
2080  * We cannot orphan skb if tx timestamp is requested or the sk-reference
2081  * is needed on driver level for other reasons, e.g. see net/can/raw.c
2082  */
2083 static inline void skb_orphan_try(struct sk_buff *skb)
2084 {
2085         struct sock *sk = skb->sk;
2086
2087         if (sk && !skb_shinfo(skb)->tx_flags) {
2088                 /* skb_tx_hash() wont be able to get sk.
2089                  * We copy sk_hash into skb->rxhash
2090                  */
2091                 if (!skb->rxhash)
2092                         skb->rxhash = sk->sk_hash;
2093                 skb_orphan(skb);
2094         }
2095 }
2096
2097 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
2098 {
2099         return ((features & NETIF_F_GEN_CSUM) ||
2100                 ((features & NETIF_F_V4_CSUM) &&
2101                  protocol == htons(ETH_P_IP)) ||
2102                 ((features & NETIF_F_V6_CSUM) &&
2103                  protocol == htons(ETH_P_IPV6)) ||
2104                 ((features & NETIF_F_FCOE_CRC) &&
2105                  protocol == htons(ETH_P_FCOE)));
2106 }
2107
2108 static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
2109 {
2110         if (!can_checksum_protocol(features, protocol)) {
2111                 features &= ~NETIF_F_ALL_CSUM;
2112                 features &= ~NETIF_F_SG;
2113         } else if (illegal_highdma(skb->dev, skb)) {
2114                 features &= ~NETIF_F_SG;
2115         }
2116
2117         return features;
2118 }
2119
2120 u32 netif_skb_features(struct sk_buff *skb)
2121 {
2122         __be16 protocol = skb->protocol;
2123         u32 features = skb->dev->features;
2124
2125         if (protocol == htons(ETH_P_8021Q)) {
2126                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2127                 protocol = veh->h_vlan_encapsulated_proto;
2128         } else if (!vlan_tx_tag_present(skb)) {
2129                 return harmonize_features(skb, protocol, features);
2130         }
2131
2132         features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2133
2134         if (protocol != htons(ETH_P_8021Q)) {
2135                 return harmonize_features(skb, protocol, features);
2136         } else {
2137                 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2138                                 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2139                 return harmonize_features(skb, protocol, features);
2140         }
2141 }
2142 EXPORT_SYMBOL(netif_skb_features);
2143
2144 /*
2145  * Returns true if either:
2146  *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2147  *      2. skb is fragmented and the device does not support SG, or if
2148  *         at least one of fragments is in highmem and device does not
2149  *         support DMA from it.
2150  */
2151 static inline int skb_needs_linearize(struct sk_buff *skb,
2152                                       int features)
2153 {
2154         return skb_is_nonlinear(skb) &&
2155                         ((skb_has_frag_list(skb) &&
2156                                 !(features & NETIF_F_FRAGLIST)) ||
2157                         (skb_shinfo(skb)->nr_frags &&
2158                                 !(features & NETIF_F_SG)));
2159 }
2160
2161 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2162                         struct netdev_queue *txq)
2163 {
2164         const struct net_device_ops *ops = dev->netdev_ops;
2165         int rc = NETDEV_TX_OK;
2166         unsigned int skb_len;
2167
2168         if (likely(!skb->next)) {
2169                 u32 features;
2170
2171                 /*
2172                  * If device doesn't need skb->dst, release it right now while
2173                  * its hot in this cpu cache
2174                  */
2175                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2176                         skb_dst_drop(skb);
2177
2178                 if (!list_empty(&ptype_all))
2179                         dev_queue_xmit_nit(skb, dev);
2180
2181                 skb_orphan_try(skb);
2182
2183                 features = netif_skb_features(skb);
2184
2185                 if (vlan_tx_tag_present(skb) &&
2186                     !(features & NETIF_F_HW_VLAN_TX)) {
2187                         skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2188                         if (unlikely(!skb))
2189                                 goto out;
2190
2191                         skb->vlan_tci = 0;
2192                 }
2193
2194                 if (netif_needs_gso(skb, features)) {
2195                         if (unlikely(dev_gso_segment(skb, features)))
2196                                 goto out_kfree_skb;
2197                         if (skb->next)
2198                                 goto gso;
2199                 } else {
2200                         if (skb_needs_linearize(skb, features) &&
2201                             __skb_linearize(skb))
2202                                 goto out_kfree_skb;
2203
2204                         /* If packet is not checksummed and device does not
2205                          * support checksumming for this protocol, complete
2206                          * checksumming here.
2207                          */
2208                         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2209                                 skb_set_transport_header(skb,
2210                                         skb_checksum_start_offset(skb));
2211                                 if (!(features & NETIF_F_ALL_CSUM) &&
2212                                      skb_checksum_help(skb))
2213                                         goto out_kfree_skb;
2214                         }
2215                 }
2216
2217                 skb_len = skb->len;
2218                 rc = ops->ndo_start_xmit(skb, dev);
2219                 trace_net_dev_xmit(skb, rc, dev, skb_len);
2220                 if (rc == NETDEV_TX_OK)
2221                         txq_trans_update(txq);
2222                 return rc;
2223         }
2224
2225 gso:
2226         do {
2227                 struct sk_buff *nskb = skb->next;
2228
2229                 skb->next = nskb->next;
2230                 nskb->next = NULL;
2231
2232                 /*
2233                  * If device doesn't need nskb->dst, release it right now while
2234                  * its hot in this cpu cache
2235                  */
2236                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2237                         skb_dst_drop(nskb);
2238
2239                 skb_len = nskb->len;
2240                 rc = ops->ndo_start_xmit(nskb, dev);
2241                 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2242                 if (unlikely(rc != NETDEV_TX_OK)) {
2243                         if (rc & ~NETDEV_TX_MASK)
2244                                 goto out_kfree_gso_skb;
2245                         nskb->next = skb->next;
2246                         skb->next = nskb;
2247                         return rc;
2248                 }
2249                 txq_trans_update(txq);
2250                 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2251                         return NETDEV_TX_BUSY;
2252         } while (skb->next);
2253
2254 out_kfree_gso_skb:
2255         if (likely(skb->next == NULL))
2256                 skb->destructor = DEV_GSO_CB(skb)->destructor;
2257 out_kfree_skb:
2258         kfree_skb(skb);
2259 out:
2260         return rc;
2261 }
2262
2263 static u32 hashrnd __read_mostly;
2264
2265 /*
2266  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2267  * to be used as a distribution range.
2268  */
2269 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2270                   unsigned int num_tx_queues)
2271 {
2272         u32 hash;
2273         u16 qoffset = 0;
2274         u16 qcount = num_tx_queues;
2275
2276         if (skb_rx_queue_recorded(skb)) {
2277                 hash = skb_get_rx_queue(skb);
2278                 while (unlikely(hash >= num_tx_queues))
2279                         hash -= num_tx_queues;
2280                 return hash;
2281         }
2282
2283         if (dev->num_tc) {
2284                 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2285                 qoffset = dev->tc_to_txq[tc].offset;
2286                 qcount = dev->tc_to_txq[tc].count;
2287         }
2288
2289         if (skb->sk && skb->sk->sk_hash)
2290                 hash = skb->sk->sk_hash;
2291         else
2292                 hash = (__force u16) skb->protocol ^ skb->rxhash;
2293         hash = jhash_1word(hash, hashrnd);
2294
2295         return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2296 }
2297 EXPORT_SYMBOL(__skb_tx_hash);
2298
2299 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2300 {
2301         if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2302                 if (net_ratelimit()) {
2303                         pr_warning("%s selects TX queue %d, but "
2304                                 "real number of TX queues is %d\n",
2305                                 dev->name, queue_index, dev->real_num_tx_queues);
2306                 }
2307                 return 0;
2308         }
2309         return queue_index;
2310 }
2311
2312 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2313 {
2314 #ifdef CONFIG_XPS
2315         struct xps_dev_maps *dev_maps;
2316         struct xps_map *map;
2317         int queue_index = -1;
2318
2319         rcu_read_lock();
2320         dev_maps = rcu_dereference(dev->xps_maps);
2321         if (dev_maps) {
2322                 map = rcu_dereference(
2323                     dev_maps->cpu_map[raw_smp_processor_id()]);
2324                 if (map) {
2325                         if (map->len == 1)
2326                                 queue_index = map->queues[0];
2327                         else {
2328                                 u32 hash;
2329                                 if (skb->sk && skb->sk->sk_hash)
2330                                         hash = skb->sk->sk_hash;
2331                                 else
2332                                         hash = (__force u16) skb->protocol ^
2333                                             skb->rxhash;
2334                                 hash = jhash_1word(hash, hashrnd);
2335                                 queue_index = map->queues[
2336                                     ((u64)hash * map->len) >> 32];
2337                         }
2338                         if (unlikely(queue_index >= dev->real_num_tx_queues))
2339                                 queue_index = -1;
2340                 }
2341         }
2342         rcu_read_unlock();
2343
2344         return queue_index;
2345 #else
2346         return -1;
2347 #endif
2348 }
2349
2350 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2351                                         struct sk_buff *skb)
2352 {
2353         int queue_index;
2354         const struct net_device_ops *ops = dev->netdev_ops;
2355
2356         if (dev->real_num_tx_queues == 1)
2357                 queue_index = 0;
2358         else if (ops->ndo_select_queue) {
2359                 queue_index = ops->ndo_select_queue(dev, skb);
2360                 queue_index = dev_cap_txqueue(dev, queue_index);
2361         } else {
2362                 struct sock *sk = skb->sk;
2363                 queue_index = sk_tx_queue_get(sk);
2364
2365                 if (queue_index < 0 || skb->ooo_okay ||
2366                     queue_index >= dev->real_num_tx_queues) {
2367                         int old_index = queue_index;
2368
2369                         queue_index = get_xps_queue(dev, skb);
2370                         if (queue_index < 0)
2371                                 queue_index = skb_tx_hash(dev, skb);
2372
2373                         if (queue_index != old_index && sk) {
2374                                 struct dst_entry *dst =
2375                                     rcu_dereference_check(sk->sk_dst_cache, 1);
2376
2377                                 if (dst && skb_dst(skb) == dst)
2378                                         sk_tx_queue_set(sk, queue_index);
2379                         }
2380                 }
2381         }
2382
2383         skb_set_queue_mapping(skb, queue_index);
2384         return netdev_get_tx_queue(dev, queue_index);
2385 }
2386
2387 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2388                                  struct net_device *dev,
2389                                  struct netdev_queue *txq)
2390 {
2391         spinlock_t *root_lock = qdisc_lock(q);
2392         bool contended;
2393         int rc;
2394
2395         qdisc_skb_cb(skb)->pkt_len = skb->len;
2396         qdisc_calculate_pkt_len(skb, q);
2397         /*
2398          * Heuristic to force contended enqueues to serialize on a
2399          * separate lock before trying to get qdisc main lock.
2400          * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2401          * and dequeue packets faster.
2402          */
2403         contended = qdisc_is_running(q);
2404         if (unlikely(contended))
2405                 spin_lock(&q->busylock);
2406
2407         spin_lock(root_lock);
2408         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2409                 kfree_skb(skb);
2410                 rc = NET_XMIT_DROP;
2411         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2412                    qdisc_run_begin(q)) {
2413                 /*
2414                  * This is a work-conserving queue; there are no old skbs
2415                  * waiting to be sent out; and the qdisc is not running -
2416                  * xmit the skb directly.
2417                  */
2418                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2419                         skb_dst_force(skb);
2420
2421                 qdisc_bstats_update(q, skb);
2422
2423                 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2424                         if (unlikely(contended)) {
2425                                 spin_unlock(&q->busylock);
2426                                 contended = false;
2427                         }
2428                         __qdisc_run(q);
2429                 } else
2430                         qdisc_run_end(q);
2431
2432                 rc = NET_XMIT_SUCCESS;
2433         } else {
2434                 skb_dst_force(skb);
2435                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2436                 if (qdisc_run_begin(q)) {
2437                         if (unlikely(contended)) {
2438                                 spin_unlock(&q->busylock);
2439                                 contended = false;
2440                         }
2441                         __qdisc_run(q);
2442                 }
2443         }
2444         spin_unlock(root_lock);
2445         if (unlikely(contended))
2446                 spin_unlock(&q->busylock);
2447         return rc;
2448 }
2449
2450 static DEFINE_PER_CPU(int, xmit_recursion);
2451 #define RECURSION_LIMIT 10
2452
2453 /**
2454  *      dev_queue_xmit - transmit a buffer
2455  *      @skb: buffer to transmit
2456  *
2457  *      Queue a buffer for transmission to a network device. The caller must
2458  *      have set the device and priority and built the buffer before calling
2459  *      this function. The function can be called from an interrupt.
2460  *
2461  *      A negative errno code is returned on a failure. A success does not
2462  *      guarantee the frame will be transmitted as it may be dropped due
2463  *      to congestion or traffic shaping.
2464  *
2465  * -----------------------------------------------------------------------------------
2466  *      I notice this method can also return errors from the queue disciplines,
2467  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2468  *      be positive.
2469  *
2470  *      Regardless of the return value, the skb is consumed, so it is currently
2471  *      difficult to retry a send to this method.  (You can bump the ref count
2472  *      before sending to hold a reference for retry if you are careful.)
2473  *
2474  *      When calling this method, interrupts MUST be enabled.  This is because
2475  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2476  *          --BLG
2477  */
2478 int dev_queue_xmit(struct sk_buff *skb)
2479 {
2480         struct net_device *dev = skb->dev;
2481         struct netdev_queue *txq;
2482         struct Qdisc *q;
2483         int rc = -ENOMEM;
2484
2485         /* Disable soft irqs for various locks below. Also
2486          * stops preemption for RCU.
2487          */
2488         rcu_read_lock_bh();
2489
2490         txq = dev_pick_tx(dev, skb);
2491         q = rcu_dereference_bh(txq->qdisc);
2492
2493 #ifdef CONFIG_NET_CLS_ACT
2494         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2495 #endif
2496         trace_net_dev_queue(skb);
2497         if (q->enqueue) {
2498                 rc = __dev_xmit_skb(skb, q, dev, txq);
2499                 goto out;
2500         }
2501
2502         /* The device has no queue. Common case for software devices:
2503            loopback, all the sorts of tunnels...
2504
2505            Really, it is unlikely that netif_tx_lock protection is necessary
2506            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2507            counters.)
2508            However, it is possible, that they rely on protection
2509            made by us here.
2510
2511            Check this and shot the lock. It is not prone from deadlocks.
2512            Either shot noqueue qdisc, it is even simpler 8)
2513          */
2514         if (dev->flags & IFF_UP) {
2515                 int cpu = smp_processor_id(); /* ok because BHs are off */
2516
2517                 if (txq->xmit_lock_owner != cpu) {
2518
2519                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2520                                 goto recursion_alert;
2521
2522                         HARD_TX_LOCK(dev, txq, cpu);
2523
2524                         if (!netif_tx_queue_stopped(txq)) {
2525                                 __this_cpu_inc(xmit_recursion);
2526                                 rc = dev_hard_start_xmit(skb, dev, txq);
2527                                 __this_cpu_dec(xmit_recursion);
2528                                 if (dev_xmit_complete(rc)) {
2529                                         HARD_TX_UNLOCK(dev, txq);
2530                                         goto out;
2531                                 }
2532                         }
2533                         HARD_TX_UNLOCK(dev, txq);
2534                         if (net_ratelimit())
2535                                 printk(KERN_CRIT "Virtual device %s asks to "
2536                                        "queue packet!\n", dev->name);
2537                 } else {
2538                         /* Recursion is detected! It is possible,
2539                          * unfortunately
2540                          */
2541 recursion_alert:
2542                         if (net_ratelimit())
2543                                 printk(KERN_CRIT "Dead loop on virtual device "
2544                                        "%s, fix it urgently!\n", dev->name);
2545                 }
2546         }
2547
2548         rc = -ENETDOWN;
2549         rcu_read_unlock_bh();
2550
2551         kfree_skb(skb);
2552         return rc;
2553 out:
2554         rcu_read_unlock_bh();
2555         return rc;
2556 }
2557 EXPORT_SYMBOL(dev_queue_xmit);
2558
2559
2560 /*=======================================================================
2561                         Receiver routines
2562   =======================================================================*/
2563
2564 int netdev_max_backlog __read_mostly = 1000;
2565 int netdev_tstamp_prequeue __read_mostly = 1;
2566 int netdev_budget __read_mostly = 300;
2567 int weight_p __read_mostly = 64;            /* old backlog weight */
2568
2569 /* Called with irq disabled */
2570 static inline void ____napi_schedule(struct softnet_data *sd,
2571                                      struct napi_struct *napi)
2572 {
2573         list_add_tail(&napi->poll_list, &sd->poll_list);
2574         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2575 }
2576
2577 /*
2578  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2579  * and src/dst port numbers.  Sets rxhash in skb to non-zero hash value
2580  * on success, zero indicates no valid hash.  Also, sets l4_rxhash in skb
2581  * if hash is a canonical 4-tuple hash over transport ports.
2582  */
2583 void __skb_get_rxhash(struct sk_buff *skb)
2584 {
2585         int nhoff, hash = 0, poff;
2586         const struct ipv6hdr *ip6;
2587         const struct iphdr *ip;
2588         const struct vlan_hdr *vlan;
2589         u8 ip_proto;
2590         u32 addr1, addr2;
2591         u16 proto;
2592         union {
2593                 u32 v32;
2594                 u16 v16[2];
2595         } ports;
2596
2597         nhoff = skb_network_offset(skb);
2598         proto = skb->protocol;
2599
2600 again:
2601         switch (proto) {
2602         case __constant_htons(ETH_P_IP):
2603 ip:
2604                 if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2605                         goto done;
2606
2607                 ip = (const struct iphdr *) (skb->data + nhoff);
2608                 if (ip_is_fragment(ip))
2609                         ip_proto = 0;
2610                 else
2611                         ip_proto = ip->protocol;
2612                 addr1 = (__force u32) ip->saddr;
2613                 addr2 = (__force u32) ip->daddr;
2614                 nhoff += ip->ihl * 4;
2615                 break;
2616         case __constant_htons(ETH_P_IPV6):
2617 ipv6:
2618                 if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2619                         goto done;
2620
2621                 ip6 = (const struct ipv6hdr *) (skb->data + nhoff);
2622                 ip_proto = ip6->nexthdr;
2623                 addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2624                 addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2625                 nhoff += 40;
2626                 break;
2627         case __constant_htons(ETH_P_8021Q):
2628                 if (!pskb_may_pull(skb, sizeof(*vlan) + nhoff))
2629                         goto done;
2630                 vlan = (const struct vlan_hdr *) (skb->data + nhoff);
2631                 proto = vlan->h_vlan_encapsulated_proto;
2632                 nhoff += sizeof(*vlan);
2633                 goto again;
2634         case __constant_htons(ETH_P_PPP_SES):
2635                 if (!pskb_may_pull(skb, PPPOE_SES_HLEN + nhoff))
2636                         goto done;
2637                 proto = *((__be16 *) (skb->data + nhoff +
2638                                       sizeof(struct pppoe_hdr)));
2639                 nhoff += PPPOE_SES_HLEN;
2640                 switch (proto) {
2641                 case __constant_htons(PPP_IP):
2642                         goto ip;
2643                 case __constant_htons(PPP_IPV6):
2644                         goto ipv6;
2645                 default:
2646                         goto done;
2647                 }
2648         default:
2649                 goto done;
2650         }
2651
2652         switch (ip_proto) {
2653         case IPPROTO_GRE:
2654                 if (pskb_may_pull(skb, nhoff + 16)) {
2655                         u8 *h = skb->data + nhoff;
2656                         __be16 flags = *(__be16 *)h;
2657
2658                         /*
2659                          * Only look inside GRE if version zero and no
2660                          * routing
2661                          */
2662                         if (!(flags & (GRE_VERSION|GRE_ROUTING))) {
2663                                 proto = *(__be16 *)(h + 2);
2664                                 nhoff += 4;
2665                                 if (flags & GRE_CSUM)
2666                                         nhoff += 4;
2667                                 if (flags & GRE_KEY)
2668                                         nhoff += 4;
2669                                 if (flags & GRE_SEQ)
2670                                         nhoff += 4;
2671                                 goto again;
2672                         }
2673                 }
2674                 break;
2675         case IPPROTO_IPIP:
2676                 goto again;
2677         default:
2678                 break;
2679         }
2680
2681         ports.v32 = 0;
2682         poff = proto_ports_offset(ip_proto);
2683         if (poff >= 0) {
2684                 nhoff += poff;
2685                 if (pskb_may_pull(skb, nhoff + 4)) {
2686                         ports.v32 = * (__force u32 *) (skb->data + nhoff);
2687                         if (ports.v16[1] < ports.v16[0])
2688                                 swap(ports.v16[0], ports.v16[1]);
2689                         skb->l4_rxhash = 1;
2690                 }
2691         }
2692
2693         /* get a consistent hash (same value on both flow directions) */
2694         if (addr2 < addr1)
2695                 swap(addr1, addr2);
2696
2697         hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2698         if (!hash)
2699                 hash = 1;
2700
2701 done:
2702         skb->rxhash = hash;
2703 }
2704 EXPORT_SYMBOL(__skb_get_rxhash);
2705
2706 #ifdef CONFIG_RPS
2707
2708 /* One global table that all flow-based protocols share. */
2709 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2710 EXPORT_SYMBOL(rps_sock_flow_table);
2711
2712 static struct rps_dev_flow *
2713 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2714             struct rps_dev_flow *rflow, u16 next_cpu)
2715 {
2716         if (next_cpu != RPS_NO_CPU) {
2717 #ifdef CONFIG_RFS_ACCEL
2718                 struct netdev_rx_queue *rxqueue;
2719                 struct rps_dev_flow_table *flow_table;
2720                 struct rps_dev_flow *old_rflow;
2721                 u32 flow_id;
2722                 u16 rxq_index;
2723                 int rc;
2724
2725                 /* Should we steer this flow to a different hardware queue? */
2726                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2727                     !(dev->features & NETIF_F_NTUPLE))
2728                         goto out;
2729                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2730                 if (rxq_index == skb_get_rx_queue(skb))
2731                         goto out;
2732
2733                 rxqueue = dev->_rx + rxq_index;
2734                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2735                 if (!flow_table)
2736                         goto out;
2737                 flow_id = skb->rxhash & flow_table->mask;
2738                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2739                                                         rxq_index, flow_id);
2740                 if (rc < 0)
2741                         goto out;
2742                 old_rflow = rflow;
2743                 rflow = &flow_table->flows[flow_id];
2744                 rflow->filter = rc;
2745                 if (old_rflow->filter == rflow->filter)
2746                         old_rflow->filter = RPS_NO_FILTER;
2747         out:
2748 #endif
2749                 rflow->last_qtail =
2750                         per_cpu(softnet_data, next_cpu).input_queue_head;
2751         }
2752
2753         rflow->cpu = next_cpu;
2754         return rflow;
2755 }
2756
2757 /*
2758  * get_rps_cpu is called from netif_receive_skb and returns the target
2759  * CPU from the RPS map of the receiving queue for a given skb.
2760  * rcu_read_lock must be held on entry.
2761  */
2762 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2763                        struct rps_dev_flow **rflowp)
2764 {
2765         struct netdev_rx_queue *rxqueue;
2766         struct rps_map *map;
2767         struct rps_dev_flow_table *flow_table;
2768         struct rps_sock_flow_table *sock_flow_table;
2769         int cpu = -1;
2770         u16 tcpu;
2771
2772         if (skb_rx_queue_recorded(skb)) {
2773                 u16 index = skb_get_rx_queue(skb);
2774                 if (unlikely(index >= dev->real_num_rx_queues)) {
2775                         WARN_ONCE(dev->real_num_rx_queues > 1,
2776                                   "%s received packet on queue %u, but number "
2777                                   "of RX queues is %u\n",
2778                                   dev->name, index, dev->real_num_rx_queues);
2779                         goto done;
2780                 }
2781                 rxqueue = dev->_rx + index;
2782         } else
2783                 rxqueue = dev->_rx;
2784
2785         map = rcu_dereference(rxqueue->rps_map);
2786         if (map) {
2787                 if (map->len == 1 &&
2788                     !rcu_access_pointer(rxqueue->rps_flow_table)) {
2789                         tcpu = map->cpus[0];
2790                         if (cpu_online(tcpu))
2791                                 cpu = tcpu;
2792                         goto done;
2793                 }
2794         } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2795                 goto done;
2796         }
2797
2798         skb_reset_network_header(skb);
2799         if (!skb_get_rxhash(skb))
2800                 goto done;
2801
2802         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2803         sock_flow_table = rcu_dereference(rps_sock_flow_table);
2804         if (flow_table && sock_flow_table) {
2805                 u16 next_cpu;
2806                 struct rps_dev_flow *rflow;
2807
2808                 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2809                 tcpu = rflow->cpu;
2810
2811                 next_cpu = sock_flow_table->ents[skb->rxhash &
2812                     sock_flow_table->mask];
2813
2814                 /*
2815                  * If the desired CPU (where last recvmsg was done) is
2816                  * different from current CPU (one in the rx-queue flow
2817                  * table entry), switch if one of the following holds:
2818                  *   - Current CPU is unset (equal to RPS_NO_CPU).
2819                  *   - Current CPU is offline.
2820                  *   - The current CPU's queue tail has advanced beyond the
2821                  *     last packet that was enqueued using this table entry.
2822                  *     This guarantees that all previous packets for the flow
2823                  *     have been dequeued, thus preserving in order delivery.
2824                  */
2825                 if (unlikely(tcpu != next_cpu) &&
2826                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2827                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2828                       rflow->last_qtail)) >= 0))
2829                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2830
2831                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2832                         *rflowp = rflow;
2833                         cpu = tcpu;
2834                         goto done;
2835                 }
2836         }
2837
2838         if (map) {
2839                 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2840
2841                 if (cpu_online(tcpu)) {
2842                         cpu = tcpu;
2843                         goto done;
2844                 }
2845         }
2846
2847 done:
2848         return cpu;
2849 }
2850
2851 #ifdef CONFIG_RFS_ACCEL
2852
2853 /**
2854  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2855  * @dev: Device on which the filter was set
2856  * @rxq_index: RX queue index
2857  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2858  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2859  *
2860  * Drivers that implement ndo_rx_flow_steer() should periodically call
2861  * this function for each installed filter and remove the filters for
2862  * which it returns %true.
2863  */
2864 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2865                          u32 flow_id, u16 filter_id)
2866 {
2867         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2868         struct rps_dev_flow_table *flow_table;
2869         struct rps_dev_flow *rflow;
2870         bool expire = true;
2871         int cpu;
2872
2873         rcu_read_lock();
2874         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2875         if (flow_table && flow_id <= flow_table->mask) {
2876                 rflow = &flow_table->flows[flow_id];
2877                 cpu = ACCESS_ONCE(rflow->cpu);
2878                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2879                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2880                            rflow->last_qtail) <
2881                      (int)(10 * flow_table->mask)))
2882                         expire = false;
2883         }
2884         rcu_read_unlock();
2885         return expire;
2886 }
2887 EXPORT_SYMBOL(rps_may_expire_flow);
2888
2889 #endif /* CONFIG_RFS_ACCEL */
2890
2891 /* Called from hardirq (IPI) context */
2892 static void rps_trigger_softirq(void *data)
2893 {
2894         struct softnet_data *sd = data;
2895
2896         ____napi_schedule(sd, &sd->backlog);
2897         sd->received_rps++;
2898 }
2899
2900 #endif /* CONFIG_RPS */
2901
2902 /*
2903  * Check if this softnet_data structure is another cpu one
2904  * If yes, queue it to our IPI list and return 1
2905  * If no, return 0
2906  */
2907 static int rps_ipi_queued(struct softnet_data *sd)
2908 {
2909 #ifdef CONFIG_RPS
2910         struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2911
2912         if (sd != mysd) {
2913                 sd->rps_ipi_next = mysd->rps_ipi_list;
2914                 mysd->rps_ipi_list = sd;
2915
2916                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2917                 return 1;
2918         }
2919 #endif /* CONFIG_RPS */
2920         return 0;
2921 }
2922
2923 /*
2924  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2925  * queue (may be a remote CPU queue).
2926  */
2927 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2928                               unsigned int *qtail)
2929 {
2930         struct softnet_data *sd;
2931         unsigned long flags;
2932
2933         sd = &per_cpu(softnet_data, cpu);
2934
2935         local_irq_save(flags);
2936
2937         rps_lock(sd);
2938         if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2939                 if (skb_queue_len(&sd->input_pkt_queue)) {
2940 enqueue:
2941                         __skb_queue_tail(&sd->input_pkt_queue, skb);
2942                         input_queue_tail_incr_save(sd, qtail);
2943                         rps_unlock(sd);
2944                         local_irq_restore(flags);
2945                         return NET_RX_SUCCESS;
2946                 }
2947
2948                 /* Schedule NAPI for backlog device
2949                  * We can use non atomic operation since we own the queue lock
2950                  */
2951                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2952                         if (!rps_ipi_queued(sd))
2953                                 ____napi_schedule(sd, &sd->backlog);
2954                 }
2955                 goto enqueue;
2956         }
2957
2958         sd->dropped++;
2959         rps_unlock(sd);
2960
2961         local_irq_restore(flags);
2962
2963         atomic_long_inc(&skb->dev->rx_dropped);
2964         kfree_skb(skb);
2965         return NET_RX_DROP;
2966 }
2967
2968 /**
2969  *      netif_rx        -       post buffer to the network code
2970  *      @skb: buffer to post
2971  *
2972  *      This function receives a packet from a device driver and queues it for
2973  *      the upper (protocol) levels to process.  It always succeeds. The buffer
2974  *      may be dropped during processing for congestion control or by the
2975  *      protocol layers.
2976  *
2977  *      return values:
2978  *      NET_RX_SUCCESS  (no congestion)
2979  *      NET_RX_DROP     (packet was dropped)
2980  *
2981  */
2982
2983 int netif_rx(struct sk_buff *skb)
2984 {
2985         int ret;
2986
2987         /* if netpoll wants it, pretend we never saw it */
2988         if (netpoll_rx(skb))
2989                 return NET_RX_DROP;
2990
2991         net_timestamp_check(netdev_tstamp_prequeue, skb);
2992
2993         trace_netif_rx(skb);
2994 #ifdef CONFIG_RPS
2995         {
2996                 struct rps_dev_flow voidflow, *rflow = &voidflow;
2997                 int cpu;
2998
2999                 preempt_disable();
3000                 rcu_read_lock();
3001
3002                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3003                 if (cpu < 0)
3004                         cpu = smp_processor_id();
3005
3006                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3007
3008                 rcu_read_unlock();
3009                 preempt_enable();
3010         }
3011 #else
3012         {
3013                 unsigned int qtail;
3014                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3015                 put_cpu();
3016         }
3017 #endif
3018         return ret;
3019 }
3020 EXPORT_SYMBOL(netif_rx);
3021
3022 int netif_rx_ni(struct sk_buff *skb)
3023 {
3024         int err;
3025
3026         preempt_disable();
3027         err = netif_rx(skb);
3028         if (local_softirq_pending())
3029                 do_softirq();
3030         preempt_enable();
3031
3032         return err;
3033 }
3034 EXPORT_SYMBOL(netif_rx_ni);
3035
3036 static void net_tx_action(struct softirq_action *h)
3037 {
3038         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3039
3040         if (sd->completion_queue) {
3041                 struct sk_buff *clist;
3042
3043                 local_irq_disable();
3044                 clist = sd->completion_queue;
3045                 sd->completion_queue = NULL;
3046                 local_irq_enable();
3047
3048                 while (clist) {
3049                         struct sk_buff *skb = clist;
3050                         clist = clist->next;
3051
3052                         WARN_ON(atomic_read(&skb->users));
3053                         trace_kfree_skb(skb, net_tx_action);
3054                         __kfree_skb(skb);
3055                 }
3056         }
3057
3058         if (sd->output_queue) {
3059                 struct Qdisc *head;
3060
3061                 local_irq_disable();
3062                 head = sd->output_queue;
3063                 sd->output_queue = NULL;
3064                 sd->output_queue_tailp = &sd->output_queue;
3065                 local_irq_enable();
3066
3067                 while (head) {
3068                         struct Qdisc *q = head;
3069                         spinlock_t *root_lock;
3070
3071                         head = head->next_sched;
3072
3073                         root_lock = qdisc_lock(q);
3074                         if (spin_trylock(root_lock)) {
3075                                 smp_mb__before_clear_bit();
3076                                 clear_bit(__QDISC_STATE_SCHED,
3077                                           &q->state);
3078                                 qdisc_run(q);
3079                                 spin_unlock(root_lock);
3080                         } else {
3081                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3082                                               &q->state)) {
3083                                         __netif_reschedule(q);
3084                                 } else {
3085                                         smp_mb__before_clear_bit();
3086                                         clear_bit(__QDISC_STATE_SCHED,
3087                                                   &q->state);
3088                                 }
3089                         }
3090                 }
3091         }
3092 }
3093
3094 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3095     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3096 /* This hook is defined here for ATM LANE */
3097 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3098                              unsigned char *addr) __read_mostly;
3099 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3100 #endif
3101
3102 #ifdef CONFIG_NET_CLS_ACT
3103 /* TODO: Maybe we should just force sch_ingress to be compiled in
3104  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3105  * a compare and 2 stores extra right now if we dont have it on
3106  * but have CONFIG_NET_CLS_ACT
3107  * NOTE: This doesn't stop any functionality; if you dont have
3108  * the ingress scheduler, you just can't add policies on ingress.
3109  *
3110  */
3111 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3112 {
3113         struct net_device *dev = skb->dev;
3114         u32 ttl = G_TC_RTTL(skb->tc_verd);
3115         int result = TC_ACT_OK;
3116         struct Qdisc *q;
3117
3118         if (unlikely(MAX_RED_LOOP < ttl++)) {
3119                 if (net_ratelimit())
3120                         pr_warning( "Redir loop detected Dropping packet (%d->%d)\n",
3121                                skb->skb_iif, dev->ifindex);
3122                 return TC_ACT_SHOT;
3123         }
3124
3125         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3126         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3127
3128         q = rxq->qdisc;
3129         if (q != &noop_qdisc) {
3130                 spin_lock(qdisc_lock(q));
3131                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3132                         result = qdisc_enqueue_root(skb, q);
3133                 spin_unlock(qdisc_lock(q));
3134         }
3135
3136         return result;
3137 }
3138
3139 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3140                                          struct packet_type **pt_prev,
3141                                          int *ret, struct net_device *orig_dev)
3142 {
3143         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3144
3145         if (!rxq || rxq->qdisc == &noop_qdisc)
3146                 goto out;
3147
3148         if (*pt_prev) {
3149                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3150                 *pt_prev = NULL;
3151         }
3152
3153         switch (ing_filter(skb, rxq)) {
3154         case TC_ACT_SHOT:
3155         case TC_ACT_STOLEN:
3156                 kfree_skb(skb);
3157                 return NULL;
3158         }
3159
3160 out:
3161         skb->tc_verd = 0;
3162         return skb;
3163 }
3164 #endif
3165
3166 /**
3167  *      netdev_rx_handler_register - register receive handler
3168  *      @dev: device to register a handler for
3169  *      @rx_handler: receive handler to register
3170  *      @rx_handler_data: data pointer that is used by rx handler
3171  *
3172  *      Register a receive hander for a device. This handler will then be
3173  *      called from __netif_receive_skb. A negative errno code is returned
3174  *      on a failure.
3175  *
3176  *      The caller must hold the rtnl_mutex.
3177  *
3178  *      For a general description of rx_handler, see enum rx_handler_result.
3179  */
3180 int netdev_rx_handler_register(struct net_device *dev,
3181                                rx_handler_func_t *rx_handler,
3182                                void *rx_handler_data)
3183 {
3184         ASSERT_RTNL();
3185
3186         if (dev->rx_handler)
3187                 return -EBUSY;
3188
3189         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3190         rcu_assign_pointer(dev->rx_handler, rx_handler);
3191
3192         return 0;
3193 }
3194 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3195
3196 /**
3197  *      netdev_rx_handler_unregister - unregister receive handler
3198  *      @dev: device to unregister a handler from
3199  *
3200  *      Unregister a receive hander from a device.
3201  *
3202  *      The caller must hold the rtnl_mutex.
3203  */
3204 void netdev_rx_handler_unregister(struct net_device *dev)
3205 {
3206
3207         ASSERT_RTNL();
3208         RCU_INIT_POINTER(dev->rx_handler, NULL);
3209         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3210 }
3211 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3212
3213 static int __netif_receive_skb(struct sk_buff *skb)
3214 {
3215         struct packet_type *ptype, *pt_prev;
3216         rx_handler_func_t *rx_handler;
3217         struct net_device *orig_dev;
3218         struct net_device *null_or_dev;
3219         bool deliver_exact = false;
3220         int ret = NET_RX_DROP;
3221         __be16 type;
3222
3223         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3224
3225         trace_netif_receive_skb(skb);
3226
3227         /* if we've gotten here through NAPI, check netpoll */
3228         if (netpoll_receive_skb(skb))
3229                 return NET_RX_DROP;
3230
3231         if (!skb->skb_iif)
3232                 skb->skb_iif = skb->dev->ifindex;
3233         orig_dev = skb->dev;
3234
3235         skb_reset_network_header(skb);
3236         skb_reset_transport_header(skb);
3237         skb_reset_mac_len(skb);
3238
3239         pt_prev = NULL;
3240
3241         rcu_read_lock();
3242
3243 another_round:
3244
3245         __this_cpu_inc(softnet_data.processed);
3246
3247         if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3248                 skb = vlan_untag(skb);
3249                 if (unlikely(!skb))
3250                         goto out;
3251         }
3252
3253 #ifdef CONFIG_NET_CLS_ACT
3254         if (skb->tc_verd & TC_NCLS) {
3255                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3256                 goto ncls;
3257         }
3258 #endif
3259
3260         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3261                 if (!ptype->dev || ptype->dev == skb->dev) {
3262                         if (pt_prev)
3263                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3264                         pt_prev = ptype;
3265                 }
3266         }
3267
3268 #ifdef CONFIG_NET_CLS_ACT
3269         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3270         if (!skb)
3271                 goto out;
3272 ncls:
3273 #endif
3274
3275         rx_handler = rcu_dereference(skb->dev->rx_handler);
3276         if (vlan_tx_tag_present(skb)) {
3277                 if (pt_prev) {
3278                         ret = deliver_skb(skb, pt_prev, orig_dev);
3279                         pt_prev = NULL;
3280                 }
3281                 if (vlan_do_receive(&skb, !rx_handler))
3282                         goto another_round;
3283                 else if (unlikely(!skb))
3284                         goto out;
3285         }
3286
3287         if (rx_handler) {
3288                 if (pt_prev) {
3289                         ret = deliver_skb(skb, pt_prev, orig_dev);
3290                         pt_prev = NULL;
3291                 }
3292                 switch (rx_handler(&skb)) {
3293                 case RX_HANDLER_CONSUMED:
3294                         goto out;
3295                 case RX_HANDLER_ANOTHER:
3296                         goto another_round;
3297                 case RX_HANDLER_EXACT:
3298                         deliver_exact = true;
3299                 case RX_HANDLER_PASS:
3300                         break;
3301                 default:
3302                         BUG();
3303                 }
3304         }
3305
3306         /* deliver only exact match when indicated */
3307         null_or_dev = deliver_exact ? skb->dev : NULL;
3308
3309         type = skb->protocol;
3310         list_for_each_entry_rcu(ptype,
3311                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3312                 if (ptype->type == type &&
3313                     (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3314                      ptype->dev == orig_dev)) {
3315                         if (pt_prev)
3316                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3317                         pt_prev = ptype;
3318                 }
3319         }
3320
3321         if (pt_prev) {
3322                 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3323         } else {
3324                 atomic_long_inc(&skb->dev->rx_dropped);
3325                 kfree_skb(skb);
3326                 /* Jamal, now you will not able to escape explaining
3327                  * me how you were going to use this. :-)
3328                  */
3329                 ret = NET_RX_DROP;
3330         }
3331
3332 out:
3333         rcu_read_unlock();
3334         return ret;
3335 }
3336
3337 /**
3338  *      netif_receive_skb - process receive buffer from network
3339  *      @skb: buffer to process
3340  *
3341  *      netif_receive_skb() is the main receive data processing function.
3342  *      It always succeeds. The buffer may be dropped during processing
3343  *      for congestion control or by the protocol layers.
3344  *
3345  *      This function may only be called from softirq context and interrupts
3346  *      should be enabled.
3347  *
3348  *      Return values (usually ignored):
3349  *      NET_RX_SUCCESS: no congestion
3350  *      NET_RX_DROP: packet was dropped
3351  */
3352 int netif_receive_skb(struct sk_buff *skb)
3353 {
3354         net_timestamp_check(netdev_tstamp_prequeue, skb);
3355
3356         if (skb_defer_rx_timestamp(skb))
3357                 return NET_RX_SUCCESS;
3358
3359 #ifdef CONFIG_RPS
3360         {
3361                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3362                 int cpu, ret;
3363
3364                 rcu_read_lock();
3365
3366                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3367
3368                 if (cpu >= 0) {
3369                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3370                         rcu_read_unlock();
3371                 } else {
3372                         rcu_read_unlock();
3373                         ret = __netif_receive_skb(skb);
3374                 }
3375
3376                 return ret;
3377         }
3378 #else
3379         return __netif_receive_skb(skb);
3380 #endif
3381 }
3382 EXPORT_SYMBOL(netif_receive_skb);
3383
3384 /* Network device is going away, flush any packets still pending
3385  * Called with irqs disabled.
3386  */
3387 static void flush_backlog(void *arg)
3388 {
3389         struct net_device *dev = arg;
3390         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3391         struct sk_buff *skb, *tmp;
3392
3393         rps_lock(sd);
3394         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3395                 if (skb->dev == dev) {
3396                         __skb_unlink(skb, &sd->input_pkt_queue);
3397                         kfree_skb(skb);
3398                         input_queue_head_incr(sd);
3399                 }
3400         }
3401         rps_unlock(sd);
3402
3403         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3404                 if (skb->dev == dev) {
3405                         __skb_unlink(skb, &sd->process_queue);
3406                         kfree_skb(skb);
3407                         input_queue_head_incr(sd);
3408                 }
3409         }
3410 }
3411
3412 static int napi_gro_complete(struct sk_buff *skb)
3413 {
3414         struct packet_type *ptype;
3415         __be16 type = skb->protocol;
3416         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3417         int err = -ENOENT;
3418
3419         if (NAPI_GRO_CB(skb)->count == 1) {
3420                 skb_shinfo(skb)->gso_size = 0;
3421                 goto out;
3422         }
3423
3424         rcu_read_lock();
3425         list_for_each_entry_rcu(ptype, head, list) {
3426                 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3427                         continue;
3428
3429                 err = ptype->gro_complete(skb);
3430                 break;
3431         }
3432         rcu_read_unlock();
3433
3434         if (err) {
3435                 WARN_ON(&ptype->list == head);
3436                 kfree_skb(skb);
3437                 return NET_RX_SUCCESS;
3438         }
3439
3440 out:
3441         return netif_receive_skb(skb);
3442 }
3443
3444 inline void napi_gro_flush(struct napi_struct *napi)
3445 {
3446         struct sk_buff *skb, *next;
3447
3448         for (skb = napi->gro_list; skb; skb = next) {
3449                 next = skb->next;
3450                 skb->next = NULL;
3451                 napi_gro_complete(skb);
3452         }
3453
3454         napi->gro_count = 0;
3455         napi->gro_list = NULL;
3456 }
3457 EXPORT_SYMBOL(napi_gro_flush);
3458
3459 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3460 {
3461         struct sk_buff **pp = NULL;
3462         struct packet_type *ptype;
3463         __be16 type = skb->protocol;
3464         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3465         int same_flow;
3466         int mac_len;
3467         enum gro_result ret;
3468
3469         if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3470                 goto normal;
3471
3472         if (skb_is_gso(skb) || skb_has_frag_list(skb))
3473                 goto normal;
3474
3475         rcu_read_lock();
3476         list_for_each_entry_rcu(ptype, head, list) {
3477                 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3478                         continue;
3479
3480                 skb_set_network_header(skb, skb_gro_offset(skb));
3481                 mac_len = skb->network_header - skb->mac_header;
3482                 skb->mac_len = mac_len;
3483                 NAPI_GRO_CB(skb)->same_flow = 0;
3484                 NAPI_GRO_CB(skb)->flush = 0;
3485                 NAPI_GRO_CB(skb)->free = 0;
3486
3487                 pp = ptype->gro_receive(&napi->gro_list, skb);
3488                 break;
3489         }
3490         rcu_read_unlock();
3491
3492         if (&ptype->list == head)
3493                 goto normal;
3494
3495         same_flow = NAPI_GRO_CB(skb)->same_flow;
3496         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3497
3498         if (pp) {
3499                 struct sk_buff *nskb = *pp;
3500
3501                 *pp = nskb->next;
3502                 nskb->next = NULL;
3503                 napi_gro_complete(nskb);
3504                 napi->gro_count--;
3505         }
3506
3507         if (same_flow)
3508                 goto ok;
3509
3510         if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3511                 goto normal;
3512
3513         napi->gro_count++;
3514         NAPI_GRO_CB(skb)->count = 1;
3515         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3516         skb->next = napi->gro_list;
3517         napi->gro_list = skb;
3518         ret = GRO_HELD;
3519
3520 pull:
3521         if (skb_headlen(skb) < skb_gro_offset(skb)) {
3522                 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3523
3524                 BUG_ON(skb->end - skb->tail < grow);
3525
3526                 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3527
3528                 skb->tail += grow;
3529                 skb->data_len -= grow;
3530
3531                 skb_shinfo(skb)->frags[0].page_offset += grow;
3532                 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3533
3534                 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3535                         skb_frag_unref(skb, 0);
3536                         memmove(skb_shinfo(skb)->frags,
3537                                 skb_shinfo(skb)->frags + 1,
3538                                 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3539                 }
3540         }
3541
3542 ok:
3543         return ret;
3544
3545 normal:
3546         ret = GRO_NORMAL;
3547         goto pull;
3548 }
3549 EXPORT_SYMBOL(dev_gro_receive);
3550
3551 static inline gro_result_t
3552 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3553 {
3554         struct sk_buff *p;
3555
3556         for (p = napi->gro_list; p; p = p->next) {
3557                 unsigned long diffs;
3558
3559                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3560                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3561                 diffs |= compare_ether_header(skb_mac_header(p),
3562                                               skb_gro_mac_header(skb));
3563                 NAPI_GRO_CB(p)->same_flow = !diffs;
3564                 NAPI_GRO_CB(p)->flush = 0;
3565         }
3566
3567         return dev_gro_receive(napi, skb);
3568 }
3569
3570 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3571 {
3572         switch (ret) {
3573         case GRO_NORMAL:
3574                 if (netif_receive_skb(skb))
3575                         ret = GRO_DROP;
3576                 break;
3577
3578         case GRO_DROP:
3579         case GRO_MERGED_FREE:
3580                 kfree_skb(skb);
3581                 break;
3582
3583         case GRO_HELD:
3584         case GRO_MERGED:
3585                 break;
3586         }
3587
3588         return ret;
3589 }
3590 EXPORT_SYMBOL(napi_skb_finish);
3591
3592 void skb_gro_reset_offset(struct sk_buff *skb)
3593 {
3594         NAPI_GRO_CB(skb)->data_offset = 0;
3595         NAPI_GRO_CB(skb)->frag0 = NULL;
3596         NAPI_GRO_CB(skb)->frag0_len = 0;
3597
3598         if (skb->mac_header == skb->tail &&
3599             !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) {
3600                 NAPI_GRO_CB(skb)->frag0 =
3601                         skb_frag_address(&skb_shinfo(skb)->frags[0]);
3602                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(&skb_shinfo(skb)->frags[0]);
3603         }
3604 }
3605 EXPORT_SYMBOL(skb_gro_reset_offset);
3606
3607 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3608 {
3609         skb_gro_reset_offset(skb);
3610
3611         return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3612 }
3613 EXPORT_SYMBOL(napi_gro_receive);
3614
3615 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3616 {
3617         __skb_pull(skb, skb_headlen(skb));
3618         skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3619         skb->vlan_tci = 0;
3620         skb->dev = napi->dev;
3621         skb->skb_iif = 0;
3622
3623         napi->skb = skb;
3624 }
3625
3626 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3627 {
3628         struct sk_buff *skb = napi->skb;
3629
3630         if (!skb) {
3631                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3632                 if (skb)
3633                         napi->skb = skb;
3634         }
3635         return skb;
3636 }
3637 EXPORT_SYMBOL(napi_get_frags);
3638
3639 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3640                                gro_result_t ret)
3641 {
3642         switch (ret) {
3643         case GRO_NORMAL:
3644         case GRO_HELD:
3645                 skb->protocol = eth_type_trans(skb, skb->dev);
3646
3647                 if (ret == GRO_HELD)
3648                         skb_gro_pull(skb, -ETH_HLEN);
3649                 else if (netif_receive_skb(skb))
3650                         ret = GRO_DROP;
3651                 break;
3652
3653         case GRO_DROP:
3654         case GRO_MERGED_FREE:
3655                 napi_reuse_skb(napi, skb);
3656                 break;
3657
3658         case GRO_MERGED:
3659                 break;
3660         }
3661
3662         return ret;
3663 }
3664 EXPORT_SYMBOL(napi_frags_finish);
3665
3666 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3667 {
3668         struct sk_buff *skb = napi->skb;
3669         struct ethhdr *eth;
3670         unsigned int hlen;
3671         unsigned int off;
3672
3673         napi->skb = NULL;
3674
3675         skb_reset_mac_header(skb);
3676         skb_gro_reset_offset(skb);
3677
3678         off = skb_gro_offset(skb);
3679         hlen = off + sizeof(*eth);
3680         eth = skb_gro_header_fast(skb, off);
3681         if (skb_gro_header_hard(skb, hlen)) {
3682                 eth = skb_gro_header_slow(skb, hlen, off);
3683                 if (unlikely(!eth)) {
3684                         napi_reuse_skb(napi, skb);
3685                         skb = NULL;
3686                         goto out;
3687                 }
3688         }
3689
3690         skb_gro_pull(skb, sizeof(*eth));
3691
3692         /*
3693          * This works because the only protocols we care about don't require
3694          * special handling.  We'll fix it up properly at the end.
3695          */
3696         skb->protocol = eth->h_proto;
3697
3698 out:
3699         return skb;
3700 }
3701 EXPORT_SYMBOL(napi_frags_skb);
3702
3703 gro_result_t napi_gro_frags(struct napi_struct *napi)
3704 {
3705         struct sk_buff *skb = napi_frags_skb(napi);
3706
3707         if (!skb)
3708                 return GRO_DROP;
3709
3710         return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3711 }
3712 EXPORT_SYMBOL(napi_gro_frags);
3713
3714 /*
3715  * net_rps_action sends any pending IPI's for rps.
3716  * Note: called with local irq disabled, but exits with local irq enabled.
3717  */
3718 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3719 {
3720 #ifdef CONFIG_RPS
3721         struct softnet_data *remsd = sd->rps_ipi_list;
3722
3723         if (remsd) {
3724                 sd->rps_ipi_list = NULL;
3725
3726                 local_irq_enable();
3727
3728                 /* Send pending IPI's to kick RPS processing on remote cpus. */
3729                 while (remsd) {
3730                         struct softnet_data *next = remsd->rps_ipi_next;
3731
3732                         if (cpu_online(remsd->cpu))
3733                                 __smp_call_function_single(remsd->cpu,
3734                                                            &remsd->csd, 0);
3735                         remsd = next;
3736                 }
3737         } else
3738 #endif
3739                 local_irq_enable();
3740 }
3741
3742 static int process_backlog(struct napi_struct *napi, int quota)
3743 {
3744         int work = 0;
3745         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3746
3747 #ifdef CONFIG_RPS
3748         /* Check if we have pending ipi, its better to send them now,
3749          * not waiting net_rx_action() end.
3750          */
3751         if (sd->rps_ipi_list) {
3752                 local_irq_disable();
3753                 net_rps_action_and_irq_enable(sd);
3754         }
3755 #endif
3756         napi->weight = weight_p;
3757         local_irq_disable();
3758         while (work < quota) {
3759                 struct sk_buff *skb;
3760                 unsigned int qlen;
3761
3762                 while ((skb = __skb_dequeue(&sd->process_queue))) {
3763                         local_irq_enable();
3764                         __netif_receive_skb(skb);
3765                         local_irq_disable();
3766                         input_queue_head_incr(sd);
3767                         if (++work >= quota) {
3768                                 local_irq_enable();
3769                                 return work;
3770                         }
3771                 }
3772
3773                 rps_lock(sd);
3774                 qlen = skb_queue_len(&sd->input_pkt_queue);
3775                 if (qlen)
3776                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
3777                                                    &sd->process_queue);
3778
3779                 if (qlen < quota - work) {
3780                         /*
3781                          * Inline a custom version of __napi_complete().
3782                          * only current cpu owns and manipulates this napi,
3783                          * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3784                          * we can use a plain write instead of clear_bit(),
3785                          * and we dont need an smp_mb() memory barrier.
3786                          */
3787                         list_del(&napi->poll_list);
3788                         napi->state = 0;
3789
3790                         quota = work + qlen;
3791                 }
3792                 rps_unlock(sd);
3793         }
3794         local_irq_enable();
3795
3796         return work;
3797 }
3798
3799 /**
3800  * __napi_schedule - schedule for receive
3801  * @n: entry to schedule
3802  *
3803  * The entry's receive function will be scheduled to run
3804  */
3805 void __napi_schedule(struct napi_struct *n)
3806 {
3807         unsigned long flags;
3808
3809         local_irq_save(flags);
3810         ____napi_schedule(&__get_cpu_var(softnet_data), n);
3811         local_irq_restore(flags);
3812 }
3813 EXPORT_SYMBOL(__napi_schedule);
3814
3815 void __napi_complete(struct napi_struct *n)
3816 {
3817         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3818         BUG_ON(n->gro_list);
3819
3820         list_del(&n->poll_list);
3821         smp_mb__before_clear_bit();
3822         clear_bit(NAPI_STATE_SCHED, &n->state);
3823 }
3824 EXPORT_SYMBOL(__napi_complete);
3825
3826 void napi_complete(struct napi_struct *n)
3827 {
3828         unsigned long flags;
3829
3830         /*
3831          * don't let napi dequeue from the cpu poll list
3832          * just in case its running on a different cpu
3833          */
3834         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3835                 return;
3836
3837         napi_gro_flush(n);
3838         local_irq_save(flags);
3839         __napi_complete(n);
3840         local_irq_restore(flags);
3841 }
3842 EXPORT_SYMBOL(napi_complete);
3843
3844 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3845                     int (*poll)(struct napi_struct *, int), int weight)
3846 {
3847         INIT_LIST_HEAD(&napi->poll_list);
3848         napi->gro_count = 0;
3849         napi->gro_list = NULL;
3850         napi->skb = NULL;
3851         napi->poll = poll;
3852         napi->weight = weight;
3853         list_add(&napi->dev_list, &dev->napi_list);
3854         napi->dev = dev;
3855 #ifdef CONFIG_NETPOLL
3856         spin_lock_init(&napi->poll_lock);
3857         napi->poll_owner = -1;
3858 #endif
3859         set_bit(NAPI_STATE_SCHED, &napi->state);
3860 }
3861 EXPORT_SYMBOL(netif_napi_add);
3862
3863 void netif_napi_del(struct napi_struct *napi)
3864 {
3865         struct sk_buff *skb, *next;
3866
3867         list_del_init(&napi->dev_list);
3868         napi_free_frags(napi);
3869
3870         for (skb = napi->gro_list; skb; skb = next) {
3871                 next = skb->next;
3872                 skb->next = NULL;
3873                 kfree_skb(skb);
3874         }
3875
3876         napi->gro_list = NULL;
3877         napi->gro_count = 0;
3878 }
3879 EXPORT_SYMBOL(netif_napi_del);
3880
3881 static void net_rx_action(struct softirq_action *h)
3882 {
3883         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3884         unsigned long time_limit = jiffies + 2;
3885         int budget = netdev_budget;
3886         void *have;
3887
3888         local_irq_disable();
3889
3890         while (!list_empty(&sd->poll_list)) {
3891                 struct napi_struct *n;
3892                 int work, weight;
3893
3894                 /* If softirq window is exhuasted then punt.
3895                  * Allow this to run for 2 jiffies since which will allow
3896                  * an average latency of 1.5/HZ.
3897                  */
3898                 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3899                         goto softnet_break;
3900
3901                 local_irq_enable();
3902
3903                 /* Even though interrupts have been re-enabled, this
3904                  * access is safe because interrupts can only add new
3905                  * entries to the tail of this list, and only ->poll()
3906                  * calls can remove this head entry from the list.
3907                  */
3908                 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3909
3910                 have = netpoll_poll_lock(n);
3911
3912                 weight = n->weight;
3913
3914                 /* This NAPI_STATE_SCHED test is for avoiding a race
3915                  * with netpoll's poll_napi().  Only the entity which
3916                  * obtains the lock and sees NAPI_STATE_SCHED set will
3917                  * actually make the ->poll() call.  Therefore we avoid
3918                  * accidentally calling ->poll() when NAPI is not scheduled.
3919                  */
3920                 work = 0;
3921                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3922                         work = n->poll(n, weight);
3923                         trace_napi_poll(n);
3924                 }
3925
3926                 WARN_ON_ONCE(work > weight);
3927
3928                 budget -= work;
3929
3930                 local_irq_disable();
3931
3932                 /* Drivers must not modify the NAPI state if they
3933                  * consume the entire weight.  In such cases this code
3934                  * still "owns" the NAPI instance and therefore can
3935                  * move the instance around on the list at-will.
3936                  */
3937                 if (unlikely(work == weight)) {
3938                         if (unlikely(napi_disable_pending(n))) {
3939                                 local_irq_enable();
3940                                 napi_complete(n);
3941                                 local_irq_disable();
3942                         } else
3943                                 list_move_tail(&n->poll_list, &sd->poll_list);
3944                 }
3945
3946                 netpoll_poll_unlock(have);
3947         }
3948 out:
3949         net_rps_action_and_irq_enable(sd);
3950
3951 #ifdef CONFIG_NET_DMA
3952         /*
3953          * There may not be any more sk_buffs coming right now, so push
3954          * any pending DMA copies to hardware
3955          */
3956         dma_issue_pending_all();
3957 #endif
3958
3959         return;
3960
3961 softnet_break:
3962         sd->time_squeeze++;
3963         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3964         goto out;
3965 }
3966
3967 static gifconf_func_t *gifconf_list[NPROTO];
3968
3969 /**
3970  *      register_gifconf        -       register a SIOCGIF handler
3971  *      @family: Address family
3972  *      @gifconf: Function handler
3973  *
3974  *      Register protocol dependent address dumping routines. The handler
3975  *      that is passed must not be freed or reused until it has been replaced
3976  *      by another handler.
3977  */
3978 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3979 {
3980         if (family >= NPROTO)
3981                 return -EINVAL;
3982         gifconf_list[family] = gifconf;
3983         return 0;
3984 }
3985 EXPORT_SYMBOL(register_gifconf);
3986
3987
3988 /*
3989  *      Map an interface index to its name (SIOCGIFNAME)
3990  */
3991
3992 /*
3993  *      We need this ioctl for efficient implementation of the
3994  *      if_indextoname() function required by the IPv6 API.  Without
3995  *      it, we would have to search all the interfaces to find a
3996  *      match.  --pb
3997  */
3998
3999 static int dev_ifname(struct net *net, struct ifreq __user *arg)
4000 {
4001         struct net_device *dev;
4002         struct ifreq ifr;
4003
4004         /*
4005          *      Fetch the caller's info block.
4006          */
4007
4008         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4009                 return -EFAULT;
4010
4011         rcu_read_lock();
4012         dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
4013         if (!dev) {
4014                 rcu_read_unlock();
4015                 return -ENODEV;
4016         }
4017
4018         strcpy(ifr.ifr_name, dev->name);
4019         rcu_read_unlock();
4020
4021         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
4022                 return -EFAULT;
4023         return 0;
4024 }
4025
4026 /*
4027  *      Perform a SIOCGIFCONF call. This structure will change
4028  *      size eventually, and there is nothing I can do about it.
4029  *      Thus we will need a 'compatibility mode'.
4030  */
4031
4032 static int dev_ifconf(struct net *net, char __user *arg)
4033 {
4034         struct ifconf ifc;
4035         struct net_device *dev;
4036         char __user *pos;
4037         int len;
4038         int total;
4039         int i;
4040
4041         /*
4042          *      Fetch the caller's info block.
4043          */
4044
4045         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
4046                 return -EFAULT;
4047
4048         pos = ifc.ifc_buf;
4049         len = ifc.ifc_len;
4050
4051         /*
4052          *      Loop over the interfaces, and write an info block for each.
4053          */
4054
4055         total = 0;
4056         for_each_netdev(net, dev) {
4057                 for (i = 0; i < NPROTO; i++) {
4058                         if (gifconf_list[i]) {
4059                                 int done;
4060                                 if (!pos)
4061                                         done = gifconf_list[i](dev, NULL, 0);
4062                                 else
4063                                         done = gifconf_list[i](dev, pos + total,
4064                                                                len - total);
4065                                 if (done < 0)
4066                                         return -EFAULT;
4067                                 total += done;
4068                         }
4069                 }
4070         }
4071
4072         /*
4073          *      All done.  Write the updated control block back to the caller.
4074          */
4075         ifc.ifc_len = total;
4076
4077         /*
4078          *      Both BSD and Solaris return 0 here, so we do too.
4079          */
4080         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4081 }
4082
4083 #ifdef CONFIG_PROC_FS
4084
4085 #define BUCKET_SPACE (32 - NETDEV_HASHBITS)
4086
4087 struct dev_iter_state {
4088         struct seq_net_private p;
4089         unsigned int pos; /* bucket << BUCKET_SPACE + offset */
4090 };
4091
4092 #define get_bucket(x) ((x) >> BUCKET_SPACE)
4093 #define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
4094 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
4095
4096 static inline struct net_device *dev_from_same_bucket(struct seq_file *seq)
4097 {
4098         struct dev_iter_state *state = seq->private;
4099         struct net *net = seq_file_net(seq);
4100         struct net_device *dev;
4101         struct hlist_node *p;
4102         struct hlist_head *h;
4103         unsigned int count, bucket, offset;
4104
4105         bucket = get_bucket(state->pos);
4106         offset = get_offset(state->pos);
4107         h = &net->dev_name_head[bucket];
4108         count = 0;
4109         hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4110                 if (count++ == offset) {
4111                         state->pos = set_bucket_offset(bucket, count);
4112                         return dev;
4113                 }
4114         }
4115
4116         return NULL;
4117 }
4118
4119 static inline struct net_device *dev_from_new_bucket(struct seq_file *seq)
4120 {
4121         struct dev_iter_state *state = seq->private;
4122         struct net_device *dev;
4123         unsigned int bucket;
4124
4125         bucket = get_bucket(state->pos);
4126         do {
4127                 dev = dev_from_same_bucket(seq);
4128                 if (dev)
4129                         return dev;
4130
4131                 bucket++;
4132                 state->pos = set_bucket_offset(bucket, 0);
4133         } while (bucket < NETDEV_HASHENTRIES);
4134
4135         return NULL;
4136 }
4137
4138 /*
4139  *      This is invoked by the /proc filesystem handler to display a device
4140  *      in detail.
4141  */
4142 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4143         __acquires(RCU)
4144 {
4145         struct dev_iter_state *state = seq->private;
4146
4147         rcu_read_lock();
4148         if (!*pos)
4149                 return SEQ_START_TOKEN;
4150
4151         /* check for end of the hash */
4152         if (state->pos == 0 && *pos > 1)
4153                 return NULL;
4154
4155         return dev_from_new_bucket(seq);
4156 }
4157
4158 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4159 {
4160         struct net_device *dev;
4161
4162         ++*pos;
4163
4164         if (v == SEQ_START_TOKEN)
4165                 return dev_from_new_bucket(seq);
4166
4167         dev = dev_from_same_bucket(seq);
4168         if (dev)
4169                 return dev;
4170
4171         return dev_from_new_bucket(seq);
4172 }
4173
4174 void dev_seq_stop(struct seq_file *seq, void *v)
4175         __releases(RCU)
4176 {
4177         rcu_read_unlock();
4178 }
4179
4180 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4181 {
4182         struct rtnl_link_stats64 temp;
4183         const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4184
4185         seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4186                    "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4187                    dev->name, stats->rx_bytes, stats->rx_packets,
4188                    stats->rx_errors,
4189                    stats->rx_dropped + stats->rx_missed_errors,
4190                    stats->rx_fifo_errors,
4191                    stats->rx_length_errors + stats->rx_over_errors +
4192                     stats->rx_crc_errors + stats->rx_frame_errors,
4193                    stats->rx_compressed, stats->multicast,
4194                    stats->tx_bytes, stats->tx_packets,
4195                    stats->tx_errors, stats->tx_dropped,
4196                    stats->tx_fifo_errors, stats->collisions,
4197                    stats->tx_carrier_errors +
4198                     stats->tx_aborted_errors +
4199                     stats->tx_window_errors +
4200                     stats->tx_heartbeat_errors,
4201                    stats->tx_compressed);
4202 }
4203
4204 /*
4205  *      Called from the PROCfs module. This now uses the new arbitrary sized
4206  *      /proc/net interface to create /proc/net/dev
4207  */
4208 static int dev_seq_show(struct seq_file *seq, void *v)
4209 {
4210         if (v == SEQ_START_TOKEN)
4211                 seq_puts(seq, "Inter-|   Receive                            "
4212                               "                    |  Transmit\n"
4213                               " face |bytes    packets errs drop fifo frame "
4214                               "compressed multicast|bytes    packets errs "
4215                               "drop fifo colls carrier compressed\n");
4216         else
4217                 dev_seq_printf_stats(seq, v);
4218         return 0;
4219 }
4220
4221 static struct softnet_data *softnet_get_online(loff_t *pos)
4222 {
4223         struct softnet_data *sd = NULL;
4224
4225         while (*pos < nr_cpu_ids)
4226                 if (cpu_online(*pos)) {
4227                         sd = &per_cpu(softnet_data, *pos);
4228                         break;
4229                 } else
4230                         ++*pos;
4231         return sd;
4232 }
4233
4234 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4235 {
4236         return softnet_get_online(pos);
4237 }
4238
4239 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4240 {
4241         ++*pos;
4242         return softnet_get_online(pos);
4243 }
4244
4245 static void softnet_seq_stop(struct seq_file *seq, void *v)
4246 {
4247 }
4248
4249 static int softnet_seq_show(struct seq_file *seq, void *v)
4250 {
4251         struct softnet_data *sd = v;
4252
4253         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4254                    sd->processed, sd->dropped, sd->time_squeeze, 0,
4255                    0, 0, 0, 0, /* was fastroute */
4256                    sd->cpu_collision, sd->received_rps);
4257         return 0;
4258 }
4259
4260 static const struct seq_operations dev_seq_ops = {
4261         .start = dev_seq_start,
4262         .next  = dev_seq_next,
4263         .stop  = dev_seq_stop,
4264         .show  = dev_seq_show,
4265 };
4266
4267 static int dev_seq_open(struct inode *inode, struct file *file)
4268 {
4269         return seq_open_net(inode, file, &dev_seq_ops,
4270                             sizeof(struct dev_iter_state));
4271 }
4272
4273 static const struct file_operations dev_seq_fops = {
4274         .owner   = THIS_MODULE,
4275         .open    = dev_seq_open,
4276         .read    = seq_read,
4277         .llseek  = seq_lseek,
4278         .release = seq_release_net,
4279 };
4280
4281 static const struct seq_operations softnet_seq_ops = {
4282         .start = softnet_seq_start,
4283         .next  = softnet_seq_next,
4284         .stop  = softnet_seq_stop,
4285         .show  = softnet_seq_show,
4286 };
4287
4288 static int softnet_seq_open(struct inode *inode, struct file *file)
4289 {
4290         return seq_open(file, &softnet_seq_ops);
4291 }
4292
4293 static const struct file_operations softnet_seq_fops = {
4294         .owner   = THIS_MODULE,
4295         .open    = softnet_seq_open,
4296         .read    = seq_read,
4297         .llseek  = seq_lseek,
4298         .release = seq_release,
4299 };
4300
4301 static void *ptype_get_idx(loff_t pos)
4302 {
4303         struct packet_type *pt = NULL;
4304         loff_t i = 0;
4305         int t;
4306
4307         list_for_each_entry_rcu(pt, &ptype_all, list) {
4308                 if (i == pos)
4309                         return pt;
4310                 ++i;
4311         }
4312
4313         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4314                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4315                         if (i == pos)
4316                                 return pt;
4317                         ++i;
4318                 }
4319         }
4320         return NULL;
4321 }
4322
4323 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4324         __acquires(RCU)
4325 {
4326         rcu_read_lock();
4327         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4328 }
4329
4330 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4331 {
4332         struct packet_type *pt;
4333         struct list_head *nxt;
4334         int hash;
4335
4336         ++*pos;
4337         if (v == SEQ_START_TOKEN)
4338                 return ptype_get_idx(0);
4339
4340         pt = v;
4341         nxt = pt->list.next;
4342         if (pt->type == htons(ETH_P_ALL)) {
4343                 if (nxt != &ptype_all)
4344                         goto found;
4345                 hash = 0;
4346                 nxt = ptype_base[0].next;
4347         } else
4348                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4349
4350         while (nxt == &ptype_base[hash]) {
4351                 if (++hash >= PTYPE_HASH_SIZE)
4352                         return NULL;
4353                 nxt = ptype_base[hash].next;
4354         }
4355 found:
4356         return list_entry(nxt, struct packet_type, list);
4357 }
4358
4359 static void ptype_seq_stop(struct seq_file *seq, void *v)
4360         __releases(RCU)
4361 {
4362         rcu_read_unlock();
4363 }
4364
4365 static int ptype_seq_show(struct seq_file *seq, void *v)
4366 {
4367         struct packet_type *pt = v;
4368
4369         if (v == SEQ_START_TOKEN)
4370                 seq_puts(seq, "Type Device      Function\n");
4371         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4372                 if (pt->type == htons(ETH_P_ALL))
4373                         seq_puts(seq, "ALL ");
4374                 else
4375                         seq_printf(seq, "%04x", ntohs(pt->type));
4376
4377                 seq_printf(seq, " %-8s %pF\n",
4378                            pt->dev ? pt->dev->name : "", pt->func);
4379         }
4380
4381         return 0;
4382 }
4383
4384 static const struct seq_operations ptype_seq_ops = {
4385         .start = ptype_seq_start,
4386         .next  = ptype_seq_next,
4387         .stop  = ptype_seq_stop,
4388         .show  = ptype_seq_show,
4389 };
4390
4391 static int ptype_seq_open(struct inode *inode, struct file *file)
4392 {
4393         return seq_open_net(inode, file, &ptype_seq_ops,
4394                         sizeof(struct seq_net_private));
4395 }
4396
4397 static const struct file_operations ptype_seq_fops = {
4398         .owner   = THIS_MODULE,
4399         .open    = ptype_seq_open,
4400         .read    = seq_read,
4401         .llseek  = seq_lseek,
4402         .release = seq_release_net,
4403 };
4404
4405
4406 static int __net_init dev_proc_net_init(struct net *net)
4407 {
4408         int rc = -ENOMEM;
4409
4410         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4411                 goto out;
4412         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4413                 goto out_dev;
4414         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4415                 goto out_softnet;
4416
4417         if (wext_proc_init(net))
4418                 goto out_ptype;
4419         rc = 0;
4420 out:
4421         return rc;
4422 out_ptype:
4423         proc_net_remove(net, "ptype");
4424 out_softnet:
4425         proc_net_remove(net, "softnet_stat");
4426 out_dev:
4427         proc_net_remove(net, "dev");
4428         goto out;
4429 }
4430
4431 static void __net_exit dev_proc_net_exit(struct net *net)
4432 {
4433         wext_proc_exit(net);
4434
4435         proc_net_remove(net, "ptype");
4436         proc_net_remove(net, "softnet_stat");
4437         proc_net_remove(net, "dev");
4438 }
4439
4440 static struct pernet_operations __net_initdata dev_proc_ops = {
4441         .init = dev_proc_net_init,
4442         .exit = dev_proc_net_exit,
4443 };
4444
4445 static int __init dev_proc_init(void)
4446 {
4447         return register_pernet_subsys(&dev_proc_ops);
4448 }
4449 #else
4450 #define dev_proc_init() 0
4451 #endif  /* CONFIG_PROC_FS */
4452
4453
4454 /**
4455  *      netdev_set_master       -       set up master pointer
4456  *      @slave: slave device
4457  *      @master: new master device
4458  *
4459  *      Changes the master device of the slave. Pass %NULL to break the
4460  *      bonding. The caller must hold the RTNL semaphore. On a failure
4461  *      a negative errno code is returned. On success the reference counts
4462  *      are adjusted and the function returns zero.
4463  */
4464 int netdev_set_master(struct net_device *slave, struct net_device *master)
4465 {
4466         struct net_device *old = slave->master;
4467
4468         ASSERT_RTNL();
4469
4470         if (master) {
4471                 if (old)
4472                         return -EBUSY;
4473                 dev_hold(master);
4474         }
4475
4476         slave->master = master;
4477
4478         if (old)
4479                 dev_put(old);
4480         return 0;
4481 }
4482 EXPORT_SYMBOL(netdev_set_master);
4483
4484 /**
4485  *      netdev_set_bond_master  -       set up bonding master/slave pair
4486  *      @slave: slave device
4487  *      @master: new master device
4488  *
4489  *      Changes the master device of the slave. Pass %NULL to break the
4490  *      bonding. The caller must hold the RTNL semaphore. On a failure
4491  *      a negative errno code is returned. On success %RTM_NEWLINK is sent
4492  *      to the routing socket and the function returns zero.
4493  */
4494 int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4495 {
4496         int err;
4497
4498         ASSERT_RTNL();
4499
4500         err = netdev_set_master(slave, master);
4501         if (err)
4502                 return err;
4503         if (master)
4504                 slave->flags |= IFF_SLAVE;
4505         else
4506                 slave->flags &= ~IFF_SLAVE;
4507
4508         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4509         return 0;
4510 }
4511 EXPORT_SYMBOL(netdev_set_bond_master);
4512
4513 static void dev_change_rx_flags(struct net_device *dev, int flags)
4514 {
4515         const struct net_device_ops *ops = dev->netdev_ops;
4516
4517         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4518                 ops->ndo_change_rx_flags(dev, flags);
4519 }
4520
4521 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4522 {
4523         unsigned short old_flags = dev->flags;
4524         uid_t uid;
4525         gid_t gid;
4526
4527         ASSERT_RTNL();
4528
4529         dev->flags |= IFF_PROMISC;
4530         dev->promiscuity += inc;
4531         if (dev->promiscuity == 0) {
4532                 /*
4533                  * Avoid overflow.
4534                  * If inc causes overflow, untouch promisc and return error.
4535                  */
4536                 if (inc < 0)
4537                         dev->flags &= ~IFF_PROMISC;
4538                 else {
4539                         dev->promiscuity -= inc;
4540                         printk(KERN_WARNING "%s: promiscuity touches roof, "
4541                                 "set promiscuity failed, promiscuity feature "
4542                                 "of device might be broken.\n", dev->name);
4543                         return -EOVERFLOW;
4544                 }
4545         }
4546         if (dev->flags != old_flags) {
4547                 printk(KERN_INFO "device %s %s promiscuous mode\n",
4548                        dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4549                                                                "left");
4550                 if (audit_enabled) {
4551                         current_uid_gid(&uid, &gid);
4552                         audit_log(current->audit_context, GFP_ATOMIC,
4553                                 AUDIT_ANOM_PROMISCUOUS,
4554                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4555                                 dev->name, (dev->flags & IFF_PROMISC),
4556                                 (old_flags & IFF_PROMISC),
4557                                 audit_get_loginuid(current),
4558                                 uid, gid,
4559                                 audit_get_sessionid(current));
4560                 }
4561
4562                 dev_change_rx_flags(dev, IFF_PROMISC);
4563         }
4564         return 0;
4565 }
4566
4567 /**
4568  *      dev_set_promiscuity     - update promiscuity count on a device
4569  *      @dev: device
4570  *      @inc: modifier
4571  *
4572  *      Add or remove promiscuity from a device. While the count in the device
4573  *      remains above zero the interface remains promiscuous. Once it hits zero
4574  *      the device reverts back to normal filtering operation. A negative inc
4575  *      value is used to drop promiscuity on the device.
4576  *      Return 0 if successful or a negative errno code on error.
4577  */
4578 int dev_set_promiscuity(struct net_device *dev, int inc)
4579 {
4580         unsigned short old_flags = dev->flags;
4581         int err;
4582
4583         err = __dev_set_promiscuity(dev, inc);
4584         if (err < 0)
4585                 return err;
4586         if (dev->flags != old_flags)
4587                 dev_set_rx_mode(dev);
4588         return err;
4589 }
4590 EXPORT_SYMBOL(dev_set_promiscuity);
4591
4592 /**
4593  *      dev_set_allmulti        - update allmulti count on a device
4594  *      @dev: device
4595  *      @inc: modifier
4596  *
4597  *      Add or remove reception of all multicast frames to a device. While the
4598  *      count in the device remains above zero the interface remains listening
4599  *      to all interfaces. Once it hits zero the device reverts back to normal
4600  *      filtering operation. A negative @inc value is used to drop the counter
4601  *      when releasing a resource needing all multicasts.
4602  *      Return 0 if successful or a negative errno code on error.
4603  */
4604
4605 int dev_set_allmulti(struct net_device *dev, int inc)
4606 {
4607         unsigned short old_flags = dev->flags;
4608
4609         ASSERT_RTNL();
4610
4611         dev->flags |= IFF_ALLMULTI;
4612         dev->allmulti += inc;
4613         if (dev->allmulti == 0) {
4614                 /*
4615                  * Avoid overflow.
4616                  * If inc causes overflow, untouch allmulti and return error.
4617                  */
4618                 if (inc < 0)
4619                         dev->flags &= ~IFF_ALLMULTI;
4620                 else {
4621                         dev->allmulti -= inc;
4622                         printk(KERN_WARNING "%s: allmulti touches roof, "
4623                                 "set allmulti failed, allmulti feature of "
4624                                 "device might be broken.\n", dev->name);
4625                         return -EOVERFLOW;
4626                 }
4627         }
4628         if (dev->flags ^ old_flags) {
4629                 dev_change_rx_flags(dev, IFF_ALLMULTI);
4630                 dev_set_rx_mode(dev);
4631         }
4632         return 0;
4633 }
4634 EXPORT_SYMBOL(dev_set_allmulti);
4635
4636 /*
4637  *      Upload unicast and multicast address lists to device and
4638  *      configure RX filtering. When the device doesn't support unicast
4639  *      filtering it is put in promiscuous mode while unicast addresses
4640  *      are present.
4641  */
4642 void __dev_set_rx_mode(struct net_device *dev)
4643 {
4644         const struct net_device_ops *ops = dev->netdev_ops;
4645
4646         /* dev_open will call this function so the list will stay sane. */
4647         if (!(dev->flags&IFF_UP))
4648                 return;
4649
4650         if (!netif_device_present(dev))
4651                 return;
4652
4653         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4654                 /* Unicast addresses changes may only happen under the rtnl,
4655                  * therefore calling __dev_set_promiscuity here is safe.
4656                  */
4657                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4658                         __dev_set_promiscuity(dev, 1);
4659                         dev->uc_promisc = true;
4660                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4661                         __dev_set_promiscuity(dev, -1);
4662                         dev->uc_promisc = false;
4663                 }
4664         }
4665
4666         if (ops->ndo_set_rx_mode)
4667                 ops->ndo_set_rx_mode(dev);
4668 }
4669
4670 void dev_set_rx_mode(struct net_device *dev)
4671 {
4672         netif_addr_lock_bh(dev);
4673         __dev_set_rx_mode(dev);
4674         netif_addr_unlock_bh(dev);
4675 }
4676
4677 /**
4678  *      dev_get_flags - get flags reported to userspace
4679  *      @dev: device
4680  *
4681  *      Get the combination of flag bits exported through APIs to userspace.
4682  */
4683 unsigned dev_get_flags(const struct net_device *dev)
4684 {
4685         unsigned flags;
4686
4687         flags = (dev->flags & ~(IFF_PROMISC |
4688                                 IFF_ALLMULTI |
4689                                 IFF_RUNNING |
4690                                 IFF_LOWER_UP |
4691                                 IFF_DORMANT)) |
4692                 (dev->gflags & (IFF_PROMISC |
4693                                 IFF_ALLMULTI));
4694
4695         if (netif_running(dev)) {
4696                 if (netif_oper_up(dev))
4697                         flags |= IFF_RUNNING;
4698                 if (netif_carrier_ok(dev))
4699                         flags |= IFF_LOWER_UP;
4700                 if (netif_dormant(dev))
4701                         flags |= IFF_DORMANT;
4702         }
4703
4704         return flags;
4705 }
4706 EXPORT_SYMBOL(dev_get_flags);
4707
4708 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4709 {
4710         int old_flags = dev->flags;
4711         int ret;
4712
4713         ASSERT_RTNL();
4714
4715         /*
4716          *      Set the flags on our device.
4717          */
4718
4719         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4720                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4721                                IFF_AUTOMEDIA)) |
4722                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4723                                     IFF_ALLMULTI));
4724
4725         /*
4726          *      Load in the correct multicast list now the flags have changed.
4727          */
4728
4729         if ((old_flags ^ flags) & IFF_MULTICAST)
4730                 dev_change_rx_flags(dev, IFF_MULTICAST);
4731
4732         dev_set_rx_mode(dev);
4733
4734         /*
4735          *      Have we downed the interface. We handle IFF_UP ourselves
4736          *      according to user attempts to set it, rather than blindly
4737          *      setting it.
4738          */
4739
4740         ret = 0;
4741         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4742                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4743
4744                 if (!ret)
4745                         dev_set_rx_mode(dev);
4746         }
4747
4748         if ((flags ^ dev->gflags) & IFF_PROMISC) {
4749                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4750
4751                 dev->gflags ^= IFF_PROMISC;
4752                 dev_set_promiscuity(dev, inc);
4753         }
4754
4755         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4756            is important. Some (broken) drivers set IFF_PROMISC, when
4757            IFF_ALLMULTI is requested not asking us and not reporting.
4758          */
4759         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4760                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4761
4762                 dev->gflags ^= IFF_ALLMULTI;
4763                 dev_set_allmulti(dev, inc);
4764         }
4765
4766         return ret;
4767 }
4768
4769 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4770 {
4771         unsigned int changes = dev->flags ^ old_flags;
4772
4773         if (changes & IFF_UP) {
4774                 if (dev->flags & IFF_UP)
4775                         call_netdevice_notifiers(NETDEV_UP, dev);
4776                 else
4777                         call_netdevice_notifiers(NETDEV_DOWN, dev);
4778         }
4779
4780         if (dev->flags & IFF_UP &&
4781             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4782                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4783 }
4784
4785 /**
4786  *      dev_change_flags - change device settings
4787  *      @dev: device
4788  *      @flags: device state flags
4789  *
4790  *      Change settings on device based state flags. The flags are
4791  *      in the userspace exported format.
4792  */
4793 int dev_change_flags(struct net_device *dev, unsigned flags)
4794 {
4795         int ret, changes;
4796         int old_flags = dev->flags;
4797
4798         ret = __dev_change_flags(dev, flags);
4799         if (ret < 0)
4800                 return ret;
4801
4802         changes = old_flags ^ dev->flags;
4803         if (changes)
4804                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4805
4806         __dev_notify_flags(dev, old_flags);
4807         return ret;
4808 }
4809 EXPORT_SYMBOL(dev_change_flags);
4810
4811 /**
4812  *      dev_set_mtu - Change maximum transfer unit
4813  *      @dev: device
4814  *      @new_mtu: new transfer unit
4815  *
4816  *      Change the maximum transfer size of the network device.
4817  */
4818 int dev_set_mtu(struct net_device *dev, int new_mtu)
4819 {
4820         const struct net_device_ops *ops = dev->netdev_ops;
4821         int err;
4822
4823         if (new_mtu == dev->mtu)
4824                 return 0;
4825
4826         /*      MTU must be positive.    */
4827         if (new_mtu < 0)
4828                 return -EINVAL;
4829
4830         if (!netif_device_present(dev))
4831                 return -ENODEV;
4832
4833         err = 0;
4834         if (ops->ndo_change_mtu)
4835                 err = ops->ndo_change_mtu(dev, new_mtu);
4836         else
4837                 dev->mtu = new_mtu;
4838
4839         if (!err && dev->flags & IFF_UP)
4840                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4841         return err;
4842 }
4843 EXPORT_SYMBOL(dev_set_mtu);
4844
4845 /**
4846  *      dev_set_group - Change group this device belongs to
4847  *      @dev: device
4848  *      @new_group: group this device should belong to
4849  */
4850 void dev_set_group(struct net_device *dev, int new_group)
4851 {
4852         dev->group = new_group;
4853 }
4854 EXPORT_SYMBOL(dev_set_group);
4855
4856 /**
4857  *      dev_set_mac_address - Change Media Access Control Address
4858  *      @dev: device
4859  *      @sa: new address
4860  *
4861  *      Change the hardware (MAC) address of the device
4862  */
4863 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4864 {
4865         const struct net_device_ops *ops = dev->netdev_ops;
4866         int err;
4867
4868         if (!ops->ndo_set_mac_address)
4869                 return -EOPNOTSUPP;
4870         if (sa->sa_family != dev->type)
4871                 return -EINVAL;
4872         if (!netif_device_present(dev))
4873                 return -ENODEV;
4874         err = ops->ndo_set_mac_address(dev, sa);
4875         if (!err)
4876                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4877         return err;
4878 }
4879 EXPORT_SYMBOL(dev_set_mac_address);
4880
4881 /*
4882  *      Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4883  */
4884 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4885 {
4886         int err;
4887         struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4888
4889         if (!dev)
4890                 return -ENODEV;
4891
4892         switch (cmd) {
4893         case SIOCGIFFLAGS:      /* Get interface flags */
4894                 ifr->ifr_flags = (short) dev_get_flags(dev);
4895                 return 0;
4896
4897         case SIOCGIFMETRIC:     /* Get the metric on the interface
4898                                    (currently unused) */
4899                 ifr->ifr_metric = 0;
4900                 return 0;
4901
4902         case SIOCGIFMTU:        /* Get the MTU of a device */
4903                 ifr->ifr_mtu = dev->mtu;
4904                 return 0;
4905
4906         case SIOCGIFHWADDR:
4907                 if (!dev->addr_len)
4908                         memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4909                 else
4910                         memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4911                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4912                 ifr->ifr_hwaddr.sa_family = dev->type;
4913                 return 0;
4914
4915         case SIOCGIFSLAVE:
4916                 err = -EINVAL;
4917                 break;
4918
4919         case SIOCGIFMAP:
4920                 ifr->ifr_map.mem_start = dev->mem_start;
4921                 ifr->ifr_map.mem_end   = dev->mem_end;
4922                 ifr->ifr_map.base_addr = dev->base_addr;
4923                 ifr->ifr_map.irq       = dev->irq;
4924                 ifr->ifr_map.dma       = dev->dma;
4925                 ifr->ifr_map.port      = dev->if_port;
4926                 return 0;
4927
4928         case SIOCGIFINDEX:
4929                 ifr->ifr_ifindex = dev->ifindex;
4930                 return 0;
4931
4932         case SIOCGIFTXQLEN:
4933                 ifr->ifr_qlen = dev->tx_queue_len;
4934                 return 0;
4935
4936         default:
4937                 /* dev_ioctl() should ensure this case
4938                  * is never reached
4939                  */
4940                 WARN_ON(1);
4941                 err = -ENOTTY;
4942                 break;
4943
4944         }
4945         return err;
4946 }
4947
4948 /*
4949  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
4950  */
4951 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4952 {
4953         int err;
4954         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4955         const struct net_device_ops *ops;
4956
4957         if (!dev)
4958                 return -ENODEV;
4959
4960         ops = dev->netdev_ops;
4961
4962         switch (cmd) {
4963         case SIOCSIFFLAGS:      /* Set interface flags */
4964                 return dev_change_flags(dev, ifr->ifr_flags);
4965
4966         case SIOCSIFMETRIC:     /* Set the metric on the interface
4967                                    (currently unused) */
4968                 return -EOPNOTSUPP;
4969
4970         case SIOCSIFMTU:        /* Set the MTU of a device */
4971                 return dev_set_mtu(dev, ifr->ifr_mtu);
4972
4973         case SIOCSIFHWADDR:
4974                 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4975
4976         case SIOCSIFHWBROADCAST:
4977                 if (ifr->ifr_hwaddr.sa_family != dev->type)
4978                         return -EINVAL;
4979                 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4980                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4981                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4982                 return 0;
4983
4984         case SIOCSIFMAP:
4985                 if (ops->ndo_set_config) {
4986                         if (!netif_device_present(dev))
4987                                 return -ENODEV;
4988                         return ops->ndo_set_config(dev, &ifr->ifr_map);
4989                 }
4990                 return -EOPNOTSUPP;
4991
4992         case SIOCADDMULTI:
4993                 if (!ops->ndo_set_rx_mode ||
4994                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4995                         return -EINVAL;
4996                 if (!netif_device_present(dev))
4997                         return -ENODEV;
4998                 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4999
5000         case SIOCDELMULTI:
5001                 if (!ops->ndo_set_rx_mode ||
5002                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5003                         return -EINVAL;
5004                 if (!netif_device_present(dev))
5005                         return -ENODEV;
5006                 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
5007
5008         case SIOCSIFTXQLEN:
5009                 if (ifr->ifr_qlen < 0)
5010                         return -EINVAL;
5011                 dev->tx_queue_len = ifr->ifr_qlen;
5012                 return 0;
5013
5014         case SIOCSIFNAME:
5015                 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
5016                 return dev_change_name(dev, ifr->ifr_newname);
5017
5018         case SIOCSHWTSTAMP:
5019                 err = net_hwtstamp_validate(ifr);
5020                 if (err)
5021                         return err;
5022                 /* fall through */
5023
5024         /*
5025          *      Unknown or private ioctl
5026          */
5027         default:
5028                 if ((cmd >= SIOCDEVPRIVATE &&
5029                     cmd <= SIOCDEVPRIVATE + 15) ||
5030                     cmd == SIOCBONDENSLAVE ||
5031                     cmd == SIOCBONDRELEASE ||
5032                     cmd == SIOCBONDSETHWADDR ||
5033                     cmd == SIOCBONDSLAVEINFOQUERY ||
5034                     cmd == SIOCBONDINFOQUERY ||
5035                     cmd == SIOCBONDCHANGEACTIVE ||
5036                     cmd == SIOCGMIIPHY ||
5037                     cmd == SIOCGMIIREG ||
5038                     cmd == SIOCSMIIREG ||
5039                     cmd == SIOCBRADDIF ||
5040                     cmd == SIOCBRDELIF ||
5041                     cmd == SIOCSHWTSTAMP ||
5042                     cmd == SIOCWANDEV) {
5043                         err = -EOPNOTSUPP;
5044                         if (ops->ndo_do_ioctl) {
5045                                 if (netif_device_present(dev))
5046                                         err = ops->ndo_do_ioctl(dev, ifr, cmd);
5047                                 else
5048                                         err = -ENODEV;
5049                         }
5050                 } else
5051                         err = -EINVAL;
5052
5053         }
5054         return err;
5055 }
5056
5057 /*
5058  *      This function handles all "interface"-type I/O control requests. The actual
5059  *      'doing' part of this is dev_ifsioc above.
5060  */
5061
5062 /**
5063  *      dev_ioctl       -       network device ioctl
5064  *      @net: the applicable net namespace
5065  *      @cmd: command to issue
5066  *      @arg: pointer to a struct ifreq in user space
5067  *
5068  *      Issue ioctl functions to devices. This is normally called by the
5069  *      user space syscall interfaces but can sometimes be useful for
5070  *      other purposes. The return value is the return from the syscall if
5071  *      positive or a negative errno code on error.
5072  */
5073
5074 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
5075 {
5076         struct ifreq ifr;
5077         int ret;
5078         char *colon;
5079
5080         /* One special case: SIOCGIFCONF takes ifconf argument
5081            and requires shared lock, because it sleeps writing
5082            to user space.
5083          */
5084
5085         if (cmd == SIOCGIFCONF) {
5086                 rtnl_lock();
5087                 ret = dev_ifconf(net, (char __user *) arg);
5088                 rtnl_unlock();
5089                 return ret;
5090         }
5091         if (cmd == SIOCGIFNAME)
5092                 return dev_ifname(net, (struct ifreq __user *)arg);
5093
5094         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
5095                 return -EFAULT;
5096
5097         ifr.ifr_name[IFNAMSIZ-1] = 0;
5098
5099         colon = strchr(ifr.ifr_name, ':');
5100         if (colon)
5101                 *colon = 0;
5102
5103         /*
5104          *      See which interface the caller is talking about.
5105          */
5106
5107         switch (cmd) {
5108         /*
5109          *      These ioctl calls:
5110          *      - can be done by all.
5111          *      - atomic and do not require locking.
5112          *      - return a value
5113          */
5114         case SIOCGIFFLAGS:
5115         case SIOCGIFMETRIC:
5116         case SIOCGIFMTU:
5117         case SIOCGIFHWADDR:
5118         case SIOCGIFSLAVE:
5119         case SIOCGIFMAP:
5120         case SIOCGIFINDEX:
5121         case SIOCGIFTXQLEN:
5122                 dev_load(net, ifr.ifr_name);
5123                 rcu_read_lock();
5124                 ret = dev_ifsioc_locked(net, &ifr, cmd);
5125                 rcu_read_unlock();
5126                 if (!ret) {
5127                         if (colon)
5128                                 *colon = ':';
5129                         if (copy_to_user(arg, &ifr,
5130                                          sizeof(struct ifreq)))
5131                                 ret = -EFAULT;
5132                 }
5133                 return ret;
5134
5135         case SIOCETHTOOL:
5136                 dev_load(net, ifr.ifr_name);
5137                 rtnl_lock();
5138                 ret = dev_ethtool(net, &ifr);
5139                 rtnl_unlock();
5140                 if (!ret) {
5141                         if (colon)
5142                                 *colon = ':';
5143                         if (copy_to_user(arg, &ifr,
5144                                          sizeof(struct ifreq)))
5145                                 ret = -EFAULT;
5146                 }
5147                 return ret;
5148
5149         /*
5150          *      These ioctl calls:
5151          *      - require superuser power.
5152          *      - require strict serialization.
5153          *      - return a value
5154          */
5155         case SIOCGMIIPHY:
5156         case SIOCGMIIREG:
5157         case SIOCSIFNAME:
5158                 if (!capable(CAP_NET_ADMIN))
5159                         return -EPERM;
5160                 dev_load(net, ifr.ifr_name);
5161                 rtnl_lock();
5162                 ret = dev_ifsioc(net, &ifr, cmd);
5163                 rtnl_unlock();
5164                 if (!ret) {
5165                         if (colon)
5166                                 *colon = ':';
5167                         if (copy_to_user(arg, &ifr,
5168                                          sizeof(struct ifreq)))
5169                                 ret = -EFAULT;
5170                 }
5171                 return ret;
5172
5173         /*
5174          *      These ioctl calls:
5175          *      - require superuser power.
5176          *      - require strict serialization.
5177          *      - do not return a value
5178          */
5179         case SIOCSIFFLAGS:
5180         case SIOCSIFMETRIC:
5181         case SIOCSIFMTU:
5182         case SIOCSIFMAP:
5183         case SIOCSIFHWADDR:
5184         case SIOCSIFSLAVE:
5185         case SIOCADDMULTI:
5186         case SIOCDELMULTI:
5187         case SIOCSIFHWBROADCAST:
5188         case SIOCSIFTXQLEN:
5189         case SIOCSMIIREG:
5190         case SIOCBONDENSLAVE:
5191         case SIOCBONDRELEASE:
5192         case SIOCBONDSETHWADDR:
5193         case SIOCBONDCHANGEACTIVE:
5194         case SIOCBRADDIF:
5195         case SIOCBRDELIF:
5196         case SIOCSHWTSTAMP:
5197                 if (!capable(CAP_NET_ADMIN))
5198                         return -EPERM;
5199                 /* fall through */
5200         case SIOCBONDSLAVEINFOQUERY:
5201         case SIOCBONDINFOQUERY:
5202                 dev_load(net, ifr.ifr_name);
5203                 rtnl_lock();
5204                 ret = dev_ifsioc(net, &ifr, cmd);
5205                 rtnl_unlock();
5206                 return ret;
5207
5208         case SIOCGIFMEM:
5209                 /* Get the per device memory space. We can add this but
5210                  * currently do not support it */
5211         case SIOCSIFMEM:
5212                 /* Set the per device memory buffer space.
5213                  * Not applicable in our case */
5214         case SIOCSIFLINK:
5215                 return -ENOTTY;
5216
5217         /*
5218          *      Unknown or private ioctl.
5219          */
5220         default:
5221                 if (cmd == SIOCWANDEV ||
5222                     (cmd >= SIOCDEVPRIVATE &&
5223                      cmd <= SIOCDEVPRIVATE + 15)) {
5224                         dev_load(net, ifr.ifr_name);
5225                         rtnl_lock();
5226                         ret = dev_ifsioc(net, &ifr, cmd);
5227                         rtnl_unlock();
5228                         if (!ret && copy_to_user(arg, &ifr,
5229                                                  sizeof(struct ifreq)))
5230                                 ret = -EFAULT;
5231                         return ret;
5232                 }
5233                 /* Take care of Wireless Extensions */
5234                 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5235                         return wext_handle_ioctl(net, &ifr, cmd, arg);
5236                 return -ENOTTY;
5237         }
5238 }
5239
5240
5241 /**
5242  *      dev_new_index   -       allocate an ifindex
5243  *      @net: the applicable net namespace
5244  *
5245  *      Returns a suitable unique value for a new device interface
5246  *      number.  The caller must hold the rtnl semaphore or the
5247  *      dev_base_lock to be sure it remains unique.
5248  */
5249 static int dev_new_index(struct net *net)
5250 {
5251         static int ifindex;
5252         for (;;) {
5253                 if (++ifindex <= 0)
5254                         ifindex = 1;
5255                 if (!__dev_get_by_index(net, ifindex))
5256                         return ifindex;
5257         }
5258 }
5259
5260 /* Delayed registration/unregisteration */
5261 static LIST_HEAD(net_todo_list);
5262
5263 static void net_set_todo(struct net_device *dev)
5264 {
5265         list_add_tail(&dev->todo_list, &net_todo_list);
5266 }
5267
5268 static void rollback_registered_many(struct list_head *head)
5269 {
5270         struct net_device *dev, *tmp;
5271
5272         BUG_ON(dev_boot_phase);
5273         ASSERT_RTNL();
5274
5275         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5276                 /* Some devices call without registering
5277                  * for initialization unwind. Remove those
5278                  * devices and proceed with the remaining.
5279                  */
5280                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5281                         pr_debug("unregister_netdevice: device %s/%p never "
5282                                  "was registered\n", dev->name, dev);
5283
5284                         WARN_ON(1);
5285                         list_del(&dev->unreg_list);
5286                         continue;
5287                 }
5288                 dev->dismantle = true;
5289                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5290         }
5291
5292         /* If device is running, close it first. */
5293         dev_close_many(head);
5294
5295         list_for_each_entry(dev, head, unreg_list) {
5296                 /* And unlink it from device chain. */
5297                 unlist_netdevice(dev);
5298
5299                 dev->reg_state = NETREG_UNREGISTERING;
5300         }
5301
5302         synchronize_net();
5303
5304         list_for_each_entry(dev, head, unreg_list) {
5305                 /* Shutdown queueing discipline. */
5306                 dev_shutdown(dev);
5307
5308
5309                 /* Notify protocols, that we are about to destroy
5310                    this device. They should clean all the things.
5311                 */
5312                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5313
5314                 if (!dev->rtnl_link_ops ||
5315                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5316                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5317
5318                 /*
5319                  *      Flush the unicast and multicast chains
5320                  */
5321                 dev_uc_flush(dev);
5322                 dev_mc_flush(dev);
5323
5324                 if (dev->netdev_ops->ndo_uninit)
5325                         dev->netdev_ops->ndo_uninit(dev);
5326
5327                 /* Notifier chain MUST detach us from master device. */
5328                 WARN_ON(dev->master);
5329
5330                 /* Remove entries from kobject tree */
5331                 netdev_unregister_kobject(dev);
5332         }
5333
5334         /* Process any work delayed until the end of the batch */
5335         dev = list_first_entry(head, struct net_device, unreg_list);
5336         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5337
5338         synchronize_net();
5339
5340         list_for_each_entry(dev, head, unreg_list)
5341                 dev_put(dev);
5342 }
5343
5344 static void rollback_registered(struct net_device *dev)
5345 {
5346         LIST_HEAD(single);
5347
5348         list_add(&dev->unreg_list, &single);
5349         rollback_registered_many(&single);
5350         list_del(&single);
5351 }
5352
5353 static u32 netdev_fix_features(struct net_device *dev, u32 features)
5354 {
5355         /* Fix illegal checksum combinations */
5356         if ((features & NETIF_F_HW_CSUM) &&
5357             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5358                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5359                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5360         }
5361
5362         if ((features & NETIF_F_NO_CSUM) &&
5363             (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5364                 netdev_warn(dev, "mixed no checksumming and other settings.\n");
5365                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5366         }
5367
5368         /* Fix illegal SG+CSUM combinations. */
5369         if ((features & NETIF_F_SG) &&
5370             !(features & NETIF_F_ALL_CSUM)) {
5371                 netdev_dbg(dev,
5372                         "Dropping NETIF_F_SG since no checksum feature.\n");
5373                 features &= ~NETIF_F_SG;
5374         }
5375
5376         /* TSO requires that SG is present as well. */
5377         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5378                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5379                 features &= ~NETIF_F_ALL_TSO;
5380         }
5381
5382         /* TSO ECN requires that TSO is present as well. */
5383         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5384                 features &= ~NETIF_F_TSO_ECN;
5385
5386         /* Software GSO depends on SG. */
5387         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5388                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5389                 features &= ~NETIF_F_GSO;
5390         }
5391
5392         /* UFO needs SG and checksumming */
5393         if (features & NETIF_F_UFO) {
5394                 /* maybe split UFO into V4 and V6? */
5395                 if (!((features & NETIF_F_GEN_CSUM) ||
5396                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5397                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5398                         netdev_dbg(dev,
5399                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5400                         features &= ~NETIF_F_UFO;
5401                 }
5402
5403                 if (!(features & NETIF_F_SG)) {
5404                         netdev_dbg(dev,
5405                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5406                         features &= ~NETIF_F_UFO;
5407                 }
5408         }
5409
5410         return features;
5411 }
5412
5413 int __netdev_update_features(struct net_device *dev)
5414 {
5415         u32 features;
5416         int err = 0;
5417
5418         ASSERT_RTNL();
5419
5420         features = netdev_get_wanted_features(dev);
5421
5422         if (dev->netdev_ops->ndo_fix_features)
5423                 features = dev->netdev_ops->ndo_fix_features(dev, features);
5424
5425         /* driver might be less strict about feature dependencies */
5426         features = netdev_fix_features(dev, features);
5427
5428         if (dev->features == features)
5429                 return 0;
5430
5431         netdev_dbg(dev, "Features changed: 0x%08x -> 0x%08x\n",
5432                 dev->features, features);
5433
5434         if (dev->netdev_ops->ndo_set_features)
5435                 err = dev->netdev_ops->ndo_set_features(dev, features);
5436
5437         if (unlikely(err < 0)) {
5438                 netdev_err(dev,
5439                         "set_features() failed (%d); wanted 0x%08x, left 0x%08x\n",
5440                         err, features, dev->features);
5441                 return -1;
5442         }
5443
5444         if (!err)
5445                 dev->features = features;
5446
5447         return 1;
5448 }
5449
5450 /**
5451  *      netdev_update_features - recalculate device features
5452  *      @dev: the device to check
5453  *
5454  *      Recalculate dev->features set and send notifications if it
5455  *      has changed. Should be called after driver or hardware dependent
5456  *      conditions might have changed that influence the features.
5457  */
5458 void netdev_update_features(struct net_device *dev)
5459 {
5460         if (__netdev_update_features(dev))
5461                 netdev_features_change(dev);
5462 }
5463 EXPORT_SYMBOL(netdev_update_features);
5464
5465 /**
5466  *      netdev_change_features - recalculate device features
5467  *      @dev: the device to check
5468  *
5469  *      Recalculate dev->features set and send notifications even
5470  *      if they have not changed. Should be called instead of
5471  *      netdev_update_features() if also dev->vlan_features might
5472  *      have changed to allow the changes to be propagated to stacked
5473  *      VLAN devices.
5474  */
5475 void netdev_change_features(struct net_device *dev)
5476 {
5477         __netdev_update_features(dev);
5478         netdev_features_change(dev);
5479 }
5480 EXPORT_SYMBOL(netdev_change_features);
5481
5482 /**
5483  *      netif_stacked_transfer_operstate -      transfer operstate
5484  *      @rootdev: the root or lower level device to transfer state from
5485  *      @dev: the device to transfer operstate to
5486  *
5487  *      Transfer operational state from root to device. This is normally
5488  *      called when a stacking relationship exists between the root
5489  *      device and the device(a leaf device).
5490  */
5491 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5492                                         struct net_device *dev)
5493 {
5494         if (rootdev->operstate == IF_OPER_DORMANT)
5495                 netif_dormant_on(dev);
5496         else
5497                 netif_dormant_off(dev);
5498
5499         if (netif_carrier_ok(rootdev)) {
5500                 if (!netif_carrier_ok(dev))
5501                         netif_carrier_on(dev);
5502         } else {
5503                 if (netif_carrier_ok(dev))
5504                         netif_carrier_off(dev);
5505         }
5506 }
5507 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5508
5509 #ifdef CONFIG_RPS
5510 static int netif_alloc_rx_queues(struct net_device *dev)
5511 {
5512         unsigned int i, count = dev->num_rx_queues;
5513         struct netdev_rx_queue *rx;
5514
5515         BUG_ON(count < 1);
5516
5517         rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5518         if (!rx) {
5519                 pr_err("netdev: Unable to allocate %u rx queues.\n", count);
5520                 return -ENOMEM;
5521         }
5522         dev->_rx = rx;
5523
5524         for (i = 0; i < count; i++)
5525                 rx[i].dev = dev;
5526         return 0;
5527 }
5528 #endif
5529
5530 static void netdev_init_one_queue(struct net_device *dev,
5531                                   struct netdev_queue *queue, void *_unused)
5532 {
5533         /* Initialize queue lock */
5534         spin_lock_init(&queue->_xmit_lock);
5535         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5536         queue->xmit_lock_owner = -1;
5537         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5538         queue->dev = dev;
5539 }
5540
5541 static int netif_alloc_netdev_queues(struct net_device *dev)
5542 {
5543         unsigned int count = dev->num_tx_queues;
5544         struct netdev_queue *tx;
5545
5546         BUG_ON(count < 1);
5547
5548         tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5549         if (!tx) {
5550                 pr_err("netdev: Unable to allocate %u tx queues.\n",
5551                        count);
5552                 return -ENOMEM;
5553         }
5554         dev->_tx = tx;
5555
5556         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5557         spin_lock_init(&dev->tx_global_lock);
5558
5559         return 0;
5560 }
5561
5562 /**
5563  *      register_netdevice      - register a network device
5564  *      @dev: device to register
5565  *
5566  *      Take a completed network device structure and add it to the kernel
5567  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5568  *      chain. 0 is returned on success. A negative errno code is returned
5569  *      on a failure to set up the device, or if the name is a duplicate.
5570  *
5571  *      Callers must hold the rtnl semaphore. You may want
5572  *      register_netdev() instead of this.
5573  *
5574  *      BUGS:
5575  *      The locking appears insufficient to guarantee two parallel registers
5576  *      will not get the same name.
5577  */
5578
5579 int register_netdevice(struct net_device *dev)
5580 {
5581         int ret;
5582         struct net *net = dev_net(dev);
5583
5584         BUG_ON(dev_boot_phase);
5585         ASSERT_RTNL();
5586
5587         might_sleep();
5588
5589         /* When net_device's are persistent, this will be fatal. */
5590         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5591         BUG_ON(!net);
5592
5593         spin_lock_init(&dev->addr_list_lock);
5594         netdev_set_addr_lockdep_class(dev);
5595
5596         dev->iflink = -1;
5597
5598         ret = dev_get_valid_name(dev, dev->name);
5599         if (ret < 0)
5600                 goto out;
5601
5602         /* Init, if this function is available */
5603         if (dev->netdev_ops->ndo_init) {
5604                 ret = dev->netdev_ops->ndo_init(dev);
5605                 if (ret) {
5606                         if (ret > 0)
5607                                 ret = -EIO;
5608                         goto out;
5609                 }
5610         }
5611
5612         dev->ifindex = dev_new_index(net);
5613         if (dev->iflink == -1)
5614                 dev->iflink = dev->ifindex;
5615
5616         /* Transfer changeable features to wanted_features and enable
5617          * software offloads (GSO and GRO).
5618          */
5619         dev->hw_features |= NETIF_F_SOFT_FEATURES;
5620         dev->features |= NETIF_F_SOFT_FEATURES;
5621         dev->wanted_features = dev->features & dev->hw_features;
5622
5623         /* Turn on no cache copy if HW is doing checksum */
5624         dev->hw_features |= NETIF_F_NOCACHE_COPY;
5625         if ((dev->features & NETIF_F_ALL_CSUM) &&
5626             !(dev->features & NETIF_F_NO_CSUM)) {
5627                 dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5628                 dev->features |= NETIF_F_NOCACHE_COPY;
5629         }
5630
5631         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5632          */
5633         dev->vlan_features |= NETIF_F_HIGHDMA;
5634
5635         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5636         ret = notifier_to_errno(ret);
5637         if (ret)
5638                 goto err_uninit;
5639
5640         ret = netdev_register_kobject(dev);
5641         if (ret)
5642                 goto err_uninit;
5643         dev->reg_state = NETREG_REGISTERED;
5644
5645         __netdev_update_features(dev);
5646
5647         /*
5648          *      Default initial state at registry is that the
5649          *      device is present.
5650          */
5651
5652         set_bit(__LINK_STATE_PRESENT, &dev->state);
5653
5654         dev_init_scheduler(dev);
5655         dev_hold(dev);
5656         list_netdevice(dev);
5657
5658         /* Notify protocols, that a new device appeared. */
5659         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5660         ret = notifier_to_errno(ret);
5661         if (ret) {
5662                 rollback_registered(dev);
5663                 dev->reg_state = NETREG_UNREGISTERED;
5664         }
5665         /*
5666          *      Prevent userspace races by waiting until the network
5667          *      device is fully setup before sending notifications.
5668          */
5669         if (!dev->rtnl_link_ops ||
5670             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5671                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5672
5673 out:
5674         return ret;
5675
5676 err_uninit:
5677         if (dev->netdev_ops->ndo_uninit)
5678                 dev->netdev_ops->ndo_uninit(dev);
5679         goto out;
5680 }
5681 EXPORT_SYMBOL(register_netdevice);
5682
5683 /**
5684  *      init_dummy_netdev       - init a dummy network device for NAPI
5685  *      @dev: device to init
5686  *
5687  *      This takes a network device structure and initialize the minimum
5688  *      amount of fields so it can be used to schedule NAPI polls without
5689  *      registering a full blown interface. This is to be used by drivers
5690  *      that need to tie several hardware interfaces to a single NAPI
5691  *      poll scheduler due to HW limitations.
5692  */
5693 int init_dummy_netdev(struct net_device *dev)
5694 {
5695         /* Clear everything. Note we don't initialize spinlocks
5696          * are they aren't supposed to be taken by any of the
5697          * NAPI code and this dummy netdev is supposed to be
5698          * only ever used for NAPI polls
5699          */
5700         memset(dev, 0, sizeof(struct net_device));
5701
5702         /* make sure we BUG if trying to hit standard
5703          * register/unregister code path
5704          */
5705         dev->reg_state = NETREG_DUMMY;
5706
5707         /* NAPI wants this */
5708         INIT_LIST_HEAD(&dev->napi_list);
5709
5710         /* a dummy interface is started by default */
5711         set_bit(__LINK_STATE_PRESENT, &dev->state);
5712         set_bit(__LINK_STATE_START, &dev->state);
5713
5714         /* Note : We dont allocate pcpu_refcnt for dummy devices,
5715          * because users of this 'device' dont need to change
5716          * its refcount.
5717          */
5718
5719         return 0;
5720 }
5721 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5722
5723
5724 /**
5725  *      register_netdev - register a network device
5726  *      @dev: device to register
5727  *
5728  *      Take a completed network device structure and add it to the kernel
5729  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5730  *      chain. 0 is returned on success. A negative errno code is returned
5731  *      on a failure to set up the device, or if the name is a duplicate.
5732  *
5733  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
5734  *      and expands the device name if you passed a format string to
5735  *      alloc_netdev.
5736  */
5737 int register_netdev(struct net_device *dev)
5738 {
5739         int err;
5740
5741         rtnl_lock();
5742         err = register_netdevice(dev);
5743         rtnl_unlock();
5744         return err;
5745 }
5746 EXPORT_SYMBOL(register_netdev);
5747
5748 int netdev_refcnt_read(const struct net_device *dev)
5749 {
5750         int i, refcnt = 0;
5751
5752         for_each_possible_cpu(i)
5753                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5754         return refcnt;
5755 }
5756 EXPORT_SYMBOL(netdev_refcnt_read);
5757
5758 /*
5759  * netdev_wait_allrefs - wait until all references are gone.
5760  *
5761  * This is called when unregistering network devices.
5762  *
5763  * Any protocol or device that holds a reference should register
5764  * for netdevice notification, and cleanup and put back the
5765  * reference if they receive an UNREGISTER event.
5766  * We can get stuck here if buggy protocols don't correctly
5767  * call dev_put.
5768  */
5769 static void netdev_wait_allrefs(struct net_device *dev)
5770 {
5771         unsigned long rebroadcast_time, warning_time;
5772         int refcnt;
5773
5774         linkwatch_forget_dev(dev);
5775
5776         rebroadcast_time = warning_time = jiffies;
5777         refcnt = netdev_refcnt_read(dev);
5778
5779         while (refcnt != 0) {
5780                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5781                         rtnl_lock();
5782
5783                         /* Rebroadcast unregister notification */
5784                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5785                         /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5786                          * should have already handle it the first time */
5787
5788                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5789                                      &dev->state)) {
5790                                 /* We must not have linkwatch events
5791                                  * pending on unregister. If this
5792                                  * happens, we simply run the queue
5793                                  * unscheduled, resulting in a noop
5794                                  * for this device.
5795                                  */
5796                                 linkwatch_run_queue();
5797                         }
5798
5799                         __rtnl_unlock();
5800
5801                         rebroadcast_time = jiffies;
5802                 }
5803
5804                 msleep(250);
5805
5806                 refcnt = netdev_refcnt_read(dev);
5807
5808                 if (time_after(jiffies, warning_time + 10 * HZ)) {
5809                         printk(KERN_EMERG "unregister_netdevice: "
5810                                "waiting for %s to become free. Usage "
5811                                "count = %d\n",
5812                                dev->name, refcnt);
5813                         warning_time = jiffies;
5814                 }
5815         }
5816 }
5817
5818 /* The sequence is:
5819  *
5820  *      rtnl_lock();
5821  *      ...
5822  *      register_netdevice(x1);
5823  *      register_netdevice(x2);
5824  *      ...
5825  *      unregister_netdevice(y1);
5826  *      unregister_netdevice(y2);
5827  *      ...
5828  *      rtnl_unlock();
5829  *      free_netdev(y1);
5830  *      free_netdev(y2);
5831  *
5832  * We are invoked by rtnl_unlock().
5833  * This allows us to deal with problems:
5834  * 1) We can delete sysfs objects which invoke hotplug
5835  *    without deadlocking with linkwatch via keventd.
5836  * 2) Since we run with the RTNL semaphore not held, we can sleep
5837  *    safely in order to wait for the netdev refcnt to drop to zero.
5838  *
5839  * We must not return until all unregister events added during
5840  * the interval the lock was held have been completed.
5841  */
5842 void netdev_run_todo(void)
5843 {
5844         struct list_head list;
5845
5846         /* Snapshot list, allow later requests */
5847         list_replace_init(&net_todo_list, &list);
5848
5849         __rtnl_unlock();
5850
5851         /* Wait for rcu callbacks to finish before attempting to drain
5852          * the device list.  This usually avoids a 250ms wait.
5853          */
5854         if (!list_empty(&list))
5855                 rcu_barrier();
5856
5857         while (!list_empty(&list)) {
5858                 struct net_device *dev
5859                         = list_first_entry(&list, struct net_device, todo_list);
5860                 list_del(&dev->todo_list);
5861
5862                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5863                         printk(KERN_ERR "network todo '%s' but state %d\n",
5864                                dev->name, dev->reg_state);
5865                         dump_stack();
5866                         continue;
5867                 }
5868
5869                 dev->reg_state = NETREG_UNREGISTERED;
5870
5871                 on_each_cpu(flush_backlog, dev, 1);
5872
5873                 netdev_wait_allrefs(dev);
5874
5875                 /* paranoia */
5876                 BUG_ON(netdev_refcnt_read(dev));
5877                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
5878                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
5879                 WARN_ON(dev->dn_ptr);
5880
5881                 if (dev->destructor)
5882                         dev->destructor(dev);
5883
5884                 /* Free network device */
5885                 kobject_put(&dev->dev.kobj);
5886         }
5887 }
5888
5889 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5890  * fields in the same order, with only the type differing.
5891  */
5892 static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5893                                     const struct net_device_stats *netdev_stats)
5894 {
5895 #if BITS_PER_LONG == 64
5896         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5897         memcpy(stats64, netdev_stats, sizeof(*stats64));
5898 #else
5899         size_t i, n = sizeof(*stats64) / sizeof(u64);
5900         const unsigned long *src = (const unsigned long *)netdev_stats;
5901         u64 *dst = (u64 *)stats64;
5902
5903         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5904                      sizeof(*stats64) / sizeof(u64));
5905         for (i = 0; i < n; i++)
5906                 dst[i] = src[i];
5907 #endif
5908 }
5909
5910 /**
5911  *      dev_get_stats   - get network device statistics
5912  *      @dev: device to get statistics from
5913  *      @storage: place to store stats
5914  *
5915  *      Get network statistics from device. Return @storage.
5916  *      The device driver may provide its own method by setting
5917  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5918  *      otherwise the internal statistics structure is used.
5919  */
5920 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5921                                         struct rtnl_link_stats64 *storage)
5922 {
5923         const struct net_device_ops *ops = dev->netdev_ops;
5924
5925         if (ops->ndo_get_stats64) {
5926                 memset(storage, 0, sizeof(*storage));
5927                 ops->ndo_get_stats64(dev, storage);
5928         } else if (ops->ndo_get_stats) {
5929                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5930         } else {
5931                 netdev_stats_to_stats64(storage, &dev->stats);
5932         }
5933         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5934         return storage;
5935 }
5936 EXPORT_SYMBOL(dev_get_stats);
5937
5938 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5939 {
5940         struct netdev_queue *queue = dev_ingress_queue(dev);
5941
5942 #ifdef CONFIG_NET_CLS_ACT
5943         if (queue)
5944                 return queue;
5945         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5946         if (!queue)
5947                 return NULL;
5948         netdev_init_one_queue(dev, queue, NULL);
5949         queue->qdisc = &noop_qdisc;
5950         queue->qdisc_sleeping = &noop_qdisc;
5951         rcu_assign_pointer(dev->ingress_queue, queue);
5952 #endif
5953         return queue;
5954 }
5955
5956 /**
5957  *      alloc_netdev_mqs - allocate network device
5958  *      @sizeof_priv:   size of private data to allocate space for
5959  *      @name:          device name format string
5960  *      @setup:         callback to initialize device
5961  *      @txqs:          the number of TX subqueues to allocate
5962  *      @rxqs:          the number of RX subqueues to allocate
5963  *
5964  *      Allocates a struct net_device with private data area for driver use
5965  *      and performs basic initialization.  Also allocates subquue structs
5966  *      for each queue on the device.
5967  */
5968 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5969                 void (*setup)(struct net_device *),
5970                 unsigned int txqs, unsigned int rxqs)
5971 {
5972         struct net_device *dev;
5973         size_t alloc_size;
5974         struct net_device *p;
5975
5976         BUG_ON(strlen(name) >= sizeof(dev->name));
5977
5978         if (txqs < 1) {
5979                 pr_err("alloc_netdev: Unable to allocate device "
5980                        "with zero queues.\n");
5981                 return NULL;
5982         }
5983
5984 #ifdef CONFIG_RPS
5985         if (rxqs < 1) {
5986                 pr_err("alloc_netdev: Unable to allocate device "
5987                        "with zero RX queues.\n");
5988                 return NULL;
5989         }
5990 #endif
5991
5992         alloc_size = sizeof(struct net_device);
5993         if (sizeof_priv) {
5994                 /* ensure 32-byte alignment of private area */
5995                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5996                 alloc_size += sizeof_priv;
5997         }
5998         /* ensure 32-byte alignment of whole construct */
5999         alloc_size += NETDEV_ALIGN - 1;
6000
6001         p = kzalloc(alloc_size, GFP_KERNEL);
6002         if (!p) {
6003                 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
6004                 return NULL;
6005         }
6006
6007         dev = PTR_ALIGN(p, NETDEV_ALIGN);
6008         dev->padded = (char *)dev - (char *)p;
6009
6010         dev->pcpu_refcnt = alloc_percpu(int);
6011         if (!dev->pcpu_refcnt)
6012                 goto free_p;
6013
6014         if (dev_addr_init(dev))
6015                 goto free_pcpu;
6016
6017         dev_mc_init(dev);
6018         dev_uc_init(dev);
6019
6020         dev_net_set(dev, &init_net);
6021
6022         dev->gso_max_size = GSO_MAX_SIZE;
6023
6024         INIT_LIST_HEAD(&dev->napi_list);
6025         INIT_LIST_HEAD(&dev->unreg_list);
6026         INIT_LIST_HEAD(&dev->link_watch_list);
6027         dev->priv_flags = IFF_XMIT_DST_RELEASE;
6028         setup(dev);
6029
6030         dev->num_tx_queues = txqs;
6031         dev->real_num_tx_queues = txqs;
6032         if (netif_alloc_netdev_queues(dev))
6033                 goto free_all;
6034
6035 #ifdef CONFIG_RPS
6036         dev->num_rx_queues = rxqs;
6037         dev->real_num_rx_queues = rxqs;
6038         if (netif_alloc_rx_queues(dev))
6039                 goto free_all;
6040 #endif
6041
6042         strcpy(dev->name, name);
6043         dev->group = INIT_NETDEV_GROUP;
6044         return dev;
6045
6046 free_all:
6047         free_netdev(dev);
6048         return NULL;
6049
6050 free_pcpu:
6051         free_percpu(dev->pcpu_refcnt);
6052         kfree(dev->_tx);
6053 #ifdef CONFIG_RPS
6054         kfree(dev->_rx);
6055 #endif
6056
6057 free_p:
6058         kfree(p);
6059         return NULL;
6060 }
6061 EXPORT_SYMBOL(alloc_netdev_mqs);
6062
6063 /**
6064  *      free_netdev - free network device
6065  *      @dev: device
6066  *
6067  *      This function does the last stage of destroying an allocated device
6068  *      interface. The reference to the device object is released.
6069  *      If this is the last reference then it will be freed.
6070  */
6071 void free_netdev(struct net_device *dev)
6072 {
6073         struct napi_struct *p, *n;
6074
6075         release_net(dev_net(dev));
6076
6077         kfree(dev->_tx);
6078 #ifdef CONFIG_RPS
6079         kfree(dev->_rx);
6080 #endif
6081
6082         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6083
6084         /* Flush device addresses */
6085         dev_addr_flush(dev);
6086
6087         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6088                 netif_napi_del(p);
6089
6090         free_percpu(dev->pcpu_refcnt);
6091         dev->pcpu_refcnt = NULL;
6092
6093         /*  Compatibility with error handling in drivers */
6094         if (dev->reg_state == NETREG_UNINITIALIZED) {
6095                 kfree((char *)dev - dev->padded);
6096                 return;
6097         }
6098
6099         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6100         dev->reg_state = NETREG_RELEASED;
6101
6102         /* will free via device release */
6103         put_device(&dev->dev);
6104 }
6105 EXPORT_SYMBOL(free_netdev);
6106
6107 /**
6108  *      synchronize_net -  Synchronize with packet receive processing
6109  *
6110  *      Wait for packets currently being received to be done.
6111  *      Does not block later packets from starting.
6112  */
6113 void synchronize_net(void)
6114 {
6115         might_sleep();
6116         if (rtnl_is_locked())
6117                 synchronize_rcu_expedited();
6118         else
6119                 synchronize_rcu();
6120 }
6121 EXPORT_SYMBOL(synchronize_net);
6122
6123 /**
6124  *      unregister_netdevice_queue - remove device from the kernel
6125  *      @dev: device
6126  *      @head: list
6127  *
6128  *      This function shuts down a device interface and removes it
6129  *      from the kernel tables.
6130  *      If head not NULL, device is queued to be unregistered later.
6131  *
6132  *      Callers must hold the rtnl semaphore.  You may want
6133  *      unregister_netdev() instead of this.
6134  */
6135
6136 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6137 {
6138         ASSERT_RTNL();
6139
6140         if (head) {
6141                 list_move_tail(&dev->unreg_list, head);
6142         } else {
6143                 rollback_registered(dev);
6144                 /* Finish processing unregister after unlock */
6145                 net_set_todo(dev);
6146         }
6147 }
6148 EXPORT_SYMBOL(unregister_netdevice_queue);
6149
6150 /**
6151  *      unregister_netdevice_many - unregister many devices
6152  *      @head: list of devices
6153  */
6154 void unregister_netdevice_many(struct list_head *head)
6155 {
6156         struct net_device *dev;
6157
6158         if (!list_empty(head)) {
6159                 rollback_registered_many(head);
6160                 list_for_each_entry(dev, head, unreg_list)
6161                         net_set_todo(dev);
6162         }
6163 }
6164 EXPORT_SYMBOL(unregister_netdevice_many);
6165
6166 /**
6167  *      unregister_netdev - remove device from the kernel
6168  *      @dev: device
6169  *
6170  *      This function shuts down a device interface and removes it
6171  *      from the kernel tables.
6172  *
6173  *      This is just a wrapper for unregister_netdevice that takes
6174  *      the rtnl semaphore.  In general you want to use this and not
6175  *      unregister_netdevice.
6176  */
6177 void unregister_netdev(struct net_device *dev)
6178 {
6179         rtnl_lock();
6180         unregister_netdevice(dev);
6181         rtnl_unlock();
6182 }
6183 EXPORT_SYMBOL(unregister_netdev);
6184
6185 /**
6186  *      dev_change_net_namespace - move device to different nethost namespace
6187  *      @dev: device
6188  *      @net: network namespace
6189  *      @pat: If not NULL name pattern to try if the current device name
6190  *            is already taken in the destination network namespace.
6191  *
6192  *      This function shuts down a device interface and moves it
6193  *      to a new network namespace. On success 0 is returned, on
6194  *      a failure a netagive errno code is returned.
6195  *
6196  *      Callers must hold the rtnl semaphore.
6197  */
6198
6199 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6200 {
6201         int err;
6202
6203         ASSERT_RTNL();
6204
6205         /* Don't allow namespace local devices to be moved. */
6206         err = -EINVAL;
6207         if (dev->features & NETIF_F_NETNS_LOCAL)
6208                 goto out;
6209
6210         /* Ensure the device has been registrered */
6211         err = -EINVAL;
6212         if (dev->reg_state != NETREG_REGISTERED)
6213                 goto out;
6214
6215         /* Get out if there is nothing todo */
6216         err = 0;
6217         if (net_eq(dev_net(dev), net))
6218                 goto out;
6219
6220         /* Pick the destination device name, and ensure
6221          * we can use it in the destination network namespace.
6222          */
6223         err = -EEXIST;
6224         if (__dev_get_by_name(net, dev->name)) {
6225                 /* We get here if we can't use the current device name */
6226                 if (!pat)
6227                         goto out;
6228                 if (dev_get_valid_name(dev, pat) < 0)
6229                         goto out;
6230         }
6231
6232         /*
6233          * And now a mini version of register_netdevice unregister_netdevice.
6234          */
6235
6236         /* If device is running close it first. */
6237         dev_close(dev);
6238
6239         /* And unlink it from device chain */
6240         err = -ENODEV;
6241         unlist_netdevice(dev);
6242
6243         synchronize_net();
6244
6245         /* Shutdown queueing discipline. */
6246         dev_shutdown(dev);
6247
6248         /* Notify protocols, that we are about to destroy
6249            this device. They should clean all the things.
6250
6251            Note that dev->reg_state stays at NETREG_REGISTERED.
6252            This is wanted because this way 8021q and macvlan know
6253            the device is just moving and can keep their slaves up.
6254         */
6255         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6256         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6257         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
6258
6259         /*
6260          *      Flush the unicast and multicast chains
6261          */
6262         dev_uc_flush(dev);
6263         dev_mc_flush(dev);
6264
6265         /* Actually switch the network namespace */
6266         dev_net_set(dev, net);
6267
6268         /* If there is an ifindex conflict assign a new one */
6269         if (__dev_get_by_index(net, dev->ifindex)) {
6270                 int iflink = (dev->iflink == dev->ifindex);
6271                 dev->ifindex = dev_new_index(net);
6272                 if (iflink)
6273                         dev->iflink = dev->ifindex;
6274         }
6275
6276         /* Fixup kobjects */
6277         err = device_rename(&dev->dev, dev->name);
6278         WARN_ON(err);
6279
6280         /* Add the device back in the hashes */
6281         list_netdevice(dev);
6282
6283         /* Notify protocols, that a new device appeared. */
6284         call_netdevice_notifiers(NETDEV_REGISTER, dev);
6285
6286         /*
6287          *      Prevent userspace races by waiting until the network
6288          *      device is fully setup before sending notifications.
6289          */
6290         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6291
6292         synchronize_net();
6293         err = 0;
6294 out:
6295         return err;
6296 }
6297 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6298
6299 static int dev_cpu_callback(struct notifier_block *nfb,
6300                             unsigned long action,
6301                             void *ocpu)
6302 {
6303         struct sk_buff **list_skb;
6304         struct sk_buff *skb;
6305         unsigned int cpu, oldcpu = (unsigned long)ocpu;
6306         struct softnet_data *sd, *oldsd;
6307
6308         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6309                 return NOTIFY_OK;
6310
6311         local_irq_disable();
6312         cpu = smp_processor_id();
6313         sd = &per_cpu(softnet_data, cpu);
6314         oldsd = &per_cpu(softnet_data, oldcpu);
6315
6316         /* Find end of our completion_queue. */
6317         list_skb = &sd->completion_queue;
6318         while (*list_skb)
6319                 list_skb = &(*list_skb)->next;
6320         /* Append completion queue from offline CPU. */
6321         *list_skb = oldsd->completion_queue;
6322         oldsd->completion_queue = NULL;
6323
6324         /* Append output queue from offline CPU. */
6325         if (oldsd->output_queue) {
6326                 *sd->output_queue_tailp = oldsd->output_queue;
6327                 sd->output_queue_tailp = oldsd->output_queue_tailp;
6328                 oldsd->output_queue = NULL;
6329                 oldsd->output_queue_tailp = &oldsd->output_queue;
6330         }
6331         /* Append NAPI poll list from offline CPU. */
6332         if (!list_empty(&oldsd->poll_list)) {
6333                 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6334                 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6335         }
6336
6337         raise_softirq_irqoff(NET_TX_SOFTIRQ);
6338         local_irq_enable();
6339
6340         /* Process offline CPU's input_pkt_queue */
6341         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6342                 netif_rx(skb);
6343                 input_queue_head_incr(oldsd);
6344         }
6345         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6346                 netif_rx(skb);
6347                 input_queue_head_incr(oldsd);
6348         }
6349
6350         return NOTIFY_OK;
6351 }
6352
6353
6354 /**
6355  *      netdev_increment_features - increment feature set by one
6356  *      @all: current feature set
6357  *      @one: new feature set
6358  *      @mask: mask feature set
6359  *
6360  *      Computes a new feature set after adding a device with feature set
6361  *      @one to the master device with current feature set @all.  Will not
6362  *      enable anything that is off in @mask. Returns the new feature set.
6363  */
6364 u32 netdev_increment_features(u32 all, u32 one, u32 mask)
6365 {
6366         if (mask & NETIF_F_GEN_CSUM)
6367                 mask |= NETIF_F_ALL_CSUM;
6368         mask |= NETIF_F_VLAN_CHALLENGED;
6369
6370         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6371         all &= one | ~NETIF_F_ALL_FOR_ALL;
6372
6373         /* If device needs checksumming, downgrade to it. */
6374         if (all & (NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM))
6375                 all &= ~NETIF_F_NO_CSUM;
6376
6377         /* If one device supports hw checksumming, set for all. */
6378         if (all & NETIF_F_GEN_CSUM)
6379                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6380
6381         return all;
6382 }
6383 EXPORT_SYMBOL(netdev_increment_features);
6384
6385 static struct hlist_head *netdev_create_hash(void)
6386 {
6387         int i;
6388         struct hlist_head *hash;
6389
6390         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6391         if (hash != NULL)
6392                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6393                         INIT_HLIST_HEAD(&hash[i]);
6394
6395         return hash;
6396 }
6397
6398 /* Initialize per network namespace state */
6399 static int __net_init netdev_init(struct net *net)
6400 {
6401         INIT_LIST_HEAD(&net->dev_base_head);
6402
6403         net->dev_name_head = netdev_create_hash();
6404         if (net->dev_name_head == NULL)
6405                 goto err_name;
6406
6407         net->dev_index_head = netdev_create_hash();
6408         if (net->dev_index_head == NULL)
6409                 goto err_idx;
6410
6411         return 0;
6412
6413 err_idx:
6414         kfree(net->dev_name_head);
6415 err_name:
6416         return -ENOMEM;
6417 }
6418
6419 /**
6420  *      netdev_drivername - network driver for the device
6421  *      @dev: network device
6422  *
6423  *      Determine network driver for device.
6424  */
6425 const char *netdev_drivername(const struct net_device *dev)
6426 {
6427         const struct device_driver *driver;
6428         const struct device *parent;
6429         const char *empty = "";
6430
6431         parent = dev->dev.parent;
6432         if (!parent)
6433                 return empty;
6434
6435         driver = parent->driver;
6436         if (driver && driver->name)
6437                 return driver->name;
6438         return empty;
6439 }
6440
6441 int __netdev_printk(const char *level, const struct net_device *dev,
6442                            struct va_format *vaf)
6443 {
6444         int r;
6445
6446         if (dev && dev->dev.parent)
6447                 r = dev_printk(level, dev->dev.parent, "%s: %pV",
6448                                netdev_name(dev), vaf);
6449         else if (dev)
6450                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6451         else
6452                 r = printk("%s(NULL net_device): %pV", level, vaf);
6453
6454         return r;
6455 }
6456 EXPORT_SYMBOL(__netdev_printk);
6457
6458 int netdev_printk(const char *level, const struct net_device *dev,
6459                   const char *format, ...)
6460 {
6461         struct va_format vaf;
6462         va_list args;
6463         int r;
6464
6465         va_start(args, format);
6466
6467         vaf.fmt = format;
6468         vaf.va = &args;
6469
6470         r = __netdev_printk(level, dev, &vaf);
6471         va_end(args);
6472
6473         return r;
6474 }
6475 EXPORT_SYMBOL(netdev_printk);
6476
6477 #define define_netdev_printk_level(func, level)                 \
6478 int func(const struct net_device *dev, const char *fmt, ...)    \
6479 {                                                               \
6480         int r;                                                  \
6481         struct va_format vaf;                                   \
6482         va_list args;                                           \
6483                                                                 \
6484         va_start(args, fmt);                                    \
6485                                                                 \
6486         vaf.fmt = fmt;                                          \
6487         vaf.va = &args;                                         \
6488                                                                 \
6489         r = __netdev_printk(level, dev, &vaf);                  \
6490         va_end(args);                                           \
6491                                                                 \
6492         return r;                                               \
6493 }                                                               \
6494 EXPORT_SYMBOL(func);
6495
6496 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6497 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6498 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6499 define_netdev_printk_level(netdev_err, KERN_ERR);
6500 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6501 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6502 define_netdev_printk_level(netdev_info, KERN_INFO);
6503
6504 static void __net_exit netdev_exit(struct net *net)
6505 {
6506         kfree(net->dev_name_head);
6507         kfree(net->dev_index_head);
6508 }
6509
6510 static struct pernet_operations __net_initdata netdev_net_ops = {
6511         .init = netdev_init,
6512         .exit = netdev_exit,
6513 };
6514
6515 static void __net_exit default_device_exit(struct net *net)
6516 {
6517         struct net_device *dev, *aux;
6518         /*
6519          * Push all migratable network devices back to the
6520          * initial network namespace
6521          */
6522         rtnl_lock();
6523         for_each_netdev_safe(net, dev, aux) {
6524                 int err;
6525                 char fb_name[IFNAMSIZ];
6526
6527                 /* Ignore unmoveable devices (i.e. loopback) */
6528                 if (dev->features & NETIF_F_NETNS_LOCAL)
6529                         continue;
6530
6531                 /* Leave virtual devices for the generic cleanup */
6532                 if (dev->rtnl_link_ops)
6533                         continue;
6534
6535                 /* Push remaining network devices to init_net */
6536                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6537                 err = dev_change_net_namespace(dev, &init_net, fb_name);
6538                 if (err) {
6539                         printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6540                                 __func__, dev->name, err);
6541                         BUG();
6542                 }
6543         }
6544         rtnl_unlock();
6545 }
6546
6547 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6548 {
6549         /* At exit all network devices most be removed from a network
6550          * namespace.  Do this in the reverse order of registration.
6551          * Do this across as many network namespaces as possible to
6552          * improve batching efficiency.
6553          */
6554         struct net_device *dev;
6555         struct net *net;
6556         LIST_HEAD(dev_kill_list);
6557
6558         rtnl_lock();
6559         list_for_each_entry(net, net_list, exit_list) {
6560                 for_each_netdev_reverse(net, dev) {
6561                         if (dev->rtnl_link_ops)
6562                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6563                         else
6564                                 unregister_netdevice_queue(dev, &dev_kill_list);
6565                 }
6566         }
6567         unregister_netdevice_many(&dev_kill_list);
6568         list_del(&dev_kill_list);
6569         rtnl_unlock();
6570 }
6571
6572 static struct pernet_operations __net_initdata default_device_ops = {
6573         .exit = default_device_exit,
6574         .exit_batch = default_device_exit_batch,
6575 };
6576
6577 /*
6578  *      Initialize the DEV module. At boot time this walks the device list and
6579  *      unhooks any devices that fail to initialise (normally hardware not
6580  *      present) and leaves us with a valid list of present and active devices.
6581  *
6582  */
6583
6584 /*
6585  *       This is called single threaded during boot, so no need
6586  *       to take the rtnl semaphore.
6587  */
6588 static int __init net_dev_init(void)
6589 {
6590         int i, rc = -ENOMEM;
6591
6592         BUG_ON(!dev_boot_phase);
6593
6594         if (dev_proc_init())
6595                 goto out;
6596
6597         if (netdev_kobject_init())
6598                 goto out;
6599
6600         INIT_LIST_HEAD(&ptype_all);
6601         for (i = 0; i < PTYPE_HASH_SIZE; i++)
6602                 INIT_LIST_HEAD(&ptype_base[i]);
6603
6604         if (register_pernet_subsys(&netdev_net_ops))
6605                 goto out;
6606
6607         /*
6608          *      Initialise the packet receive queues.
6609          */
6610
6611         for_each_possible_cpu(i) {
6612                 struct softnet_data *sd = &per_cpu(softnet_data, i);
6613
6614                 memset(sd, 0, sizeof(*sd));
6615                 skb_queue_head_init(&sd->input_pkt_queue);
6616                 skb_queue_head_init(&sd->process_queue);
6617                 sd->completion_queue = NULL;
6618                 INIT_LIST_HEAD(&sd->poll_list);
6619                 sd->output_queue = NULL;
6620                 sd->output_queue_tailp = &sd->output_queue;
6621 #ifdef CONFIG_RPS
6622                 sd->csd.func = rps_trigger_softirq;
6623                 sd->csd.info = sd;
6624                 sd->csd.flags = 0;
6625                 sd->cpu = i;
6626 #endif
6627
6628                 sd->backlog.poll = process_backlog;
6629                 sd->backlog.weight = weight_p;
6630                 sd->backlog.gro_list = NULL;
6631                 sd->backlog.gro_count = 0;
6632         }
6633
6634         dev_boot_phase = 0;
6635
6636         /* The loopback device is special if any other network devices
6637          * is present in a network namespace the loopback device must
6638          * be present. Since we now dynamically allocate and free the
6639          * loopback device ensure this invariant is maintained by
6640          * keeping the loopback device as the first device on the
6641          * list of network devices.  Ensuring the loopback devices
6642          * is the first device that appears and the last network device
6643          * that disappears.
6644          */
6645         if (register_pernet_device(&loopback_net_ops))
6646                 goto out;
6647
6648         if (register_pernet_device(&default_device_ops))
6649                 goto out;
6650
6651         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6652         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6653
6654         hotcpu_notifier(dev_cpu_callback, 0);
6655         dst_init();
6656         dev_mcast_init();
6657         rc = 0;
6658 out:
6659         return rc;
6660 }
6661
6662 subsys_initcall(net_dev_init);
6663
6664 static int __init initialize_hashrnd(void)
6665 {
6666         get_random_bytes(&hashrnd, sizeof(hashrnd));
6667         return 0;
6668 }
6669
6670 late_initcall_sync(initialize_hashrnd);
6671