net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <asm/system.h>
  77 #include <linux/bitops.h>
  78 #include <linux/capability.h>
  79 #include <linux/cpu.h>
  80 #include <linux/types.h>
  81 #include <linux/kernel.h>
  82 #include <linux/sched.h>
  83 #include <linux/mutex.h>
  84 #include <linux/string.h>
  85 #include <linux/mm.h>
  86 #include <linux/socket.h>
  87 #include <linux/sockios.h>
  88 #include <linux/errno.h>
  89 #include <linux/interrupt.h>
  90 #include <linux/if_ether.h>
  91 #include <linux/netdevice.h>
  92 #include <linux/etherdevice.h>
  93 #include <linux/ethtool.h>
  94 #include <linux/notifier.h>
  95 #include <linux/skbuff.h>
  96 #include <net/net_namespace.h>
  97 #include <net/sock.h>
  98 #include <linux/rtnetlink.h>
  99 #include <linux/proc_fs.h>
 100 #include <linux/seq_file.h>
 101 #include <linux/stat.h>
 102 #include <linux/if_bridge.h>
 103 #include <linux/if_macvlan.h>
 104 #include <net/dst.h>
 105 #include <net/pkt_sched.h>
 106 #include <net/checksum.h>
 107 #include <linux/highmem.h>
 108 #include <linux/init.h>
 109 #include <linux/kmod.h>
 110 #include <linux/module.h>
 111 #include <linux/netpoll.h>
 112 #include <linux/rcupdate.h>
 113 #include <linux/delay.h>
 114 #include <net/wext.h>
 115 #include <net/iw_handler.h>
 116 #include <asm/current.h>
 117 #include <linux/audit.h>
 118 #include <linux/dmaengine.h>
 119 #include <linux/err.h>
 120 #include <linux/ctype.h>
 121 #include <linux/if_arp.h>
 122 #include <linux/if_vlan.h>
 123 #include <linux/ip.h>
 124 #include <net/ip.h>
 125 #include <linux/ipv6.h>
 126 #include <linux/in.h>
 127 #include <linux/jhash.h>
 128 #include <linux/random.h>
 129 #include <trace/events/napi.h>
 130
 131 #include "net-sysfs.h"
 132
 133 /* Instead of increasing this, you should create a hash table. */
 134 #define MAX_GRO_SKBS 8
 135
 136 /* This should be increased if a protocol with a bigger head is added. */
 137 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 138
 139 /*
 140  *      The list of packet types we will receive (as opposed to discard)
 141  *      and the routines to invoke.
 142  *
 143  *      Why 16. Because with 16 the only overlap we get on a hash of the
 144  *      low nibble of the protocol value is RARP/SNAP/X.25.
 145  *
 146  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 147  *             sure which should go first, but I bet it won't make much
 148  *             difference if we are running VLANs.  The good news is that
 149  *             this protocol won't be in the list unless compiled in, so
 150  *             the average user (w/out VLANs) will not be adversely affected.
 151  *             --BLG
 152  *
 153  *              0800    IP
 154  *              8100    802.1Q VLAN
 155  *              0001    802.3
 156  *              0002    AX.25
 157  *              0004    802.2
 158  *              8035    RARP
 159  *              0005    SNAP
 160  *              0805    X.25
 161  *              0806    ARP
 162  *              8137    IPX
 163  *              0009    Localtalk
 164  *              86DD    IPv6
 165  */
 166
 167 #define PTYPE_HASH_SIZE (16)
 168 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
 169
 170 static DEFINE_SPINLOCK(ptype_lock);
 171 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 172 static struct list_head ptype_all __read_mostly;        /* Taps */
 173
 174 /*
 175  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 176  * semaphore.
 177  *
 178  * Pure readers hold dev_base_lock for reading.
 179  *
 180  * Writers must hold the rtnl semaphore while they loop through the
 181  * dev_base_head list, and hold dev_base_lock for writing when they do the
 182  * actual updates.  This allows pure readers to access the list even
 183  * while a writer is preparing to update it.
 184  *
 185  * To put it another way, dev_base_lock is held for writing only to
 186  * protect against pure readers; the rtnl semaphore provides the
 187  * protection against other writers.
 188  *
 189  * See, for example usages, register_netdevice() and
 190  * unregister_netdevice(), which must be called with the rtnl
 191  * semaphore held.
 192  */
 193 DEFINE_RWLOCK(dev_base_lock);
 194 EXPORT_SYMBOL(dev_base_lock);
 195
 196 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 197 {
 198         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 199         return &net->dev_name_head[hash & (NETDEV_HASHENTRIES - 1)];
 200 }
 201
 202 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 203 {
 204         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 205 }
 206
 207 /* Device list insertion */
 208 static int list_netdevice(struct net_device *dev)
 209 {
 210         struct net *net = dev_net(dev);
 211
 212         ASSERT_RTNL();
 213
 214         write_lock_bh(&dev_base_lock);
 215         list_add_tail(&dev->dev_list, &net->dev_base_head);
 216         hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
 217         hlist_add_head_rcu(&dev->index_hlist,
 218                            dev_index_hash(net, dev->ifindex));
 219         write_unlock_bh(&dev_base_lock);
 220         return 0;
 221 }
 222
 223 /* Device list removal
 224  * caller must respect a RCU grace period before freeing/reusing dev
 225  */
 226 static void unlist_netdevice(struct net_device *dev)
 227 {
 228         ASSERT_RTNL();
 229
 230         /* Unlink dev from the device chain */
 231         write_lock_bh(&dev_base_lock);
 232         list_del(&dev->dev_list);
 233         hlist_del(&dev->name_hlist);
 234         hlist_del_rcu(&dev->index_hlist);
 235         write_unlock_bh(&dev_base_lock);
 236 }
 237
 238 /*
 239  *      Our notifier list
 240  */
 241
 242 static RAW_NOTIFIER_HEAD(netdev_chain);
 243
 244 /*
 245  *      Device drivers call our routines to queue packets here. We empty the
 246  *      queue in the local softnet handler.
 247  */
 248
 249 DEFINE_PER_CPU(struct softnet_data, softnet_data);
 250 EXPORT_PER_CPU_SYMBOL(softnet_data);
 251
 252 #ifdef CONFIG_LOCKDEP
 253 /*
 254  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 255  * according to dev->type
 256  */
 257 static const unsigned short netdev_lock_type[] =
 258         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 259          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 260          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 261          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 262          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 263          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 264          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 265          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 266          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 267          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 268          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 269          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 270          ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 271          ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
 272          ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
 273          ARPHRD_VOID, ARPHRD_NONE};
 274
 275 static const char *const netdev_lock_name[] =
 276         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 277          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 278          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 279          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 280          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 281          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 282          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 283          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 284          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 285          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 286          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 287          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 288          "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 289          "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
 290          "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
 291          "_xmit_VOID", "_xmit_NONE"};
 292
 293 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 294 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 295
 296 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 297 {
 298         int i;
 299
 300         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 301                 if (netdev_lock_type[i] == dev_type)
 302                         return i;
 303         /* the last key is used by default */
 304         return ARRAY_SIZE(netdev_lock_type) - 1;
 305 }
 306
 307 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 308                                                  unsigned short dev_type)
 309 {
 310         int i;
 311
 312         i = netdev_lock_pos(dev_type);
 313         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 314                                    netdev_lock_name[i]);
 315 }
 316
 317 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 318 {
 319         int i;
 320
 321         i = netdev_lock_pos(dev->type);
 322         lockdep_set_class_and_name(&dev->addr_list_lock,
 323                                    &netdev_addr_lock_key[i],
 324                                    netdev_lock_name[i]);
 325 }
 326 #else
 327 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 328                                                  unsigned short dev_type)
 329 {
 330 }
 331 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 332 {
 333 }
 334 #endif
 335
 336 /*******************************************************************************
 337
 338                 Protocol management and registration routines
 339
 340 *******************************************************************************/
 341
 342 /*
 343  *      Add a protocol ID to the list. Now that the input handler is
 344  *      smarter we can dispense with all the messy stuff that used to be
 345  *      here.
 346  *
 347  *      BEWARE!!! Protocol handlers, mangling input packets,
 348  *      MUST BE last in hash buckets and checking protocol handlers
 349  *      MUST start from promiscuous ptype_all chain in net_bh.
 350  *      It is true now, do not change it.
 351  *      Explanation follows: if protocol handler, mangling packet, will
 352  *      be the first on list, it is not able to sense, that packet
 353  *      is cloned and should be copied-on-write, so that it will
 354  *      change it and subsequent readers will get broken packet.
 355  *                                                      --ANK (980803)
 356  */
 357
 358 /**
 359  *      dev_add_pack - add packet handler
 360  *      @pt: packet type declaration
 361  *
 362  *      Add a protocol handler to the networking stack. The passed &packet_type
 363  *      is linked into kernel lists and may not be freed until it has been
 364  *      removed from the kernel lists.
 365  *
 366  *      This call does not sleep therefore it can not
 367  *      guarantee all CPU's that are in middle of receiving packets
 368  *      will see the new packet type (until the next received packet).
 369  */
 370
 371 void dev_add_pack(struct packet_type *pt)
 372 {
 373         int hash;
 374
 375         spin_lock_bh(&ptype_lock);
 376         if (pt->type == htons(ETH_P_ALL))
 377                 list_add_rcu(&pt->list, &ptype_all);
 378         else {
 379                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
 380                 list_add_rcu(&pt->list, &ptype_base[hash]);
 381         }
 382         spin_unlock_bh(&ptype_lock);
 383 }
 384 EXPORT_SYMBOL(dev_add_pack);
 385
 386 /**
 387  *      __dev_remove_pack        - remove packet handler
 388  *      @pt: packet type declaration
 389  *
 390  *      Remove a protocol handler that was previously added to the kernel
 391  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 392  *      from the kernel lists and can be freed or reused once this function
 393  *      returns.
 394  *
 395  *      The packet type might still be in use by receivers
 396  *      and must not be freed until after all the CPU's have gone
 397  *      through a quiescent state.
 398  */
 399 void __dev_remove_pack(struct packet_type *pt)
 400 {
 401         struct list_head *head;
 402         struct packet_type *pt1;
 403
 404         spin_lock_bh(&ptype_lock);
 405
 406         if (pt->type == htons(ETH_P_ALL))
 407                 head = &ptype_all;
 408         else
 409                 head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 410
 411         list_for_each_entry(pt1, head, list) {
 412                 if (pt == pt1) {
 413                         list_del_rcu(&pt->list);
 414                         goto out;
 415                 }
 416         }
 417
 418         printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 419 out:
 420         spin_unlock_bh(&ptype_lock);
 421 }
 422 EXPORT_SYMBOL(__dev_remove_pack);
 423
 424 /**
 425  *      dev_remove_pack  - remove packet handler
 426  *      @pt: packet type declaration
 427  *
 428  *      Remove a protocol handler that was previously added to the kernel
 429  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 430  *      from the kernel lists and can be freed or reused once this function
 431  *      returns.
 432  *
 433  *      This call sleeps to guarantee that no CPU is looking at the packet
 434  *      type after return.
 435  */
 436 void dev_remove_pack(struct packet_type *pt)
 437 {
 438         __dev_remove_pack(pt);
 439
 440         synchronize_net();
 441 }
 442 EXPORT_SYMBOL(dev_remove_pack);
 443
 444 /******************************************************************************
 445
 446                       Device Boot-time Settings Routines
 447
 448 *******************************************************************************/
 449
 450 /* Boot time configuration table */
 451 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 452
 453 /**
 454  *      netdev_boot_setup_add   - add new setup entry
 455  *      @name: name of the device
 456  *      @map: configured settings for the device
 457  *
 458  *      Adds new setup entry to the dev_boot_setup list.  The function
 459  *      returns 0 on error and 1 on success.  This is a generic routine to
 460  *      all netdevices.
 461  */
 462 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 463 {
 464         struct netdev_boot_setup *s;
 465         int i;
 466
 467         s = dev_boot_setup;
 468         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 469                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 470                         memset(s[i].name, 0, sizeof(s[i].name));
 471                         strlcpy(s[i].name, name, IFNAMSIZ);
 472                         memcpy(&s[i].map, map, sizeof(s[i].map));
 473                         break;
 474                 }
 475         }
 476
 477         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 478 }
 479
 480 /**
 481  *      netdev_boot_setup_check - check boot time settings
 482  *      @dev: the netdevice
 483  *
 484  *      Check boot time settings for the device.
 485  *      The found settings are set for the device to be used
 486  *      later in the device probing.
 487  *      Returns 0 if no settings found, 1 if they are.
 488  */
 489 int netdev_boot_setup_check(struct net_device *dev)
 490 {
 491         struct netdev_boot_setup *s = dev_boot_setup;
 492         int i;
 493
 494         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 495                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 496                     !strcmp(dev->name, s[i].name)) {
 497                         dev->irq        = s[i].map.irq;
 498                         dev->base_addr  = s[i].map.base_addr;
 499                         dev->mem_start  = s[i].map.mem_start;
 500                         dev->mem_end    = s[i].map.mem_end;
 501                         return 1;
 502                 }
 503         }
 504         return 0;
 505 }
 506 EXPORT_SYMBOL(netdev_boot_setup_check);
 507
 508
 509 /**
 510  *      netdev_boot_base        - get address from boot time settings
 511  *      @prefix: prefix for network device
 512  *      @unit: id for network device
 513  *
 514  *      Check boot time settings for the base address of device.
 515  *      The found settings are set for the device to be used
 516  *      later in the device probing.
 517  *      Returns 0 if no settings found.
 518  */
 519 unsigned long netdev_boot_base(const char *prefix, int unit)
 520 {
 521         const struct netdev_boot_setup *s = dev_boot_setup;
 522         char name[IFNAMSIZ];
 523         int i;
 524
 525         sprintf(name, "%s%d", prefix, unit);
 526
 527         /*
 528          * If device already registered then return base of 1
 529          * to indicate not to probe for this interface
 530          */
 531         if (__dev_get_by_name(&init_net, name))
 532                 return 1;
 533
 534         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 535                 if (!strcmp(name, s[i].name))
 536                         return s[i].map.base_addr;
 537         return 0;
 538 }
 539
 540 /*
 541  * Saves at boot time configured settings for any netdevice.
 542  */
 543 int __init netdev_boot_setup(char *str)
 544 {
 545         int ints[5];
 546         struct ifmap map;
 547
 548         str = get_options(str, ARRAY_SIZE(ints), ints);
 549         if (!str || !*str)
 550                 return 0;
 551
 552         /* Save settings */
 553         memset(&map, 0, sizeof(map));
 554         if (ints[0] > 0)
 555                 map.irq = ints[1];
 556         if (ints[0] > 1)
 557                 map.base_addr = ints[2];
 558         if (ints[0] > 2)
 559                 map.mem_start = ints[3];
 560         if (ints[0] > 3)
 561                 map.mem_end = ints[4];
 562
 563         /* Add new entry to the list */
 564         return netdev_boot_setup_add(str, &map);
 565 }
 566
 567 __setup("netdev=", netdev_boot_setup);
 568
 569 /*******************************************************************************
 570
 571                             Device Interface Subroutines
 572
 573 *******************************************************************************/
 574
 575 /**
 576  *      __dev_get_by_name       - find a device by its name
 577  *      @net: the applicable net namespace
 578  *      @name: name to find
 579  *
 580  *      Find an interface by name. Must be called under RTNL semaphore
 581  *      or @dev_base_lock. If the name is found a pointer to the device
 582  *      is returned. If the name is not found then %NULL is returned. The
 583  *      reference counters are not incremented so the caller must be
 584  *      careful with locks.
 585  */
 586
 587 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 588 {
 589         struct hlist_node *p;
 590
 591         hlist_for_each(p, dev_name_hash(net, name)) {
 592                 struct net_device *dev
 593                         = hlist_entry(p, struct net_device, name_hlist);
 594                 if (!strncmp(dev->name, name, IFNAMSIZ))
 595                         return dev;
 596         }
 597         return NULL;
 598 }
 599 EXPORT_SYMBOL(__dev_get_by_name);
 600
 601 /**
 602  *      dev_get_by_name         - find a device by its name
 603  *      @net: the applicable net namespace
 604  *      @name: name to find
 605  *
 606  *      Find an interface by name. This can be called from any
 607  *      context and does its own locking. The returned handle has
 608  *      the usage count incremented and the caller must use dev_put() to
 609  *      release it when it is no longer needed. %NULL is returned if no
 610  *      matching device is found.
 611  */
 612
 613 struct net_device *dev_get_by_name(struct net *net, const char *name)
 614 {
 615         struct net_device *dev;
 616
 617         read_lock(&dev_base_lock);
 618         dev = __dev_get_by_name(net, name);
 619         if (dev)
 620                 dev_hold(dev);
 621         read_unlock(&dev_base_lock);
 622         return dev;
 623 }
 624 EXPORT_SYMBOL(dev_get_by_name);
 625
 626 /**
 627  *      __dev_get_by_index - find a device by its ifindex
 628  *      @net: the applicable net namespace
 629  *      @ifindex: index of device
 630  *
 631  *      Search for an interface by index. Returns %NULL if the device
 632  *      is not found or a pointer to the device. The device has not
 633  *      had its reference counter increased so the caller must be careful
 634  *      about locking. The caller must hold either the RTNL semaphore
 635  *      or @dev_base_lock.
 636  */
 637
 638 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 639 {
 640         struct hlist_node *p;
 641
 642         hlist_for_each(p, dev_index_hash(net, ifindex)) {
 643                 struct net_device *dev
 644                         = hlist_entry(p, struct net_device, index_hlist);
 645                 if (dev->ifindex == ifindex)
 646                         return dev;
 647         }
 648         return NULL;
 649 }
 650 EXPORT_SYMBOL(__dev_get_by_index);
 651
 652 /**
 653  *      dev_get_by_index_rcu - find a device by its ifindex
 654  *      @net: the applicable net namespace
 655  *      @ifindex: index of device
 656  *
 657  *      Search for an interface by index. Returns %NULL if the device
 658  *      is not found or a pointer to the device. The device has not
 659  *      had its reference counter increased so the caller must be careful
 660  *      about locking. The caller must hold RCU lock.
 661  */
 662
 663 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 664 {
 665         struct hlist_node *p;
 666         struct net_device *dev;
 667         struct hlist_head *head = dev_index_hash(net, ifindex);
 668
 669         hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 670                 if (dev->ifindex == ifindex)
 671                         return dev;
 672
 673         return NULL;
 674 }
 675 EXPORT_SYMBOL(dev_get_by_index_rcu);
 676
 677
 678 /**
 679  *      dev_get_by_index - find a device by its ifindex
 680  *      @net: the applicable net namespace
 681  *      @ifindex: index of device
 682  *
 683  *      Search for an interface by index. Returns NULL if the device
 684  *      is not found or a pointer to the device. The device returned has
 685  *      had a reference added and the pointer is safe until the user calls
 686  *      dev_put to indicate they have finished with it.
 687  */
 688
 689 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 690 {
 691         struct net_device *dev;
 692
 693         rcu_read_lock();
 694         dev = dev_get_by_index_rcu(net, ifindex);
 695         if (dev)
 696                 dev_hold(dev);
 697         rcu_read_unlock();
 698         return dev;
 699 }
 700 EXPORT_SYMBOL(dev_get_by_index);
 701
 702 /**
 703  *      dev_getbyhwaddr - find a device by its hardware address
 704  *      @net: the applicable net namespace
 705  *      @type: media type of device
 706  *      @ha: hardware address
 707  *
 708  *      Search for an interface by MAC address. Returns NULL if the device
 709  *      is not found or a pointer to the device. The caller must hold the
 710  *      rtnl semaphore. The returned device has not had its ref count increased
 711  *      and the caller must therefore be careful about locking
 712  *
 713  *      BUGS:
 714  *      If the API was consistent this would be __dev_get_by_hwaddr
 715  */
 716
 717 struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
 718 {
 719         struct net_device *dev;
 720
 721         ASSERT_RTNL();
 722
 723         for_each_netdev(net, dev)
 724                 if (dev->type == type &&
 725                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 726                         return dev;
 727
 728         return NULL;
 729 }
 730 EXPORT_SYMBOL(dev_getbyhwaddr);
 731
 732 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 733 {
 734         struct net_device *dev;
 735
 736         ASSERT_RTNL();
 737         for_each_netdev(net, dev)
 738                 if (dev->type == type)
 739                         return dev;
 740
 741         return NULL;
 742 }
 743 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 744
 745 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 746 {
 747         struct net_device *dev;
 748
 749         rtnl_lock();
 750         dev = __dev_getfirstbyhwtype(net, type);
 751         if (dev)
 752                 dev_hold(dev);
 753         rtnl_unlock();
 754         return dev;
 755 }
 756 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 757
 758 /**
 759  *      dev_get_by_flags - find any device with given flags
 760  *      @net: the applicable net namespace
 761  *      @if_flags: IFF_* values
 762  *      @mask: bitmask of bits in if_flags to check
 763  *
 764  *      Search for any interface with the given flags. Returns NULL if a device
 765  *      is not found or a pointer to the device. The device returned has
 766  *      had a reference added and the pointer is safe until the user calls
 767  *      dev_put to indicate they have finished with it.
 768  */
 769
 770 struct net_device *dev_get_by_flags(struct net *net, unsigned short if_flags,
 771                                     unsigned short mask)
 772 {
 773         struct net_device *dev, *ret;
 774
 775         ret = NULL;
 776         read_lock(&dev_base_lock);
 777         for_each_netdev(net, dev) {
 778                 if (((dev->flags ^ if_flags) & mask) == 0) {
 779                         dev_hold(dev);
 780                         ret = dev;
 781                         break;
 782                 }
 783         }
 784         read_unlock(&dev_base_lock);
 785         return ret;
 786 }
 787 EXPORT_SYMBOL(dev_get_by_flags);
 788
 789 /**
 790  *      dev_valid_name - check if name is okay for network device
 791  *      @name: name string
 792  *
 793  *      Network device names need to be valid file names to
 794  *      to allow sysfs to work.  We also disallow any kind of
 795  *      whitespace.
 796  */
 797 int dev_valid_name(const char *name)
 798 {
 799         if (*name == '\0')
 800                 return 0;
 801         if (strlen(name) >= IFNAMSIZ)
 802                 return 0;
 803         if (!strcmp(name, ".") || !strcmp(name, ".."))
 804                 return 0;
 805
 806         while (*name) {
 807                 if (*name == '/' || isspace(*name))
 808                         return 0;
 809                 name++;
 810         }
 811         return 1;
 812 }
 813 EXPORT_SYMBOL(dev_valid_name);
 814
 815 /**
 816  *      __dev_alloc_name - allocate a name for a device
 817  *      @net: network namespace to allocate the device name in
 818  *      @name: name format string
 819  *      @buf:  scratch buffer and result name string
 820  *
 821  *      Passed a format string - eg "lt%d" it will try and find a suitable
 822  *      id. It scans list of devices to build up a free map, then chooses
 823  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 824  *      while allocating the name and adding the device in order to avoid
 825  *      duplicates.
 826  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 827  *      Returns the number of the unit assigned or a negative errno code.
 828  */
 829
 830 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 831 {
 832         int i = 0;
 833         const char *p;
 834         const int max_netdevices = 8*PAGE_SIZE;
 835         unsigned long *inuse;
 836         struct net_device *d;
 837
 838         p = strnchr(name, IFNAMSIZ-1, '%');
 839         if (p) {
 840                 /*
 841                  * Verify the string as this thing may have come from
 842                  * the user.  There must be either one "%d" and no other "%"
 843                  * characters.
 844                  */
 845                 if (p[1] != 'd' || strchr(p + 2, '%'))
 846                         return -EINVAL;
 847
 848                 /* Use one page as a bit array of possible slots */
 849                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 850                 if (!inuse)
 851                         return -ENOMEM;
 852
 853                 for_each_netdev(net, d) {
 854                         if (!sscanf(d->name, name, &i))
 855                                 continue;
 856                         if (i < 0 || i >= max_netdevices)
 857                                 continue;
 858
 859                         /*  avoid cases where sscanf is not exact inverse of printf */
 860                         snprintf(buf, IFNAMSIZ, name, i);
 861                         if (!strncmp(buf, d->name, IFNAMSIZ))
 862                                 set_bit(i, inuse);
 863                 }
 864
 865                 i = find_first_zero_bit(inuse, max_netdevices);
 866                 free_page((unsigned long) inuse);
 867         }
 868
 869         snprintf(buf, IFNAMSIZ, name, i);
 870         if (!__dev_get_by_name(net, buf))
 871                 return i;
 872
 873         /* It is possible to run out of possible slots
 874          * when the name is long and there isn't enough space left
 875          * for the digits, or if all bits are used.
 876          */
 877         return -ENFILE;
 878 }
 879
 880 /**
 881  *      dev_alloc_name - allocate a name for a device
 882  *      @dev: device
 883  *      @name: name format string
 884  *
 885  *      Passed a format string - eg "lt%d" it will try and find a suitable
 886  *      id. It scans list of devices to build up a free map, then chooses
 887  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 888  *      while allocating the name and adding the device in order to avoid
 889  *      duplicates.
 890  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 891  *      Returns the number of the unit assigned or a negative errno code.
 892  */
 893
 894 int dev_alloc_name(struct net_device *dev, const char *name)
 895 {
 896         char buf[IFNAMSIZ];
 897         struct net *net;
 898         int ret;
 899
 900         BUG_ON(!dev_net(dev));
 901         net = dev_net(dev);
 902         ret = __dev_alloc_name(net, name, buf);
 903         if (ret >= 0)
 904                 strlcpy(dev->name, buf, IFNAMSIZ);
 905         return ret;
 906 }
 907 EXPORT_SYMBOL(dev_alloc_name);
 908
 909
 910 /**
 911  *      dev_change_name - change name of a device
 912  *      @dev: device
 913  *      @newname: name (or format string) must be at least IFNAMSIZ
 914  *
 915  *      Change name of a device, can pass format strings "eth%d".
 916  *      for wildcarding.
 917  */
 918 int dev_change_name(struct net_device *dev, const char *newname)
 919 {
 920         char oldname[IFNAMSIZ];
 921         int err = 0;
 922         int ret;
 923         struct net *net;
 924
 925         ASSERT_RTNL();
 926         BUG_ON(!dev_net(dev));
 927
 928         net = dev_net(dev);
 929         if (dev->flags & IFF_UP)
 930                 return -EBUSY;
 931
 932         if (!dev_valid_name(newname))
 933                 return -EINVAL;
 934
 935         if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
 936                 return 0;
 937
 938         memcpy(oldname, dev->name, IFNAMSIZ);
 939
 940         if (strchr(newname, '%')) {
 941                 err = dev_alloc_name(dev, newname);
 942                 if (err < 0)
 943                         return err;
 944         } else if (__dev_get_by_name(net, newname))
 945                 return -EEXIST;
 946         else
 947                 strlcpy(dev->name, newname, IFNAMSIZ);
 948
 949 rollback:
 950         /* For now only devices in the initial network namespace
 951          * are in sysfs.
 952          */
 953         if (net == &init_net) {
 954                 ret = device_rename(&dev->dev, dev->name);
 955                 if (ret) {
 956                         memcpy(dev->name, oldname, IFNAMSIZ);
 957                         return ret;
 958                 }
 959         }
 960
 961         write_lock_bh(&dev_base_lock);
 962         hlist_del(&dev->name_hlist);
 963         hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
 964         write_unlock_bh(&dev_base_lock);
 965
 966         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
 967         ret = notifier_to_errno(ret);
 968
 969         if (ret) {
 970                 if (err) {
 971                         printk(KERN_ERR
 972                                "%s: name change rollback failed: %d.\n",
 973                                dev->name, ret);
 974                 } else {
 975                         err = ret;
 976                         memcpy(dev->name, oldname, IFNAMSIZ);
 977                         goto rollback;
 978                 }
 979         }
 980
 981         return err;
 982 }
 983
 984 /**
 985  *      dev_set_alias - change ifalias of a device
 986  *      @dev: device
 987  *      @alias: name up to IFALIASZ
 988  *      @len: limit of bytes to copy from info
 989  *
 990  *      Set ifalias for a device,
 991  */
 992 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
 993 {
 994         ASSERT_RTNL();
 995
 996         if (len >= IFALIASZ)
 997                 return -EINVAL;
 998
 999         if (!len) {
1000                 if (dev->ifalias) {
1001                         kfree(dev->ifalias);
1002                         dev->ifalias = NULL;
1003                 }
1004                 return 0;
1005         }
1006
1007         dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1008         if (!dev->ifalias)
1009                 return -ENOMEM;
1010
1011         strlcpy(dev->ifalias, alias, len+1);
1012         return len;
1013 }
1014
1015
/**
 *      netdev_features_change - device changes features
 *      @dev: device to cause notification
 *
 *      Called to indicate a device has changed features. Runs the
 *      netdevice notifier chain with the NETDEV_FEAT_CHANGE event for
 *      @dev; the result of the chain is not examined.
 */
void netdev_features_change(struct net_device *dev)
{
        call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);
1027
1028 /**
1029  *      netdev_state_change - device changes state
1030  *      @dev: device to cause notification
1031  *
1032  *      Called to indicate a device has changed state. This function calls
1033  *      the notifier chains for netdev_chain and sends a NEWLINK message
1034  *      to the routing socket.
1035  */
1036 void netdev_state_change(struct net_device *dev)
1037 {
1038         if (dev->flags & IFF_UP) {
1039                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1040                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1041         }
1042 }
1043 EXPORT_SYMBOL(netdev_state_change);
1044
/**
 *      netdev_bonding_change - run the netdev notifiers for a given event
 *      @dev: device the event is about
 *      @event: notifier event code, passed on unmodified
 *
 *      Calls the netdevice notifier chain with @event for @dev. The
 *      result of the chain is not examined.
 */
void netdev_bonding_change(struct net_device *dev, unsigned long event)
{
        call_netdevice_notifiers(event, dev);
}
EXPORT_SYMBOL(netdev_bonding_change);
1050
1051 /**
1052  *      dev_load        - load a network module
1053  *      @net: the applicable net namespace
1054  *      @name: name of interface
1055  *
1056  *      If a network interface is not present and the process has suitable
1057  *      privileges this function loads the module. If module loading is not
1058  *      available in this kernel then it becomes a nop.
1059  */
1060
1061 void dev_load(struct net *net, const char *name)
1062 {
1063         struct net_device *dev;
1064
1065         read_lock(&dev_base_lock);
1066         dev = __dev_get_by_name(net, name);
1067         read_unlock(&dev_base_lock);
1068
1069         if (!dev && capable(CAP_NET_ADMIN))
1070                 request_module("%s", name);
1071 }
1072 EXPORT_SYMBOL(dev_load);
1073
1074 /**
1075  *      dev_open        - prepare an interface for use.
1076  *      @dev:   device to open
1077  *
1078  *      Takes a device from down to up state. The device's private open
1079  *      function is invoked and then the multicast lists are loaded. Finally
1080  *      the device is moved into the up state and a %NETDEV_UP message is
1081  *      sent to the netdev notifier chain.
1082  *
1083  *      Calling this function on an active interface is a nop. On a failure
1084  *      a negative errno code is returned.
1085  */
1086 int dev_open(struct net_device *dev)
1087 {
1088         const struct net_device_ops *ops = dev->netdev_ops;
1089         int ret;
1090
1091         ASSERT_RTNL();
1092
1093         /*
1094          *      Is it already up?
1095          */
1096
1097         if (dev->flags & IFF_UP)
1098                 return 0;
1099
1100         /*
1101          *      Is it even present?
1102          */
1103         if (!netif_device_present(dev))
1104                 return -ENODEV;
1105
1106         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1107         ret = notifier_to_errno(ret);
1108         if (ret)
1109                 return ret;
1110
1111         /*
1112          *      Call device private open method
1113          */
1114         set_bit(__LINK_STATE_START, &dev->state);
1115
1116         if (ops->ndo_validate_addr)
1117                 ret = ops->ndo_validate_addr(dev);
1118
1119         if (!ret && ops->ndo_open)
1120                 ret = ops->ndo_open(dev);
1121
1122         /*
1123          *      If it went open OK then:
1124          */
1125
1126         if (ret)
1127                 clear_bit(__LINK_STATE_START, &dev->state);
1128         else {
1129                 /*
1130                  *      Set the flags.
1131                  */
1132                 dev->flags |= IFF_UP;
1133
1134                 /*
1135                  *      Enable NET_DMA
1136                  */
1137                 net_dmaengine_get();
1138
1139                 /*
1140                  *      Initialize multicasting status
1141                  */
1142                 dev_set_rx_mode(dev);
1143
1144                 /*
1145                  *      Wakeup transmit queue engine
1146                  */
1147                 dev_activate(dev);
1148
1149                 /*
1150                  *      ... and announce new interface.
1151                  */
1152                 call_netdevice_notifiers(NETDEV_UP, dev);
1153         }
1154
1155         return ret;
1156 }
1157 EXPORT_SYMBOL(dev_open);
1158
/**
 *      dev_close - shutdown an interface.
 *      @dev: device to shutdown
 *
 *      This function moves an active device into down state. A
 *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 *      chain.
 *
 *      Must be called with the RTNL held and may sleep. Always returns 0;
 *      calling it on a device that is not up is a nop.
 */
int dev_close(struct net_device *dev)
{
        const struct net_device_ops *ops = dev->netdev_ops;
        ASSERT_RTNL();

        might_sleep();

        /* Nothing to do for a device that is already down */
        if (!(dev->flags & IFF_UP))
                return 0;

        /*
         *      Tell people we are going down, so that they can
         *      prepare for it while the device is still operating.
         */
        call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

        clear_bit(__LINK_STATE_START, &dev->state);

        /* Synchronize to scheduled poll. We cannot touch poll list,
         * it can be even on different cpu. So just clear netif_running().
         *
         * dev->stop() will invoke napi_disable() on all of its
         * napi_struct instances on this device.
         */
        smp_mb__after_clear_bit(); /* Commit netif_running(). */

        dev_deactivate(dev);

        /*
         *      Call the device specific close. This cannot fail.
         *      Only if device is UP
         *
         *      We allow it to be called even after a DETACH hot-plug
         *      event.
         */
        if (ops->ndo_stop)
                ops->ndo_stop(dev);

        /*
         *      Device is now down.
         */

        dev->flags &= ~IFF_UP;

        /*
         * Tell people we are down
         */
        call_netdevice_notifiers(NETDEV_DOWN, dev);

        /*
         *      Shutdown NET_DMA
         */
        net_dmaengine_put();

        return 0;
}
EXPORT_SYMBOL(dev_close);
1225
1226
1227 /**
1228  *      dev_disable_lro - disable Large Receive Offload on a device
1229  *      @dev: device
1230  *
1231  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1232  *      called under RTNL.  This is needed if received packets may be
1233  *      forwarded to another interface.
1234  */
1235 void dev_disable_lro(struct net_device *dev)
1236 {
1237         if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1238             dev->ethtool_ops->set_flags) {
1239                 u32 flags = dev->ethtool_ops->get_flags(dev);
1240                 if (flags & ETH_FLAG_LRO) {
1241                         flags &= ~ETH_FLAG_LRO;
1242                         dev->ethtool_ops->set_flags(dev, flags);
1243                 }
1244         }
1245         WARN_ON(dev->features & NETIF_F_LRO);
1246 }
1247 EXPORT_SYMBOL(dev_disable_lro);
1248
1249
/*
 *      Initialised to 1. While it is non-zero,
 *      register_netdevice_notifier() links a new notifier into the chain
 *      but does not replay NETDEV_REGISTER/NETDEV_UP for existing devices.
 *      NOTE(review): presumably cleared once boot-time network init has
 *      completed - confirm where it is reset.
 */
static int dev_boot_phase = 1;
1251
1252 /*
1253  *      Device change register/unregister. These are not inline or static
1254  *      as we export them to the world.
1255  */
1256
1257 /**
1258  *      register_netdevice_notifier - register a network notifier block
1259  *      @nb: notifier
1260  *
1261  *      Register a notifier to be called when network device events occur.
1262  *      The notifier passed is linked into the kernel structures and must
1263  *      not be reused until it has been unregistered. A negative errno code
1264  *      is returned on a failure.
1265  *
1266  *      When registered all registration and up events are replayed
1267  *      to the new notifier to allow device to have a race free
1268  *      view of the network device list.
1269  */
1270
1271 int register_netdevice_notifier(struct notifier_block *nb)
1272 {
1273         struct net_device *dev;
1274         struct net_device *last;
1275         struct net *net;
1276         int err;
1277
1278         rtnl_lock();
1279         err = raw_notifier_chain_register(&netdev_chain, nb);
1280         if (err)
1281                 goto unlock;
1282         if (dev_boot_phase)
1283                 goto unlock;
1284         for_each_net(net) {
1285                 for_each_netdev(net, dev) {
1286                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1287                         err = notifier_to_errno(err);
1288                         if (err)
1289                                 goto rollback;
1290
1291                         if (!(dev->flags & IFF_UP))
1292                                 continue;
1293
1294                         nb->notifier_call(nb, NETDEV_UP, dev);
1295                 }
1296         }
1297
1298 unlock:
1299         rtnl_unlock();
1300         return err;
1301
1302 rollback:
1303         last = dev;
1304         for_each_net(net) {
1305                 for_each_netdev(net, dev) {
1306                         if (dev == last)
1307                                 break;
1308
1309                         if (dev->flags & IFF_UP) {
1310                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1311                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1312                         }
1313                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1314                 }
1315         }
1316
1317         raw_notifier_chain_unregister(&netdev_chain, nb);
1318         goto unlock;
1319 }
1320 EXPORT_SYMBOL(register_netdevice_notifier);
1321
1322 /**
1323  *      unregister_netdevice_notifier - unregister a network notifier block
1324  *      @nb: notifier
1325  *
1326  *      Unregister a notifier previously registered by
1327  *      register_netdevice_notifier(). The notifier is unlinked into the
1328  *      kernel structures and may then be reused. A negative errno code
1329  *      is returned on a failure.
1330  */
1331
1332 int unregister_netdevice_notifier(struct notifier_block *nb)
1333 {
1334         int err;
1335
1336         rtnl_lock();
1337         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1338         rtnl_unlock();
1339         return err;
1340 }
1341 EXPORT_SYMBOL(unregister_netdevice_notifier);
1342
/**
 *      call_netdevice_notifiers - call all network notifier blocks
 *      @val: value passed unmodified to notifier function
 *      @dev: net_device pointer passed unmodified to notifier function
 *
 *      Call all network notifier blocks registered on netdev_chain.
 *      Parameters and return value are as for raw_notifier_call_chain();
 *      callers in this file convert the result with notifier_to_errno()
 *      when they need an errno.
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
        return raw_notifier_call_chain(&netdev_chain, val, dev);
}
1356
/*
 * When > 0 there are consumers of rx skb time stamps.
 * net_timestamp() only stamps an skb while this counter is non-zero.
 */
static atomic_t netstamp_needed = ATOMIC_INIT(0);

/* Add one consumer of skb time stamps (increments netstamp_needed). */
void net_enable_timestamp(void)
{
        atomic_inc(&netstamp_needed);
}
EXPORT_SYMBOL(net_enable_timestamp);

/* Drop one consumer of skb time stamps (decrements netstamp_needed). */
void net_disable_timestamp(void)
{
        atomic_dec(&netstamp_needed);
}
EXPORT_SYMBOL(net_disable_timestamp);
1371
1372 static inline void net_timestamp(struct sk_buff *skb)
1373 {
1374         if (atomic_read(&netstamp_needed))
1375                 __net_timestamp(skb);
1376         else
1377                 skb->tstamp.tv64 = 0;
1378 }
1379
1380 /*
1381  *      Support routine. Sends outgoing frames to any network
1382  *      taps currently in use.
1383  */
1384
1385 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1386 {
1387         struct packet_type *ptype;
1388
1389 #ifdef CONFIG_NET_CLS_ACT
1390         if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
1391                 net_timestamp(skb);
1392 #else
1393         net_timestamp(skb);
1394 #endif
1395
1396         rcu_read_lock();
1397         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1398                 /* Never send packets back to the socket
1399                  * they originated from - MvS (miquels@drinkel.ow.org)
1400                  */
1401                 if ((ptype->dev == dev || !ptype->dev) &&
1402                     (ptype->af_packet_priv == NULL ||
1403                      (struct sock *)ptype->af_packet_priv != skb->sk)) {
1404                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1405                         if (!skb2)
1406                                 break;
1407
1408                         /* skb->nh should be correctly
1409                            set by sender, so that the second statement is
1410                            just protection against buggy protocols.
1411                          */
1412                         skb_reset_mac_header(skb2);
1413
1414                         if (skb_network_header(skb2) < skb2->data ||
1415                             skb2->network_header > skb2->tail) {
1416                                 if (net_ratelimit())
1417                                         printk(KERN_CRIT "protocol %04x is "
1418                                                "buggy, dev %s\n",
1419                                                skb2->protocol, dev->name);
1420                                 skb_reset_network_header(skb2);
1421                         }
1422
1423                         skb2->transport_header = skb2->network_header;
1424                         skb2->pkt_type = PACKET_OUTGOING;
1425                         ptype->func(skb2, skb->dev, ptype, skb->dev);
1426                 }
1427         }
1428         rcu_read_unlock();
1429 }
1430
1431
1432 static inline void __netif_reschedule(struct Qdisc *q)
1433 {
1434         struct softnet_data *sd;
1435         unsigned long flags;
1436
1437         local_irq_save(flags);
1438         sd = &__get_cpu_var(softnet_data);
1439         q->next_sched = sd->output_queue;
1440         sd->output_queue = q;
1441         raise_softirq_irqoff(NET_TX_SOFTIRQ);
1442         local_irq_restore(flags);
1443 }
1444
1445 void __netif_schedule(struct Qdisc *q)
1446 {
1447         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1448                 __netif_reschedule(q);
1449 }
1450 EXPORT_SYMBOL(__netif_schedule);
1451
1452 void dev_kfree_skb_irq(struct sk_buff *skb)
1453 {
1454         if (atomic_dec_and_test(&skb->users)) {
1455                 struct softnet_data *sd;
1456                 unsigned long flags;
1457
1458                 local_irq_save(flags);
1459                 sd = &__get_cpu_var(softnet_data);
1460                 skb->next = sd->completion_queue;
1461                 sd->completion_queue = skb;
1462                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1463                 local_irq_restore(flags);
1464         }
1465 }
1466 EXPORT_SYMBOL(dev_kfree_skb_irq);
1467
/*
 *      Free @skb from any context: dev_kfree_skb() is used when neither
 *      in hard interrupt context nor with interrupts disabled, and
 *      dev_kfree_skb_irq() otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
        if (!in_irq() && !irqs_disabled())
                dev_kfree_skb(skb);
        else
                dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1476
1477
1478 /**
1479  * netif_device_detach - mark device as removed
1480  * @dev: network device
1481  *
1482  * Mark device as removed from system and therefore no longer available.
1483  */
1484 void netif_device_detach(struct net_device *dev)
1485 {
1486         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1487             netif_running(dev)) {
1488                 netif_tx_stop_all_queues(dev);
1489         }
1490 }
1491 EXPORT_SYMBOL(netif_device_detach);
1492
1493 /**
1494  * netif_device_attach - mark device as attached
1495  * @dev: network device
1496  *
1497  * Mark device as attached from system and restart if needed.
1498  */
1499 void netif_device_attach(struct net_device *dev)
1500 {
1501         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1502             netif_running(dev)) {
1503                 netif_tx_wake_all_queues(dev);
1504                 __netdev_watchdog_up(dev);
1505         }
1506 }
1507 EXPORT_SYMBOL(netif_device_attach);
1508
1509 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1510 {
1511         return ((features & NETIF_F_GEN_CSUM) ||
1512                 ((features & NETIF_F_IP_CSUM) &&
1513                  protocol == htons(ETH_P_IP)) ||
1514                 ((features & NETIF_F_IPV6_CSUM) &&
1515                  protocol == htons(ETH_P_IPV6)) ||
1516                 ((features & NETIF_F_FCOE_CRC) &&
1517                  protocol == htons(ETH_P_FCOE)));
1518 }
1519
1520 static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1521 {
1522         if (can_checksum_protocol(dev->features, skb->protocol))
1523                 return true;
1524
1525         if (skb->protocol == htons(ETH_P_8021Q)) {
1526                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1527                 if (can_checksum_protocol(dev->features & dev->vlan_features,
1528                                           veh->h_vlan_encapsulated_proto))
1529                         return true;
1530         }
1531
1532         return false;
1533 }
1534
1535 /*
1536  * Invalidate hardware checksum when packet is to be mangled, and
1537  * complete checksum manually on outgoing path.
1538  */
1539 int skb_checksum_help(struct sk_buff *skb)
1540 {
1541         __wsum csum;
1542         int ret = 0, offset;
1543
1544         if (skb->ip_summed == CHECKSUM_COMPLETE)
1545                 goto out_set_summed;
1546
1547         if (unlikely(skb_shinfo(skb)->gso_size)) {
1548                 /* Let GSO fix up the checksum. */
1549                 goto out_set_summed;
1550         }
1551
1552         offset = skb->csum_start - skb_headroom(skb);
1553         BUG_ON(offset >= skb_headlen(skb));
1554         csum = skb_checksum(skb, offset, skb->len - offset, 0);
1555
1556         offset += skb->csum_offset;
1557         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1558
1559         if (skb_cloned(skb) &&
1560             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1561                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1562                 if (ret)
1563                         goto out;
1564         }
1565
1566         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1567 out_set_summed:
1568         skb->ip_summed = CHECKSUM_NONE;
1569 out:
1570         return ret;
1571 }
1572 EXPORT_SYMBOL(skb_checksum_help);
1573
/**
 *      skb_gso_segment - Perform segmentation on skb.
 *      @skb: buffer to segment
 *      @features: features for the output path (see dev->features)
 *
 *      This function segments the given skb and returns a list of segments.
 *
 *      It may return NULL if the skb requires no segmentation.  This is
 *      only possible when GSO is used for verifying header integrity.
 *
 *      On failure an ERR_PTR() is returned: -EPROTONOSUPPORT when no
 *      registered packet type offers a gso_segment handler for the skb's
 *      protocol, otherwise the error reported by pskb_expand_head(),
 *      gso_send_check() or the handler itself.
 */
struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
{
        struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
        struct packet_type *ptype;
        __be16 type = skb->protocol;
        int err;

        /* Record the MAC header length and step over the MAC header */
        skb_reset_mac_header(skb);
        skb->mac_len = skb->network_header - skb->mac_header;
        __skb_pull(skb, skb->mac_len);

        if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
                struct net_device *dev = skb->dev;
                struct ethtool_drvinfo info = {};

                /* Fetch the driver name, if available, for the warning */
                if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
                        dev->ethtool_ops->get_drvinfo(dev, &info);

                WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
                        "ip_summed=%d",
                     info.driver, dev ? dev->features : 0L,
                     skb->sk ? skb->sk->sk_route_caps : 0L,
                     skb->len, skb->data_len, skb->ip_summed);

                /* Take a private copy of a cloned header before going on */
                if (skb_header_cloned(skb) &&
                    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
                        return ERR_PTR(err);
        }

        rcu_read_lock();
        /* Only the first matching device-independent handler is used */
        list_for_each_entry_rcu(ptype,
                        &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
                if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
                        if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
                                err = ptype->gso_send_check(skb);
                                /* ERR_PTR(0) is NULL: "no segmentation" */
                                segs = ERR_PTR(err);
                                if (err || skb_gso_ok(skb, features))
                                        break;
                                /* Move skb->data back to the network header */
                                __skb_push(skb, (skb->data -
                                                 skb_network_header(skb)));
                        }
                        segs = ptype->gso_segment(skb, features);
                        break;
                }
        }
        rcu_read_unlock();

        /* Put skb->data back at the MAC header */
        __skb_push(skb, skb->data - skb_mac_header(skb));

        return segs;
}
EXPORT_SYMBOL(skb_gso_segment);
1636
/*
 * Take action when hardware reception checksum errors are detected:
 * subject to rate limiting, log the name of the device (or "<unknown>"
 * when @dev is NULL) and dump the stack.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
        const char *ifname;

        if (!net_ratelimit())
                return;

        ifname = dev ? dev->name : "<unknown>";
        printk(KERN_ERR "%s: hw csum failure.\n", ifname);
        dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
1649
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 when CONFIG_HIGHMEM is enabled, the device lacks
 * NETIF_F_HIGHDMA and at least one page fragment of the skb lives in
 * high memory; returns 0 otherwise.
 */

static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
        if (!(dev->features & NETIF_F_HIGHDMA)) {
                int frag;

                for (frag = 0; frag < skb_shinfo(skb)->nr_frags; frag++) {
                        if (PageHighMem(skb_shinfo(skb)->frags[frag].page))
                                return 1;
                }
        }
#endif
        return 0;
}
1670
/*
 * Private data kept in the skb control buffer while an skb carries a
 * list of GSO segments: dev_gso_segment() saves the skb's original
 * destructor here and dev_gso_skb_destructor() calls it afterwards.
 */
struct dev_gso_cb {
        void (*destructor)(struct sk_buff *skb);
};

/* Access the control buffer of @skb as a struct dev_gso_cb */
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1676
1677 static void dev_gso_skb_destructor(struct sk_buff *skb)
1678 {
1679         struct dev_gso_cb *cb;
1680
1681         do {
1682                 struct sk_buff *nskb = skb->next;
1683
1684                 skb->next = nskb->next;
1685                 nskb->next = NULL;
1686                 kfree_skb(nskb);
1687         } while (skb->next);
1688
1689         cb = DEV_GSO_CB(skb);
1690         if (cb->destructor)
1691                 cb->destructor(skb);
1692 }
1693
1694 /**
1695  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
1696  *      @skb: buffer to segment
1697  *
1698  *      This function segments the given skb and stores the list of segments
1699  *      in skb->next.
1700  */
1701 static int dev_gso_segment(struct sk_buff *skb)
1702 {
1703         struct net_device *dev = skb->dev;
1704         struct sk_buff *segs;
1705         int features = dev->features & ~(illegal_highdma(dev, skb) ?
1706                                          NETIF_F_SG : 0);
1707
1708         segs = skb_gso_segment(skb, features);
1709
1710         /* Verifying header integrity only. */
1711         if (!segs)
1712                 return 0;
1713
1714         if (IS_ERR(segs))
1715                 return PTR_ERR(segs);
1716
1717         skb->next = segs;
1718         DEV_GSO_CB(skb)->destructor = skb->destructor;
1719         skb->destructor = dev_gso_skb_destructor;
1720
1721         return 0;
1722 }
1723
1724 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1725                         struct netdev_queue *txq)
1726 {
1727         const struct net_device_ops *ops = dev->netdev_ops;
1728         int rc;
1729
1730         if (likely(!skb->next)) {
1731                 if (!list_empty(&ptype_all))
1732                         dev_queue_xmit_nit(skb, dev);
1733
1734                 if (netif_needs_gso(dev, skb)) {
1735                         if (unlikely(dev_gso_segment(skb)))
1736                                 goto out_kfree_skb;
1737                         if (skb->next)
1738                                 goto gso;
1739                 }
1740
1741                 /*
1742                  * If device doesnt need skb->dst, release it right now while
1743                  * its hot in this cpu cache
1744                  */
1745                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1746                         skb_dst_drop(skb);
1747
1748                 rc = ops->ndo_start_xmit(skb, dev);
1749                 if (rc == NETDEV_TX_OK)
1750                         txq_trans_update(txq);
1751                 /*
1752                  * TODO: if skb_orphan() was called by
1753                  * dev->hard_start_xmit() (for example, the unmodified
1754                  * igb driver does that; bnx2 doesn't), then
1755                  * skb_tx_software_timestamp() will be unable to send
1756                  * back the time stamp.
1757                  *
1758                  * How can this be prevented? Always create another
1759                  * reference to the socket before calling
1760                  * dev->hard_start_xmit()? Prevent that skb_orphan()
1761                  * does anything in dev->hard_start_xmit() by clearing
1762                  * the skb destructor before the call and restoring it
1763                  * afterwards, then doing the skb_orphan() ourselves?
1764                  */
1765                 return rc;
1766         }
1767
1768 gso:
1769         do {
1770                 struct sk_buff *nskb = skb->next;
1771
1772                 skb->next = nskb->next;
1773                 nskb->next = NULL;
1774                 rc = ops->ndo_start_xmit(nskb, dev);
1775                 if (unlikely(rc != NETDEV_TX_OK)) {
1776                         nskb->next = skb->next;
1777                         skb->next = nskb;
1778                         return rc;
1779                 }
1780                 txq_trans_update(txq);
1781                 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
1782                         return NETDEV_TX_BUSY;
1783         } while (skb->next);
1784
1785         skb->destructor = DEV_GSO_CB(skb)->destructor;
1786
1787 out_kfree_skb:
1788         kfree_skb(skb);
1789         return NETDEV_TX_OK;
1790 }
1791
1792 static u32 skb_tx_hashrnd;
1793
1794 u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
1795 {
1796         u32 hash;
1797
1798         if (skb_rx_queue_recorded(skb)) {
1799                 hash = skb_get_rx_queue(skb);
1800                 while (unlikely(hash >= dev->real_num_tx_queues))
1801                         hash -= dev->real_num_tx_queues;
1802                 return hash;
1803         }
1804
1805         if (skb->sk && skb->sk->sk_hash)
1806                 hash = skb->sk->sk_hash;
1807         else
1808                 hash = skb->protocol;
1809
1810         hash = jhash_1word(hash, skb_tx_hashrnd);
1811
1812         return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
1813 }
1814 EXPORT_SYMBOL(skb_tx_hash);
1815
1816 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
1817                                         struct sk_buff *skb)
1818 {
1819         u16 queue_index;
1820         struct sock *sk = skb->sk;
1821
1822         if (sk_tx_queue_recorded(sk)) {
1823                 queue_index = sk_tx_queue_get(sk);
1824         } else {
1825                 const struct net_device_ops *ops = dev->netdev_ops;
1826
1827                 if (ops->ndo_select_queue) {
1828                         queue_index = ops->ndo_select_queue(dev, skb);
1829                 } else {
1830                         queue_index = 0;
1831                         if (dev->real_num_tx_queues > 1)
1832                                 queue_index = skb_tx_hash(dev, skb);
1833
1834                         if (sk && sk->sk_dst_cache)
1835                                 sk_tx_queue_set(sk, queue_index);
1836                 }
1837         }
1838
1839         skb_set_queue_mapping(skb, queue_index);
1840         return netdev_get_tx_queue(dev, queue_index);
1841 }
1842
1843 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
1844                                  struct net_device *dev,
1845                                  struct netdev_queue *txq)
1846 {
1847         spinlock_t *root_lock = qdisc_lock(q);
1848         int rc;
1849
1850         spin_lock(root_lock);
1851         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
1852                 kfree_skb(skb);
1853                 rc = NET_XMIT_DROP;
1854         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
1855                    !test_and_set_bit(__QDISC_STATE_RUNNING, &q->state)) {
1856                 /*
1857                  * This is a work-conserving queue; there are no old skbs
1858                  * waiting to be sent out; and the qdisc is not running -
1859                  * xmit the skb directly.
1860                  */
1861                 __qdisc_update_bstats(q, skb->len);
1862                 if (sch_direct_xmit(skb, q, dev, txq, root_lock))
1863                         __qdisc_run(q);
1864                 else
1865                         clear_bit(__QDISC_STATE_RUNNING, &q->state);
1866
1867                 rc = NET_XMIT_SUCCESS;
1868         } else {
1869                 rc = qdisc_enqueue_root(skb, q);
1870                 qdisc_run(q);
1871         }
1872         spin_unlock(root_lock);
1873
1874         return rc;
1875 }
1876
1877 /**
1878  *      dev_queue_xmit - transmit a buffer
1879  *      @skb: buffer to transmit
1880  *
1881  *      Queue a buffer for transmission to a network device. The caller must
1882  *      have set the device and priority and built the buffer before calling
1883  *      this function. The function can be called from an interrupt.
1884  *
1885  *      A negative errno code is returned on a failure. A success does not
1886  *      guarantee the frame will be transmitted as it may be dropped due
1887  *      to congestion or traffic shaping.
1888  *
1889  * -----------------------------------------------------------------------------------
1890  *      I notice this method can also return errors from the queue disciplines,
1891  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
1892  *      be positive.
1893  *
1894  *      Regardless of the return value, the skb is consumed, so it is currently
1895  *      difficult to retry a send to this method.  (You can bump the ref count
1896  *      before sending to hold a reference for retry if you are careful.)
1897  *
1898  *      When calling this method, interrupts MUST be enabled.  This is because
1899  *      the BH enable code must have IRQs enabled so that it will not deadlock.
1900  *          --BLG
1901  */
1902 int dev_queue_xmit(struct sk_buff *skb)
1903 {
1904         struct net_device *dev = skb->dev;
1905         struct netdev_queue *txq;
1906         struct Qdisc *q;
1907         int rc = -ENOMEM;
1908
1909         /* GSO will handle the following emulations directly. */
1910         if (netif_needs_gso(dev, skb))
1911                 goto gso;
1912
1913         if (skb_has_frags(skb) &&
1914             !(dev->features & NETIF_F_FRAGLIST) &&
1915             __skb_linearize(skb))
1916                 goto out_kfree_skb;
1917
1918         /* Fragmented skb is linearized if device does not support SG,
1919          * or if at least one of fragments is in highmem and device
1920          * does not support DMA from it.
1921          */
1922         if (skb_shinfo(skb)->nr_frags &&
1923             (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
1924             __skb_linearize(skb))
1925                 goto out_kfree_skb;
1926
1927         /* If packet is not checksummed and device does not support
1928          * checksumming for this protocol, complete checksumming here.
1929          */
1930         if (skb->ip_summed == CHECKSUM_PARTIAL) {
1931                 skb_set_transport_header(skb, skb->csum_start -
1932                                               skb_headroom(skb));
1933                 if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
1934                         goto out_kfree_skb;
1935         }
1936
1937 gso:
1938         /* Disable soft irqs for various locks below. Also
1939          * stops preemption for RCU.
1940          */
1941         rcu_read_lock_bh();
1942
1943         txq = dev_pick_tx(dev, skb);
1944         q = rcu_dereference(txq->qdisc);
1945
1946 #ifdef CONFIG_NET_CLS_ACT
1947         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
1948 #endif
1949         if (q->enqueue) {
1950                 rc = __dev_xmit_skb(skb, q, dev, txq);
1951                 goto out;
1952         }
1953
1954         /* The device has no queue. Common case for software devices:
1955            loopback, all the sorts of tunnels...
1956
1957            Really, it is unlikely that netif_tx_lock protection is necessary
1958            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
1959            counters.)
1960            However, it is possible, that they rely on protection
1961            made by us here.
1962
1963            Check this and shot the lock. It is not prone from deadlocks.
1964            Either shot noqueue qdisc, it is even simpler 8)
1965          */
1966         if (dev->flags & IFF_UP) {
1967                 int cpu = smp_processor_id(); /* ok because BHs are off */
1968
1969                 if (txq->xmit_lock_owner != cpu) {
1970
1971                         HARD_TX_LOCK(dev, txq, cpu);
1972
1973                         if (!netif_tx_queue_stopped(txq)) {
1974                                 rc = NET_XMIT_SUCCESS;
1975                                 if (!dev_hard_start_xmit(skb, dev, txq)) {
1976                                         HARD_TX_UNLOCK(dev, txq);
1977                                         goto out;
1978                                 }
1979                         }
1980                         HARD_TX_UNLOCK(dev, txq);
1981                         if (net_ratelimit())
1982                                 printk(KERN_CRIT "Virtual device %s asks to "
1983                                        "queue packet!\n", dev->name);
1984                 } else {
1985                         /* Recursion is detected! It is possible,
1986                          * unfortunately */
1987                         if (net_ratelimit())
1988                                 printk(KERN_CRIT "Dead loop on virtual device "
1989                                        "%s, fix it urgently!\n", dev->name);
1990                 }
1991         }
1992
1993         rc = -ENETDOWN;
1994         rcu_read_unlock_bh();
1995
1996 out_kfree_skb:
1997         kfree_skb(skb);
1998         return rc;
1999 out:
2000         rcu_read_unlock_bh();
2001         return rc;
2002 }
2003 EXPORT_SYMBOL(dev_queue_xmit);
2004
2005
2006 /*=======================================================================
2007                         Receiver routines
2008   =======================================================================*/
2009
2010 int netdev_max_backlog __read_mostly = 1000;
2011 int netdev_budget __read_mostly = 300;
2012 int weight_p __read_mostly = 64;            /* old backlog weight */
2013
2014 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
2015
2016
2017 /**
2018  *      netif_rx        -       post buffer to the network code
2019  *      @skb: buffer to post
2020  *
2021  *      This function receives a packet from a device driver and queues it for
2022  *      the upper (protocol) levels to process.  It always succeeds. The buffer
2023  *      may be dropped during processing for congestion control or by the
2024  *      protocol layers.
2025  *
2026  *      return values:
2027  *      NET_RX_SUCCESS  (no congestion)
2028  *      NET_RX_DROP     (packet was dropped)
2029  *
2030  */
2031
2032 int netif_rx(struct sk_buff *skb)
2033 {
2034         struct softnet_data *queue;
2035         unsigned long flags;
2036
2037         /* if netpoll wants it, pretend we never saw it */
2038         if (netpoll_rx(skb))
2039                 return NET_RX_DROP;
2040
2041         if (!skb->tstamp.tv64)
2042                 net_timestamp(skb);
2043
2044         /*
2045          * The code is rearranged so that the path is the most
2046          * short when CPU is congested, but is still operating.
2047          */
2048         local_irq_save(flags);
2049         queue = &__get_cpu_var(softnet_data);
2050
2051         __get_cpu_var(netdev_rx_stat).total++;
2052         if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
2053                 if (queue->input_pkt_queue.qlen) {
2054 enqueue:
2055                         __skb_queue_tail(&queue->input_pkt_queue, skb);
2056                         local_irq_restore(flags);
2057                         return NET_RX_SUCCESS;
2058                 }
2059
2060                 napi_schedule(&queue->backlog);
2061                 goto enqueue;
2062         }
2063
2064         __get_cpu_var(netdev_rx_stat).dropped++;
2065         local_irq_restore(flags);
2066
2067         kfree_skb(skb);
2068         return NET_RX_DROP;
2069 }
2070 EXPORT_SYMBOL(netif_rx);
2071
/*
 * Variant of netif_rx() that, with preemption disabled, posts @skb and
 * then runs any softirqs that are pending on this CPU before preemption
 * is enabled again.  Returns the result of netif_rx().
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	preempt_disable();

	ret = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return ret;
}
EXPORT_SYMBOL(netif_rx_ni);
2085
2086 static void net_tx_action(struct softirq_action *h)
2087 {
2088         struct softnet_data *sd = &__get_cpu_var(softnet_data);
2089
2090         if (sd->completion_queue) {
2091                 struct sk_buff *clist;
2092
2093                 local_irq_disable();
2094                 clist = sd->completion_queue;
2095                 sd->completion_queue = NULL;
2096                 local_irq_enable();
2097
2098                 while (clist) {
2099                         struct sk_buff *skb = clist;
2100                         clist = clist->next;
2101
2102                         WARN_ON(atomic_read(&skb->users));
2103                         __kfree_skb(skb);
2104                 }
2105         }
2106
2107         if (sd->output_queue) {
2108                 struct Qdisc *head;
2109
2110                 local_irq_disable();
2111                 head = sd->output_queue;
2112                 sd->output_queue = NULL;
2113                 local_irq_enable();
2114
2115                 while (head) {
2116                         struct Qdisc *q = head;
2117                         spinlock_t *root_lock;
2118
2119                         head = head->next_sched;
2120
2121                         root_lock = qdisc_lock(q);
2122                         if (spin_trylock(root_lock)) {
2123                                 smp_mb__before_clear_bit();
2124                                 clear_bit(__QDISC_STATE_SCHED,
2125                                           &q->state);
2126                                 qdisc_run(q);
2127                                 spin_unlock(root_lock);
2128                         } else {
2129                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
2130                                               &q->state)) {
2131                                         __netif_reschedule(q);
2132                                 } else {
2133                                         smp_mb__before_clear_bit();
2134                                         clear_bit(__QDISC_STATE_SCHED,
2135                                                   &q->state);
2136                                 }
2137                         }
2138                 }
2139         }
2140 }
2141
2142 static inline int deliver_skb(struct sk_buff *skb,
2143                               struct packet_type *pt_prev,
2144                               struct net_device *orig_dev)
2145 {
2146         atomic_inc(&skb->users);
2147         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2148 }
2149
#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)

#if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
/* Hook pointer defined here on behalf of ATM LANE. */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif

/*
 * Bridging hook, called for frames arriving on a bridge port.
 * It returns NULL if the packet was consumed.
 */
struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
					struct sk_buff *skb) __read_mostly;
EXPORT_SYMBOL_GPL(br_handle_frame_hook);

/*
 * Give the bridge a chance to take @skb.
 *
 * Loopback packets and packets whose device is not a bridge port are
 * returned untouched.  Otherwise a pending handler in *@pt_prev is
 * delivered first (its result goes to *@ret and *@pt_prev is cleared) and
 * the skb is passed to br_handle_frame_hook(), whose return value is
 * returned.
 */
static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
					    struct packet_type **pt_prev, int *ret,
					    struct net_device *orig_dev)
{
	struct net_bridge_port *port;

	if (skb->pkt_type == PACKET_LOOPBACK)
		return skb;

	port = rcu_dereference(skb->dev->br_port);
	if (port == NULL)
		return skb;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	return br_handle_frame_hook(port, skb);
}
#else
#define handle_bridge(skb, pt_prev, ret, orig_dev)	(skb)
#endif
2187
#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
/* Hook called for frames arriving on a device that has a macvlan port. */
struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);

/*
 * Give macvlan a chance to take @skb.
 *
 * If the receiving device has no macvlan port the skb is returned
 * untouched.  Otherwise a pending handler in *@pt_prev is delivered first
 * (its result goes to *@ret and *@pt_prev is cleared) and the return
 * value of macvlan_handle_frame_hook() is returned.
 */
static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
					     struct packet_type **pt_prev,
					     int *ret,
					     struct net_device *orig_dev)
{
	if (!skb->dev->macvlan_port)
		return skb;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	return macvlan_handle_frame_hook(skb);
}
#else
#define handle_macvlan(skb, pt_prev, ret, orig_dev)	(skb)
#endif
2209
2210 #ifdef CONFIG_NET_CLS_ACT
/* TODO: Maybe we should just force sch_ingress to be compiled in
 * when CONFIG_NET_CLS_ACT is? Otherwise we currently execute some useless
 * instructions (a compare and 2 extra stores) when sch_ingress is not
 * built but CONFIG_NET_CLS_ACT is.
 * NOTE: This doesn't stop any functionality; if you don't have
 * the ingress scheduler, you just can't add policies on ingress.
 *
 */
2219 static int ing_filter(struct sk_buff *skb)
2220 {
2221         struct net_device *dev = skb->dev;
2222         u32 ttl = G_TC_RTTL(skb->tc_verd);
2223         struct netdev_queue *rxq;
2224         int result = TC_ACT_OK;
2225         struct Qdisc *q;
2226
2227         if (MAX_RED_LOOP < ttl++) {
2228                 printk(KERN_WARNING
2229                        "Redir loop detected Dropping packet (%d->%d)\n",
2230                        skb->iif, dev->ifindex);
2231                 return TC_ACT_SHOT;
2232         }
2233
2234         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2235         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2236
2237         rxq = &dev->rx_queue;
2238
2239         q = rxq->qdisc;
2240         if (q != &noop_qdisc) {
2241                 spin_lock(qdisc_lock(q));
2242                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2243                         result = qdisc_enqueue_root(skb, q);
2244                 spin_unlock(qdisc_lock(q));
2245         }
2246
2247         return result;
2248 }
2249
2250 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2251                                          struct packet_type **pt_prev,
2252                                          int *ret, struct net_device *orig_dev)
2253 {
2254         if (skb->dev->rx_queue.qdisc == &noop_qdisc)
2255                 goto out;
2256
2257         if (*pt_prev) {
2258                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2259                 *pt_prev = NULL;
2260         } else {
2261                 /* Huh? Why does turning on AF_PACKET affect this? */
2262                 skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2263         }
2264
2265         switch (ing_filter(skb)) {
2266         case TC_ACT_SHOT:
2267         case TC_ACT_STOLEN:
2268                 kfree_skb(skb);
2269                 return NULL;
2270         }
2271
2272 out:
2273         skb->tc_verd = 0;
2274         return skb;
2275 }
2276 #endif
2277
2278 /*
2279  *      netif_nit_deliver - deliver received packets to network taps
2280  *      @skb: buffer
2281  *
2282  *      This function is used to deliver incoming packets to network
2283  *      taps. It should be used when the normal netif_receive_skb path
2284  *      is bypassed, for example because of VLAN acceleration.
2285  */
2286 void netif_nit_deliver(struct sk_buff *skb)
2287 {
2288         struct packet_type *ptype;
2289
2290         if (list_empty(&ptype_all))
2291                 return;
2292
2293         skb_reset_network_header(skb);
2294         skb_reset_transport_header(skb);
2295         skb->mac_len = skb->network_header - skb->mac_header;
2296
2297         rcu_read_lock();
2298         list_for_each_entry_rcu(ptype, &ptype_all, list) {
2299                 if (!ptype->dev || ptype->dev == skb->dev)
2300                         deliver_skb(skb, ptype, skb->dev);
2301         }
2302         rcu_read_unlock();
2303 }
2304
2305 /**
2306  *      netif_receive_skb - process receive buffer from network
2307  *      @skb: buffer to process
2308  *
2309  *      netif_receive_skb() is the main receive data processing function.
2310  *      It always succeeds. The buffer may be dropped during processing
2311  *      for congestion control or by the protocol layers.
2312  *
2313  *      This function may only be called from softirq context and interrupts
2314  *      should be enabled.
2315  *
2316  *      Return values (usually ignored):
2317  *      NET_RX_SUCCESS: no congestion
2318  *      NET_RX_DROP: packet was dropped
2319  */
2320 int netif_receive_skb(struct sk_buff *skb)
2321 {
2322         struct packet_type *ptype, *pt_prev;
2323         struct net_device *orig_dev;
2324         struct net_device *null_or_orig;
2325         int ret = NET_RX_DROP;
2326         __be16 type;
2327
2328         if (!skb->tstamp.tv64)
2329                 net_timestamp(skb);
2330
2331         if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
2332                 return NET_RX_SUCCESS;
2333
2334         /* if we've gotten here through NAPI, check netpoll */
2335         if (netpoll_receive_skb(skb))
2336                 return NET_RX_DROP;
2337
2338         if (!skb->iif)
2339                 skb->iif = skb->dev->ifindex;
2340
2341         null_or_orig = NULL;
2342         orig_dev = skb->dev;
2343         if (orig_dev->master) {
2344                 if (skb_bond_should_drop(skb))
2345                         null_or_orig = orig_dev; /* deliver only exact match */
2346                 else
2347                         skb->dev = orig_dev->master;
2348         }
2349
2350         __get_cpu_var(netdev_rx_stat).total++;
2351
2352         skb_reset_network_header(skb);
2353         skb_reset_transport_header(skb);
2354         skb->mac_len = skb->network_header - skb->mac_header;
2355
2356         pt_prev = NULL;
2357
2358         rcu_read_lock();
2359
2360 #ifdef CONFIG_NET_CLS_ACT
2361         if (skb->tc_verd & TC_NCLS) {
2362                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2363                 goto ncls;
2364         }
2365 #endif
2366
2367         list_for_each_entry_rcu(ptype, &ptype_all, list) {
2368                 if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2369                     ptype->dev == orig_dev) {
2370                         if (pt_prev)
2371                                 ret = deliver_skb(skb, pt_prev, orig_dev);
2372                         pt_prev = ptype;
2373                 }
2374         }
2375
2376 #ifdef CONFIG_NET_CLS_ACT
2377         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2378         if (!skb)
2379                 goto out;
2380 ncls:
2381 #endif
2382
2383         skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
2384         if (!skb)
2385                 goto out;
2386         skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
2387         if (!skb)
2388                 goto out;
2389
2390         type = skb->protocol;
2391         list_for_each_entry_rcu(ptype,
2392                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2393                 if (ptype->type == type &&
2394                     (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2395                      ptype->dev == orig_dev)) {
2396                         if (pt_prev)
2397                                 ret = deliver_skb(skb, pt_prev, orig_dev);
2398                         pt_prev = ptype;
2399                 }
2400         }
2401
2402         if (pt_prev) {
2403                 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2404         } else {
2405                 kfree_skb(skb);
2406                 /* Jamal, now you will not able to escape explaining
2407                  * me how you were going to use this. :-)
2408                  */
2409                 ret = NET_RX_DROP;
2410         }
2411
2412 out:
2413         rcu_read_unlock();
2414         return ret;
2415 }
2416 EXPORT_SYMBOL(netif_receive_skb);
2417
2418 /* Network device is going away, flush any packets still pending  */
2419 static void flush_backlog(void *arg)
2420 {
2421         struct net_device *dev = arg;
2422         struct softnet_data *queue = &__get_cpu_var(softnet_data);
2423         struct sk_buff *skb, *tmp;
2424
2425         skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp)
2426                 if (skb->dev == dev) {
2427                         __skb_unlink(skb, &queue->input_pkt_queue);
2428                         kfree_skb(skb);
2429                 }
2430 }
2431
2432 static int napi_gro_complete(struct sk_buff *skb)
2433 {
2434         struct packet_type *ptype;
2435         __be16 type = skb->protocol;
2436         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2437         int err = -ENOENT;
2438
2439         if (NAPI_GRO_CB(skb)->count == 1) {
2440                 skb_shinfo(skb)->gso_size = 0;
2441                 goto out;
2442         }
2443
2444         rcu_read_lock();
2445         list_for_each_entry_rcu(ptype, head, list) {
2446                 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
2447                         continue;
2448
2449                 err = ptype->gro_complete(skb);
2450                 break;
2451         }
2452         rcu_read_unlock();
2453
2454         if (err) {
2455                 WARN_ON(&ptype->list == head);
2456                 kfree_skb(skb);
2457                 return NET_RX_SUCCESS;
2458         }
2459
2460 out:
2461         return netif_receive_skb(skb);
2462 }
2463
2464 void napi_gro_flush(struct napi_struct *napi)
2465 {
2466         struct sk_buff *skb, *next;
2467
2468         for (skb = napi->gro_list; skb; skb = next) {
2469                 next = skb->next;
2470                 skb->next = NULL;
2471                 napi_gro_complete(skb);
2472         }
2473
2474         napi->gro_count = 0;
2475         napi->gro_list = NULL;
2476 }
2477 EXPORT_SYMBOL(napi_gro_flush);
2478
2479 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2480 {
2481         struct sk_buff **pp = NULL;
2482         struct packet_type *ptype;
2483         __be16 type = skb->protocol;
2484         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2485         int same_flow;
2486         int mac_len;
2487         enum gro_result ret;
2488
2489         if (!(skb->dev->features & NETIF_F_GRO))
2490                 goto normal;
2491
2492         if (skb_is_gso(skb) || skb_has_frags(skb))
2493                 goto normal;
2494
2495         rcu_read_lock();
2496         list_for_each_entry_rcu(ptype, head, list) {
2497                 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
2498                         continue;
2499
2500                 skb_set_network_header(skb, skb_gro_offset(skb));
2501                 mac_len = skb->network_header - skb->mac_header;
2502                 skb->mac_len = mac_len;
2503                 NAPI_GRO_CB(skb)->same_flow = 0;
2504                 NAPI_GRO_CB(skb)->flush = 0;
2505                 NAPI_GRO_CB(skb)->free = 0;
2506
2507                 pp = ptype->gro_receive(&napi->gro_list, skb);
2508                 break;
2509         }
2510         rcu_read_unlock();
2511
2512         if (&ptype->list == head)
2513                 goto normal;
2514
2515         same_flow = NAPI_GRO_CB(skb)->same_flow;
2516         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
2517
2518         if (pp) {
2519                 struct sk_buff *nskb = *pp;
2520
2521                 *pp = nskb->next;
2522                 nskb->next = NULL;
2523                 napi_gro_complete(nskb);
2524                 napi->gro_count--;
2525         }
2526
2527         if (same_flow)
2528                 goto ok;
2529
2530         if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
2531                 goto normal;
2532
2533         napi->gro_count++;
2534         NAPI_GRO_CB(skb)->count = 1;
2535         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
2536         skb->next = napi->gro_list;
2537         napi->gro_list = skb;
2538         ret = GRO_HELD;
2539
2540 pull:
2541         if (skb_headlen(skb) < skb_gro_offset(skb)) {
2542                 int grow = skb_gro_offset(skb) - skb_headlen(skb);
2543
2544                 BUG_ON(skb->end - skb->tail < grow);
2545
2546                 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
2547
2548                 skb->tail += grow;
2549                 skb->data_len -= grow;
2550
2551                 skb_shinfo(skb)->frags[0].page_offset += grow;
2552                 skb_shinfo(skb)->frags[0].size -= grow;
2553
2554                 if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
2555                         put_page(skb_shinfo(skb)->frags[0].page);
2556                         memmove(skb_shinfo(skb)->frags,
2557                                 skb_shinfo(skb)->frags + 1,
2558                                 --skb_shinfo(skb)->nr_frags);
2559                 }
2560         }
2561
2562 ok:
2563         return ret;
2564
2565 normal:
2566         ret = GRO_NORMAL;
2567         goto pull;
2568 }
2569 EXPORT_SYMBOL(dev_gro_receive);
2570
2571 static gro_result_t
2572 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2573 {
2574         struct sk_buff *p;
2575
2576         if (netpoll_rx_on(skb))
2577                 return GRO_NORMAL;
2578
2579         for (p = napi->gro_list; p; p = p->next) {
2580                 NAPI_GRO_CB(p)->same_flow = (p->dev == skb->dev)
2581                         && !compare_ether_header(skb_mac_header(p),
2582                                                  skb_gro_mac_header(skb));
2583                 NAPI_GRO_CB(p)->flush = 0;
2584         }
2585
2586         return dev_gro_receive(napi, skb);
2587 }
2588
2589 int napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
2590 {
2591         int err = NET_RX_SUCCESS;
2592
2593         switch (ret) {
2594         case GRO_NORMAL:
2595                 return netif_receive_skb(skb);
2596
2597         case GRO_DROP:
2598                 err = NET_RX_DROP;
2599                 /* fall through */
2600
2601         case GRO_MERGED_FREE:
2602                 kfree_skb(skb);
2603                 break;
2604
2605         case GRO_HELD:
2606         case GRO_MERGED:
2607                 break;
2608         }
2609
2610         return err;
2611 }
2612 EXPORT_SYMBOL(napi_skb_finish);
2613
2614 void skb_gro_reset_offset(struct sk_buff *skb)
2615 {
2616         NAPI_GRO_CB(skb)->data_offset = 0;
2617         NAPI_GRO_CB(skb)->frag0 = NULL;
2618         NAPI_GRO_CB(skb)->frag0_len = 0;
2619
2620         if (skb->mac_header == skb->tail &&
2621             !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
2622                 NAPI_GRO_CB(skb)->frag0 =
2623                         page_address(skb_shinfo(skb)->frags[0].page) +
2624                         skb_shinfo(skb)->frags[0].page_offset;
2625                 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
2626         }
2627 }
2628 EXPORT_SYMBOL(skb_gro_reset_offset);
2629
2630 int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2631 {
2632         skb_gro_reset_offset(skb);
2633
2634         return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
2635 }
2636 EXPORT_SYMBOL(napi_gro_receive);
2637
2638 void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
2639 {
2640         __skb_pull(skb, skb_headlen(skb));
2641         skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
2642
2643         napi->skb = skb;
2644 }
2645 EXPORT_SYMBOL(napi_reuse_skb);
2646
2647 struct sk_buff *napi_get_frags(struct napi_struct *napi)
2648 {
2649         struct sk_buff *skb = napi->skb;
2650
2651         if (!skb) {
2652                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
2653                 if (skb)
2654                         napi->skb = skb;
2655         }
2656         return skb;
2657 }
2658 EXPORT_SYMBOL(napi_get_frags);
2659
2660 int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
2661                       gro_result_t ret)
2662 {
2663         int err = NET_RX_SUCCESS;
2664
2665         switch (ret) {
2666         case GRO_NORMAL:
2667         case GRO_HELD:
2668                 skb->protocol = eth_type_trans(skb, napi->dev);
2669
2670                 if (ret == GRO_NORMAL)
2671                         return netif_receive_skb(skb);
2672
2673                 skb_gro_pull(skb, -ETH_HLEN);
2674                 break;
2675
2676         case GRO_DROP:
2677                 err = NET_RX_DROP;
2678                 /* fall through */
2679
2680         case GRO_MERGED_FREE:
2681                 napi_reuse_skb(napi, skb);
2682                 break;
2683
2684         case GRO_MERGED:
2685                 break;
2686         }
2687
2688         return err;
2689 }
2690 EXPORT_SYMBOL(napi_frags_finish);
2691
2692 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
2693 {
2694         struct sk_buff *skb = napi->skb;
2695         struct ethhdr *eth;
2696         unsigned int hlen;
2697         unsigned int off;
2698
2699         napi->skb = NULL;
2700
2701         skb_reset_mac_header(skb);
2702         skb_gro_reset_offset(skb);
2703
2704         off = skb_gro_offset(skb);
2705         hlen = off + sizeof(*eth);
2706         eth = skb_gro_header_fast(skb, off);
2707         if (skb_gro_header_hard(skb, hlen)) {
2708                 eth = skb_gro_header_slow(skb, hlen, off);
2709                 if (unlikely(!eth)) {
2710                         napi_reuse_skb(napi, skb);
2711                         skb = NULL;
2712                         goto out;
2713                 }
2714         }
2715
2716         skb_gro_pull(skb, sizeof(*eth));
2717
2718         /*
2719          * This works because the only protocols we care about don't require
2720          * special handling.  We'll fix it up properly at the end.
2721          */
2722         skb->protocol = eth->h_proto;
2723
2724 out:
2725         return skb;
2726 }
2727 EXPORT_SYMBOL(napi_frags_skb);
2728
2729 int napi_gro_frags(struct napi_struct *napi)
2730 {
2731         struct sk_buff *skb = napi_frags_skb(napi);
2732
2733         if (!skb)
2734                 return NET_RX_DROP;
2735
2736         return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
2737 }
2738 EXPORT_SYMBOL(napi_gro_frags);
2739
2740 static int process_backlog(struct napi_struct *napi, int quota)
2741 {
2742         int work = 0;
2743         struct softnet_data *queue = &__get_cpu_var(softnet_data);
2744         unsigned long start_time = jiffies;
2745
2746         napi->weight = weight_p;
2747         do {
2748                 struct sk_buff *skb;
2749
2750                 local_irq_disable();
2751                 skb = __skb_dequeue(&queue->input_pkt_queue);
2752                 if (!skb) {
2753                         __napi_complete(napi);
2754                         local_irq_enable();
2755                         break;
2756                 }
2757                 local_irq_enable();
2758
2759                 netif_receive_skb(skb);
2760         } while (++work < quota && jiffies == start_time);
2761
2762         return work;
2763 }
2764
2765 /**
2766  * __napi_schedule - schedule for receive
2767  * @n: entry to schedule
2768  *
2769  * The entry's receive function will be scheduled to run
2770  */
2771 void __napi_schedule(struct napi_struct *n)
2772 {
2773         unsigned long flags;
2774
2775         local_irq_save(flags);
2776         list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
2777         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2778         local_irq_restore(flags);
2779 }
2780 EXPORT_SYMBOL(__napi_schedule);
2781
2782 void __napi_complete(struct napi_struct *n)
2783 {
2784         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
2785         BUG_ON(n->gro_list);
2786
2787         list_del(&n->poll_list);
2788         smp_mb__before_clear_bit();
2789         clear_bit(NAPI_STATE_SCHED, &n->state);
2790 }
2791 EXPORT_SYMBOL(__napi_complete);
2792
2793 void napi_complete(struct napi_struct *n)
2794 {
2795         unsigned long flags;
2796
2797         /*
2798          * don't let napi dequeue from the cpu poll list
2799          * just in case its running on a different cpu
2800          */
2801         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
2802                 return;
2803
2804         napi_gro_flush(n);
2805         local_irq_save(flags);
2806         __napi_complete(n);
2807         local_irq_restore(flags);
2808 }
2809 EXPORT_SYMBOL(napi_complete);
2810
2811 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
2812                     int (*poll)(struct napi_struct *, int), int weight)
2813 {
2814         INIT_LIST_HEAD(&napi->poll_list);
2815         napi->gro_count = 0;
2816         napi->gro_list = NULL;
2817         napi->skb = NULL;
2818         napi->poll = poll;
2819         napi->weight = weight;
2820         list_add(&napi->dev_list, &dev->napi_list);
2821         napi->dev = dev;
2822 #ifdef CONFIG_NETPOLL
2823         spin_lock_init(&napi->poll_lock);
2824         napi->poll_owner = -1;
2825 #endif
2826         set_bit(NAPI_STATE_SCHED, &napi->state);
2827 }
2828 EXPORT_SYMBOL(netif_napi_add);
2829
2830 void netif_napi_del(struct napi_struct *napi)
2831 {
2832         struct sk_buff *skb, *next;
2833
2834         list_del_init(&napi->dev_list);
2835         napi_free_frags(napi);
2836
2837         for (skb = napi->gro_list; skb; skb = next) {
2838                 next = skb->next;
2839                 skb->next = NULL;
2840                 kfree_skb(skb);
2841         }
2842
2843         napi->gro_list = NULL;
2844         napi->gro_count = 0;
2845 }
2846 EXPORT_SYMBOL(netif_napi_del);
2847
2848
2849 static void net_rx_action(struct softirq_action *h)
2850 {
2851         struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
2852         unsigned long time_limit = jiffies + 2;
2853         int budget = netdev_budget;
2854         void *have;
2855
2856         local_irq_disable();
2857
2858         while (!list_empty(list)) {
2859                 struct napi_struct *n;
2860                 int work, weight;
2861
2862                 /* If softirq window is exhuasted then punt.
2863                  * Allow this to run for 2 jiffies since which will allow
2864                  * an average latency of 1.5/HZ.
2865                  */
2866                 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
2867                         goto softnet_break;
2868
2869                 local_irq_enable();
2870
2871                 /* Even though interrupts have been re-enabled, this
2872                  * access is safe because interrupts can only add new
2873                  * entries to the tail of this list, and only ->poll()
2874                  * calls can remove this head entry from the list.
2875                  */
2876                 n = list_entry(list->next, struct napi_struct, poll_list);
2877
2878                 have = netpoll_poll_lock(n);
2879
2880                 weight = n->weight;
2881
2882                 /* This NAPI_STATE_SCHED test is for avoiding a race
2883                  * with netpoll's poll_napi().  Only the entity which
2884                  * obtains the lock and sees NAPI_STATE_SCHED set will
2885                  * actually make the ->poll() call.  Therefore we avoid
2886                  * accidently calling ->poll() when NAPI is not scheduled.
2887                  */
2888                 work = 0;
2889                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
2890                         work = n->poll(n, weight);
2891                         trace_napi_poll(n);
2892                 }
2893
2894                 WARN_ON_ONCE(work > weight);
2895
2896                 budget -= work;
2897
2898                 local_irq_disable();
2899
2900                 /* Drivers must not modify the NAPI state if they
2901                  * consume the entire weight.  In such cases this code
2902                  * still "owns" the NAPI instance and therefore can
2903                  * move the instance around on the list at-will.
2904                  */
2905                 if (unlikely(work == weight)) {
2906                         if (unlikely(napi_disable_pending(n))) {
2907                                 local_irq_enable();
2908                                 napi_complete(n);
2909                                 local_irq_disable();
2910                         } else
2911                                 list_move_tail(&n->poll_list, list);
2912                 }
2913
2914                 netpoll_poll_unlock(have);
2915         }
2916 out:
2917         local_irq_enable();
2918
2919 #ifdef CONFIG_NET_DMA
2920         /*
2921          * There may not be any more sk_buffs coming right now, so push
2922          * any pending DMA copies to hardware
2923          */
2924         dma_issue_pending_all();
2925 #endif
2926
2927         return;
2928
2929 softnet_break:
2930         __get_cpu_var(netdev_rx_stat).time_squeeze++;
2931         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2932         goto out;
2933 }
2934
2935 static gifconf_func_t *gifconf_list[NPROTO];
2936
2937 /**
2938  *      register_gifconf        -       register a SIOCGIF handler
2939  *      @family: Address family
2940  *      @gifconf: Function handler
2941  *
2942  *      Register protocol dependent address dumping routines. The handler
2943  *      that is passed must not be freed or reused until it has been replaced
2944  *      by another handler.
2945  */
2946 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
2947 {
2948         if (family >= NPROTO)
2949                 return -EINVAL;
2950         gifconf_list[family] = gifconf;
2951         return 0;
2952 }
2953 EXPORT_SYMBOL(register_gifconf);
2954
2955
2956 /*
2957  *      Map an interface index to its name (SIOCGIFNAME)
2958  */
2959
2960 /*
2961  *      We need this ioctl for efficient implementation of the
2962  *      if_indextoname() function required by the IPv6 API.  Without
2963  *      it, we would have to search all the interfaces to find a
2964  *      match.  --pb
2965  */
2966
2967 static int dev_ifname(struct net *net, struct ifreq __user *arg)
2968 {
2969         struct net_device *dev;
2970         struct ifreq ifr;
2971
2972         /*
2973          *      Fetch the caller's info block.
2974          */
2975
2976         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2977                 return -EFAULT;
2978
2979         rcu_read_lock();
2980         dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
2981         if (!dev) {
2982                 rcu_read_unlock();
2983                 return -ENODEV;
2984         }
2985
2986         strcpy(ifr.ifr_name, dev->name);
2987         rcu_read_unlock();
2988
2989         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
2990                 return -EFAULT;
2991         return 0;
2992 }
2993
2994 /*
2995  *      Perform a SIOCGIFCONF call. This structure will change
2996  *      size eventually, and there is nothing I can do about it.
2997  *      Thus we will need a 'compatibility mode'.
2998  */
2999
3000 static int dev_ifconf(struct net *net, char __user *arg)
3001 {
3002         struct ifconf ifc;
3003         struct net_device *dev;
3004         char __user *pos;
3005         int len;
3006         int total;
3007         int i;
3008
3009         /*
3010          *      Fetch the caller's info block.
3011          */
3012
3013         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3014                 return -EFAULT;
3015
3016         pos = ifc.ifc_buf;
3017         len = ifc.ifc_len;
3018
3019         /*
3020          *      Loop over the interfaces, and write an info block for each.
3021          */
3022
3023         total = 0;
3024         for_each_netdev(net, dev) {
3025                 for (i = 0; i < NPROTO; i++) {
3026                         if (gifconf_list[i]) {
3027                                 int done;
3028                                 if (!pos)
3029                                         done = gifconf_list[i](dev, NULL, 0);
3030                                 else
3031                                         done = gifconf_list[i](dev, pos + total,
3032                                                                len - total);
3033                                 if (done < 0)
3034                                         return -EFAULT;
3035                                 total += done;
3036                         }
3037                 }
3038         }
3039
3040         /*
3041          *      All done.  Write the updated control block back to the caller.
3042          */
3043         ifc.ifc_len = total;
3044
3045         /*
3046          *      Both BSD and Solaris return 0 here, so we do too.
3047          */
3048         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3049 }
3050
3051 #ifdef CONFIG_PROC_FS
3052 /*
3053  *      This is invoked by the /proc filesystem handler to display a device
3054  *      in detail.
3055  */
3056 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3057         __acquires(dev_base_lock)
3058 {
3059         struct net *net = seq_file_net(seq);
3060         loff_t off;
3061         struct net_device *dev;
3062
3063         read_lock(&dev_base_lock);
3064         if (!*pos)
3065                 return SEQ_START_TOKEN;
3066
3067         off = 1;
3068         for_each_netdev(net, dev)
3069                 if (off++ == *pos)
3070                         return dev;
3071
3072         return NULL;
3073 }
3074
3075 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3076 {
3077         struct net *net = seq_file_net(seq);
3078         ++*pos;
3079         return v == SEQ_START_TOKEN ?
3080                 first_net_device(net) : next_net_device((struct net_device *)v);
3081 }
3082
3083 void dev_seq_stop(struct seq_file *seq, void *v)
3084         __releases(dev_base_lock)
3085 {
3086         read_unlock(&dev_base_lock);
3087 }
3088
3089 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3090 {
3091         const struct net_device_stats *stats = dev_get_stats(dev);
3092
3093         seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
3094                    "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
3095                    dev->name, stats->rx_bytes, stats->rx_packets,
3096                    stats->rx_errors,
3097                    stats->rx_dropped + stats->rx_missed_errors,
3098                    stats->rx_fifo_errors,
3099                    stats->rx_length_errors + stats->rx_over_errors +
3100                     stats->rx_crc_errors + stats->rx_frame_errors,
3101                    stats->rx_compressed, stats->multicast,
3102                    stats->tx_bytes, stats->tx_packets,
3103                    stats->tx_errors, stats->tx_dropped,
3104                    stats->tx_fifo_errors, stats->collisions,
3105                    stats->tx_carrier_errors +
3106                     stats->tx_aborted_errors +
3107                     stats->tx_window_errors +
3108                     stats->tx_heartbeat_errors,
3109                    stats->tx_compressed);
3110 }
3111
3112 /*
3113  *      Called from the PROCfs module. This now uses the new arbitrary sized
3114  *      /proc/net interface to create /proc/net/dev
3115  */
3116 static int dev_seq_show(struct seq_file *seq, void *v)
3117 {
3118         if (v == SEQ_START_TOKEN)
3119                 seq_puts(seq, "Inter-|   Receive                            "
3120                               "                    |  Transmit\n"
3121                               " face |bytes    packets errs drop fifo frame "
3122                               "compressed multicast|bytes    packets errs "
3123                               "drop fifo colls carrier compressed\n");
3124         else
3125                 dev_seq_printf_stats(seq, v);
3126         return 0;
3127 }
3128
3129 static struct netif_rx_stats *softnet_get_online(loff_t *pos)
3130 {
3131         struct netif_rx_stats *rc = NULL;
3132
3133         while (*pos < nr_cpu_ids)
3134                 if (cpu_online(*pos)) {
3135                         rc = &per_cpu(netdev_rx_stat, *pos);
3136                         break;
3137                 } else
3138                         ++*pos;
3139         return rc;
3140 }
3141
3142 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3143 {
3144         return softnet_get_online(pos);
3145 }
3146
3147 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3148 {
3149         ++*pos;
3150         return softnet_get_online(pos);
3151 }
3152
/* seq_file ->stop: softnet_seq_start() takes nothing that needs releasing. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
3156
3157 static int softnet_seq_show(struct seq_file *seq, void *v)
3158 {
3159         struct netif_rx_stats *s = v;
3160
3161         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
3162                    s->total, s->dropped, s->time_squeeze, 0,
3163                    0, 0, 0, 0, /* was fastroute */
3164                    s->cpu_collision);
3165         return 0;
3166 }
3167
3168 static const struct seq_operations dev_seq_ops = {
3169         .start = dev_seq_start,
3170         .next  = dev_seq_next,
3171         .stop  = dev_seq_stop,
3172         .show  = dev_seq_show,
3173 };
3174
3175 static int dev_seq_open(struct inode *inode, struct file *file)
3176 {
3177         return seq_open_net(inode, file, &dev_seq_ops,
3178                             sizeof(struct seq_net_private));
3179 }
3180
3181 static const struct file_operations dev_seq_fops = {
3182         .owner   = THIS_MODULE,
3183         .open    = dev_seq_open,
3184         .read    = seq_read,
3185         .llseek  = seq_lseek,
3186         .release = seq_release_net,
3187 };
3188
3189 static const struct seq_operations softnet_seq_ops = {
3190         .start = softnet_seq_start,
3191         .next  = softnet_seq_next,
3192         .stop  = softnet_seq_stop,
3193         .show  = softnet_seq_show,
3194 };
3195
3196 static int softnet_seq_open(struct inode *inode, struct file *file)
3197 {
3198         return seq_open(file, &softnet_seq_ops);
3199 }
3200
3201 static const struct file_operations softnet_seq_fops = {
3202         .owner   = THIS_MODULE,
3203         .open    = softnet_seq_open,
3204         .read    = seq_read,
3205         .llseek  = seq_lseek,
3206         .release = seq_release,
3207 };
3208
3209 static void *ptype_get_idx(loff_t pos)
3210 {
3211         struct packet_type *pt = NULL;
3212         loff_t i = 0;
3213         int t;
3214
3215         list_for_each_entry_rcu(pt, &ptype_all, list) {
3216                 if (i == pos)
3217                         return pt;
3218                 ++i;
3219         }
3220
3221         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
3222                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3223                         if (i == pos)
3224                                 return pt;
3225                         ++i;
3226                 }
3227         }
3228         return NULL;
3229 }
3230
3231 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
3232         __acquires(RCU)
3233 {
3234         rcu_read_lock();
3235         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3236 }
3237
3238 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3239 {
3240         struct packet_type *pt;
3241         struct list_head *nxt;
3242         int hash;
3243
3244         ++*pos;
3245         if (v == SEQ_START_TOKEN)
3246                 return ptype_get_idx(0);
3247
3248         pt = v;
3249         nxt = pt->list.next;
3250         if (pt->type == htons(ETH_P_ALL)) {
3251                 if (nxt != &ptype_all)
3252                         goto found;
3253                 hash = 0;
3254                 nxt = ptype_base[0].next;
3255         } else
3256                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
3257
3258         while (nxt == &ptype_base[hash]) {
3259                 if (++hash >= PTYPE_HASH_SIZE)
3260                         return NULL;
3261                 nxt = ptype_base[hash].next;
3262         }
3263 found:
3264         return list_entry(nxt, struct packet_type, list);
3265 }
3266
3267 static void ptype_seq_stop(struct seq_file *seq, void *v)
3268         __releases(RCU)
3269 {
3270         rcu_read_unlock();
3271 }
3272
3273 static int ptype_seq_show(struct seq_file *seq, void *v)
3274 {
3275         struct packet_type *pt = v;
3276
3277         if (v == SEQ_START_TOKEN)
3278                 seq_puts(seq, "Type Device      Function\n");
3279         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
3280                 if (pt->type == htons(ETH_P_ALL))
3281                         seq_puts(seq, "ALL ");
3282                 else
3283                         seq_printf(seq, "%04x", ntohs(pt->type));
3284
3285                 seq_printf(seq, " %-8s %pF\n",
3286                            pt->dev ? pt->dev->name : "", pt->func);
3287         }
3288
3289         return 0;
3290 }
3291
3292 static const struct seq_operations ptype_seq_ops = {
3293         .start = ptype_seq_start,
3294         .next  = ptype_seq_next,
3295         .stop  = ptype_seq_stop,
3296         .show  = ptype_seq_show,
3297 };
3298
3299 static int ptype_seq_open(struct inode *inode, struct file *file)
3300 {
3301         return seq_open_net(inode, file, &ptype_seq_ops,
3302                         sizeof(struct seq_net_private));
3303 }
3304
3305 static const struct file_operations ptype_seq_fops = {
3306         .owner   = THIS_MODULE,
3307         .open    = ptype_seq_open,
3308         .read    = seq_read,
3309         .llseek  = seq_lseek,
3310         .release = seq_release_net,
3311 };
3312
3313
3314 static int __net_init dev_proc_net_init(struct net *net)
3315 {
3316         int rc = -ENOMEM;
3317
3318         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
3319                 goto out;
3320         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
3321                 goto out_dev;
3322         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
3323                 goto out_softnet;
3324
3325         if (wext_proc_init(net))
3326                 goto out_ptype;
3327         rc = 0;
3328 out:
3329         return rc;
3330 out_ptype:
3331         proc_net_remove(net, "ptype");
3332 out_softnet:
3333         proc_net_remove(net, "softnet_stat");
3334 out_dev:
3335         proc_net_remove(net, "dev");
3336         goto out;
3337 }
3338
3339 static void __net_exit dev_proc_net_exit(struct net *net)
3340 {
3341         wext_proc_exit(net);
3342
3343         proc_net_remove(net, "ptype");
3344         proc_net_remove(net, "softnet_stat");
3345         proc_net_remove(net, "dev");
3346 }
3347
3348 static struct pernet_operations __net_initdata dev_proc_ops = {
3349         .init = dev_proc_net_init,
3350         .exit = dev_proc_net_exit,
3351 };
3352
3353 static int __init dev_proc_init(void)
3354 {
3355         return register_pernet_subsys(&dev_proc_ops);
3356 }
3357 #else
3358 #define dev_proc_init() 0
3359 #endif  /* CONFIG_PROC_FS */
3360
3361
3362 /**
3363  *      netdev_set_master       -       set up master/slave pair
3364  *      @slave: slave device
3365  *      @master: new master device
3366  *
3367  *      Changes the master device of the slave. Pass %NULL to break the
3368  *      bonding. The caller must hold the RTNL semaphore. On a failure
3369  *      a negative errno code is returned. On success the reference counts
3370  *      are adjusted, %RTM_NEWLINK is sent to the routing socket and the
3371  *      function returns zero.
3372  */
3373 int netdev_set_master(struct net_device *slave, struct net_device *master)
3374 {
3375         struct net_device *old = slave->master;
3376
3377         ASSERT_RTNL();
3378
3379         if (master) {
3380                 if (old)
3381                         return -EBUSY;
3382                 dev_hold(master);
3383         }
3384
3385         slave->master = master;
3386
3387         synchronize_net();
3388
3389         if (old)
3390                 dev_put(old);
3391
3392         if (master)
3393                 slave->flags |= IFF_SLAVE;
3394         else
3395                 slave->flags &= ~IFF_SLAVE;
3396
3397         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
3398         return 0;
3399 }
3400 EXPORT_SYMBOL(netdev_set_master);
3401
3402 static void dev_change_rx_flags(struct net_device *dev, int flags)
3403 {
3404         const struct net_device_ops *ops = dev->netdev_ops;
3405
3406         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
3407                 ops->ndo_change_rx_flags(dev, flags);
3408 }
3409
3410 static int __dev_set_promiscuity(struct net_device *dev, int inc)
3411 {
3412         unsigned short old_flags = dev->flags;
3413         uid_t uid;
3414         gid_t gid;
3415
3416         ASSERT_RTNL();
3417
3418         dev->flags |= IFF_PROMISC;
3419         dev->promiscuity += inc;
3420         if (dev->promiscuity == 0) {
3421                 /*
3422                  * Avoid overflow.
3423                  * If inc causes overflow, untouch promisc and return error.
3424                  */
3425                 if (inc < 0)
3426                         dev->flags &= ~IFF_PROMISC;
3427                 else {
3428                         dev->promiscuity -= inc;
3429                         printk(KERN_WARNING "%s: promiscuity touches roof, "
3430                                 "set promiscuity failed, promiscuity feature "
3431                                 "of device might be broken.\n", dev->name);
3432                         return -EOVERFLOW;
3433                 }
3434         }
3435         if (dev->flags != old_flags) {
3436                 printk(KERN_INFO "device %s %s promiscuous mode\n",
3437                        dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
3438                                                                "left");
3439                 if (audit_enabled) {
3440                         current_uid_gid(&uid, &gid);
3441                         audit_log(current->audit_context, GFP_ATOMIC,
3442                                 AUDIT_ANOM_PROMISCUOUS,
3443                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
3444                                 dev->name, (dev->flags & IFF_PROMISC),
3445                                 (old_flags & IFF_PROMISC),
3446                                 audit_get_loginuid(current),
3447                                 uid, gid,
3448                                 audit_get_sessionid(current));
3449                 }
3450
3451                 dev_change_rx_flags(dev, IFF_PROMISC);
3452         }
3453         return 0;
3454 }
3455
3456 /**
3457  *      dev_set_promiscuity     - update promiscuity count on a device
3458  *      @dev: device
3459  *      @inc: modifier
3460  *
3461  *      Add or remove promiscuity from a device. While the count in the device
3462  *      remains above zero the interface remains promiscuous. Once it hits zero
3463  *      the device reverts back to normal filtering operation. A negative inc
3464  *      value is used to drop promiscuity on the device.
3465  *      Return 0 if successful or a negative errno code on error.
3466  */
3467 int dev_set_promiscuity(struct net_device *dev, int inc)
3468 {
3469         unsigned short old_flags = dev->flags;
3470         int err;
3471
3472         err = __dev_set_promiscuity(dev, inc);
3473         if (err < 0)
3474                 return err;
3475         if (dev->flags != old_flags)
3476                 dev_set_rx_mode(dev);
3477         return err;
3478 }
3479 EXPORT_SYMBOL(dev_set_promiscuity);
3480
3481 /**
3482  *      dev_set_allmulti        - update allmulti count on a device
3483  *      @dev: device
3484  *      @inc: modifier
3485  *
3486  *      Add or remove reception of all multicast frames to a device. While the
3487  *      count in the device remains above zero the interface remains listening
3488  *      to all interfaces. Once it hits zero the device reverts back to normal
3489  *      filtering operation. A negative @inc value is used to drop the counter
3490  *      when releasing a resource needing all multicasts.
3491  *      Return 0 if successful or a negative errno code on error.
3492  */
3493
3494 int dev_set_allmulti(struct net_device *dev, int inc)
3495 {
3496         unsigned short old_flags = dev->flags;
3497
3498         ASSERT_RTNL();
3499
3500         dev->flags |= IFF_ALLMULTI;
3501         dev->allmulti += inc;
3502         if (dev->allmulti == 0) {
3503                 /*
3504                  * Avoid overflow.
3505                  * If inc causes overflow, untouch allmulti and return error.
3506                  */
3507                 if (inc < 0)
3508                         dev->flags &= ~IFF_ALLMULTI;
3509                 else {
3510                         dev->allmulti -= inc;
3511                         printk(KERN_WARNING "%s: allmulti touches roof, "
3512                                 "set allmulti failed, allmulti feature of "
3513                                 "device might be broken.\n", dev->name);
3514                         return -EOVERFLOW;
3515                 }
3516         }
3517         if (dev->flags ^ old_flags) {
3518                 dev_change_rx_flags(dev, IFF_ALLMULTI);
3519                 dev_set_rx_mode(dev);
3520         }
3521         return 0;
3522 }
3523 EXPORT_SYMBOL(dev_set_allmulti);
3524
3525 /*
3526  *      Upload unicast and multicast address lists to device and
3527  *      configure RX filtering. When the device doesn't support unicast
3528  *      filtering it is put in promiscuous mode while unicast addresses
3529  *      are present.
3530  */
3531 void __dev_set_rx_mode(struct net_device *dev)
3532 {
3533         const struct net_device_ops *ops = dev->netdev_ops;
3534
3535         /* dev_open will call this function so the list will stay sane. */
3536         if (!(dev->flags&IFF_UP))
3537                 return;
3538
3539         if (!netif_device_present(dev))
3540                 return;
3541
3542         if (ops->ndo_set_rx_mode)
3543                 ops->ndo_set_rx_mode(dev);
3544         else {
3545                 /* Unicast addresses changes may only happen under the rtnl,
3546                  * therefore calling __dev_set_promiscuity here is safe.
3547                  */
3548                 if (dev->uc.count > 0 && !dev->uc_promisc) {
3549                         __dev_set_promiscuity(dev, 1);
3550                         dev->uc_promisc = 1;
3551                 } else if (dev->uc.count == 0 && dev->uc_promisc) {
3552                         __dev_set_promiscuity(dev, -1);
3553                         dev->uc_promisc = 0;
3554                 }
3555
3556                 if (ops->ndo_set_multicast_list)
3557                         ops->ndo_set_multicast_list(dev);
3558         }
3559 }
3560
/*
 *	Locked variant of __dev_set_rx_mode(): takes the device address
 *	lock (with bottom halves disabled) around the RX mode update.
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}
3567
3568 /* hw addresses list handling functions */
3569
3570 static int __hw_addr_add(struct netdev_hw_addr_list *list, unsigned char *addr,
3571                          int addr_len, unsigned char addr_type)
3572 {
3573         struct netdev_hw_addr *ha;
3574         int alloc_size;
3575
3576         if (addr_len > MAX_ADDR_LEN)
3577                 return -EINVAL;
3578
3579         list_for_each_entry(ha, &list->list, list) {
3580                 if (!memcmp(ha->addr, addr, addr_len) &&
3581                     ha->type == addr_type) {
3582                         ha->refcount++;
3583                         return 0;
3584                 }
3585         }
3586
3587
3588         alloc_size = sizeof(*ha);
3589         if (alloc_size < L1_CACHE_BYTES)
3590                 alloc_size = L1_CACHE_BYTES;
3591         ha = kmalloc(alloc_size, GFP_ATOMIC);
3592         if (!ha)
3593                 return -ENOMEM;
3594         memcpy(ha->addr, addr, addr_len);
3595         ha->type = addr_type;
3596         ha->refcount = 1;
3597         ha->synced = false;
3598         list_add_tail_rcu(&ha->list, &list->list);
3599         list->count++;
3600         return 0;
3601 }
3602
3603 static void ha_rcu_free(struct rcu_head *head)
3604 {
3605         struct netdev_hw_addr *ha;
3606
3607         ha = container_of(head, struct netdev_hw_addr, rcu_head);
3608         kfree(ha);
3609 }
3610
3611 static int __hw_addr_del(struct netdev_hw_addr_list *list, unsigned char *addr,
3612                          int addr_len, unsigned char addr_type)
3613 {
3614         struct netdev_hw_addr *ha;
3615
3616         list_for_each_entry(ha, &list->list, list) {
3617                 if (!memcmp(ha->addr, addr, addr_len) &&
3618                     (ha->type == addr_type || !addr_type)) {
3619                         if (--ha->refcount)
3620                                 return 0;
3621                         list_del_rcu(&ha->list);
3622                         call_rcu(&ha->rcu_head, ha_rcu_free);
3623                         list->count--;
3624                         return 0;
3625                 }
3626         }
3627         return -ENOENT;
3628 }
3629
3630 static int __hw_addr_add_multiple(struct netdev_hw_addr_list *to_list,
3631                                   struct netdev_hw_addr_list *from_list,
3632                                   int addr_len,
3633                                   unsigned char addr_type)
3634 {
3635         int err;
3636         struct netdev_hw_addr *ha, *ha2;
3637         unsigned char type;
3638
3639         list_for_each_entry(ha, &from_list->list, list) {
3640                 type = addr_type ? addr_type : ha->type;
3641                 err = __hw_addr_add(to_list, ha->addr, addr_len, type);
3642                 if (err)
3643                         goto unroll;
3644         }
3645         return 0;
3646
3647 unroll:
3648         list_for_each_entry(ha2, &from_list->list, list) {
3649                 if (ha2 == ha)
3650                         break;
3651                 type = addr_type ? addr_type : ha2->type;
3652                 __hw_addr_del(to_list, ha2->addr, addr_len, type);
3653         }
3654         return err;
3655 }
3656
3657 static void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list,
3658                                    struct netdev_hw_addr_list *from_list,
3659                                    int addr_len,
3660                                    unsigned char addr_type)
3661 {
3662         struct netdev_hw_addr *ha;
3663         unsigned char type;
3664
3665         list_for_each_entry(ha, &from_list->list, list) {
3666                 type = addr_type ? addr_type : ha->type;
3667                 __hw_addr_del(to_list, ha->addr, addr_len, addr_type);
3668         }
3669 }
3670
3671 static int __hw_addr_sync(struct netdev_hw_addr_list *to_list,
3672                           struct netdev_hw_addr_list *from_list,
3673                           int addr_len)
3674 {
3675         int err = 0;
3676         struct netdev_hw_addr *ha, *tmp;
3677
3678         list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
3679                 if (!ha->synced) {
3680                         err = __hw_addr_add(to_list, ha->addr,
3681                                             addr_len, ha->type);
3682                         if (err)
3683                                 break;
3684                         ha->synced = true;
3685                         ha->refcount++;
3686                 } else if (ha->refcount == 1) {
3687                         __hw_addr_del(to_list, ha->addr, addr_len, ha->type);
3688                         __hw_addr_del(from_list, ha->addr, addr_len, ha->type);
3689                 }
3690         }
3691         return err;
3692 }
3693
3694 static void __hw_addr_unsync(struct netdev_hw_addr_list *to_list,
3695                              struct netdev_hw_addr_list *from_list,
3696                              int addr_len)
3697 {
3698         struct netdev_hw_addr *ha, *tmp;
3699
3700         list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
3701                 if (ha->synced) {
3702                         __hw_addr_del(to_list, ha->addr,
3703                                       addr_len, ha->type);
3704                         ha->synced = false;
3705                         __hw_addr_del(from_list, ha->addr,
3706                                       addr_len, ha->type);
3707                 }
3708         }
3709 }
3710
3711 static void __hw_addr_flush(struct netdev_hw_addr_list *list)
3712 {
3713         struct netdev_hw_addr *ha, *tmp;
3714
3715         list_for_each_entry_safe(ha, tmp, &list->list, list) {
3716                 list_del_rcu(&ha->list);
3717                 call_rcu(&ha->rcu_head, ha_rcu_free);
3718         }
3719         list->count = 0;
3720 }
3721
3722 static void __hw_addr_init(struct netdev_hw_addr_list *list)
3723 {
3724         INIT_LIST_HEAD(&list->list);
3725         list->count = 0;
3726 }
3727
3728 /* Device addresses handling functions */
3729
3730 static void dev_addr_flush(struct net_device *dev)
3731 {
3732         /* rtnl_mutex must be held here */
3733
3734         __hw_addr_flush(&dev->dev_addrs);
3735         dev->dev_addr = NULL;
3736 }
3737
3738 static int dev_addr_init(struct net_device *dev)
3739 {
3740         unsigned char addr[MAX_ADDR_LEN];
3741         struct netdev_hw_addr *ha;
3742         int err;
3743
3744         /* rtnl_mutex must be held here */
3745
3746         __hw_addr_init(&dev->dev_addrs);
3747         memset(addr, 0, sizeof(addr));
3748         err = __hw_addr_add(&dev->dev_addrs, addr, sizeof(addr),
3749                             NETDEV_HW_ADDR_T_LAN);
3750         if (!err) {
3751                 /*
3752                  * Get the first (previously created) address from the list
3753                  * and set dev_addr pointer to this location.
3754                  */
3755                 ha = list_first_entry(&dev->dev_addrs.list,
3756                                       struct netdev_hw_addr, list);
3757                 dev->dev_addr = ha->addr;
3758         }
3759         return err;
3760 }
3761
3762 /**
3763  *      dev_addr_add    - Add a device address
3764  *      @dev: device
3765  *      @addr: address to add
3766  *      @addr_type: address type
3767  *
3768  *      Add a device address to the device or increase the reference count if
3769  *      it already exists.
3770  *
3771  *      The caller must hold the rtnl_mutex.
3772  */
3773 int dev_addr_add(struct net_device *dev, unsigned char *addr,
3774                  unsigned char addr_type)
3775 {
3776         int err;
3777
3778         ASSERT_RTNL();
3779
3780         err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type);
3781         if (!err)
3782                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3783         return err;
3784 }
3785 EXPORT_SYMBOL(dev_addr_add);
3786
3787 /**
3788  *      dev_addr_del    - Release a device address.
3789  *      @dev: device
3790  *      @addr: address to delete
3791  *      @addr_type: address type
3792  *
3793  *      Release reference to a device address and remove it from the device
3794  *      if the reference count drops to zero.
3795  *
3796  *      The caller must hold the rtnl_mutex.
3797  */
3798 int dev_addr_del(struct net_device *dev, unsigned char *addr,
3799                  unsigned char addr_type)
3800 {
3801         int err;
3802         struct netdev_hw_addr *ha;
3803
3804         ASSERT_RTNL();
3805
3806         /*
3807          * We can not remove the first address from the list because
3808          * dev->dev_addr points to that.
3809          */
3810         ha = list_first_entry(&dev->dev_addrs.list,
3811                               struct netdev_hw_addr, list);
3812         if (ha->addr == dev->dev_addr && ha->refcount == 1)
3813                 return -ENOENT;
3814
3815         err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len,
3816                             addr_type);
3817         if (!err)
3818                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3819         return err;
3820 }
3821 EXPORT_SYMBOL(dev_addr_del);
3822
3823 /**
3824  *      dev_addr_add_multiple   - Add device addresses from another device
3825  *      @to_dev: device to which addresses will be added
3826  *      @from_dev: device from which addresses will be added
3827  *      @addr_type: address type - 0 means type will be used from from_dev
3828  *
3829  *      Add device addresses of the one device to another.
3830  **
3831  *      The caller must hold the rtnl_mutex.
3832  */
3833 int dev_addr_add_multiple(struct net_device *to_dev,
3834                           struct net_device *from_dev,
3835                           unsigned char addr_type)
3836 {
3837         int err;
3838
3839         ASSERT_RTNL();
3840
3841         if (from_dev->addr_len != to_dev->addr_len)
3842                 return -EINVAL;
3843         err = __hw_addr_add_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs,
3844                                      to_dev->addr_len, addr_type);
3845         if (!err)
3846                 call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
3847         return err;
3848 }
3849 EXPORT_SYMBOL(dev_addr_add_multiple);
3850
3851 /**
3852  *      dev_addr_del_multiple   - Delete device addresses by another device
3853  *      @to_dev: device where the addresses will be deleted
3854  *      @from_dev: device by which addresses the addresses will be deleted
3855  *      @addr_type: address type - 0 means type will used from from_dev
3856  *
3857  *      Deletes addresses in to device by the list of addresses in from device.
3858  *
3859  *      The caller must hold the rtnl_mutex.
3860  */
3861 int dev_addr_del_multiple(struct net_device *to_dev,
3862                           struct net_device *from_dev,
3863                           unsigned char addr_type)
3864 {
3865         ASSERT_RTNL();
3866
3867         if (from_dev->addr_len != to_dev->addr_len)
3868                 return -EINVAL;
3869         __hw_addr_del_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs,
3870                                to_dev->addr_len, addr_type);
3871         call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
3872         return 0;
3873 }
3874 EXPORT_SYMBOL(dev_addr_del_multiple);
3875
3876 /* multicast addresses handling functions */
3877
3878 int __dev_addr_delete(struct dev_addr_list **list, int *count,
3879                       void *addr, int alen, int glbl)
3880 {
3881         struct dev_addr_list *da;
3882
3883         for (; (da = *list) != NULL; list = &da->next) {
3884                 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3885                     alen == da->da_addrlen) {
3886                         if (glbl) {
3887                                 int old_glbl = da->da_gusers;
3888                                 da->da_gusers = 0;
3889                                 if (old_glbl == 0)
3890                                         break;
3891                         }
3892                         if (--da->da_users)
3893                                 return 0;
3894
3895                         *list = da->next;
3896                         kfree(da);
3897                         (*count)--;
3898                         return 0;
3899                 }
3900         }
3901         return -ENOENT;
3902 }
3903
3904 int __dev_addr_add(struct dev_addr_list **list, int *count,
3905                    void *addr, int alen, int glbl)
3906 {
3907         struct dev_addr_list *da;
3908
3909         for (da = *list; da != NULL; da = da->next) {
3910                 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3911                     da->da_addrlen == alen) {
3912                         if (glbl) {
3913                                 int old_glbl = da->da_gusers;
3914                                 da->da_gusers = 1;
3915                                 if (old_glbl)
3916                                         return 0;
3917                         }
3918                         da->da_users++;
3919                         return 0;
3920                 }
3921         }
3922
3923         da = kzalloc(sizeof(*da), GFP_ATOMIC);
3924         if (da == NULL)
3925                 return -ENOMEM;
3926         memcpy(da->da_addr, addr, alen);
3927         da->da_addrlen = alen;
3928         da->da_users = 1;
3929         da->da_gusers = glbl ? 1 : 0;
3930         da->next = *list;
3931         *list = da;
3932         (*count)++;
3933         return 0;
3934 }
3935
3936 /**
3937  *      dev_unicast_delete      - Release secondary unicast address.
3938  *      @dev: device
3939  *      @addr: address to delete
3940  *
3941  *      Release reference to a secondary unicast address and remove it
3942  *      from the device if the reference count drops to zero.
3943  *
3944  *      The caller must hold the rtnl_mutex.
3945  */
3946 int dev_unicast_delete(struct net_device *dev, void *addr)
3947 {
3948         int err;
3949
3950         ASSERT_RTNL();
3951
3952         netif_addr_lock_bh(dev);
3953         err = __hw_addr_del(&dev->uc, addr, dev->addr_len,
3954                             NETDEV_HW_ADDR_T_UNICAST);
3955         if (!err)
3956                 __dev_set_rx_mode(dev);
3957         netif_addr_unlock_bh(dev);
3958         return err;
3959 }
3960 EXPORT_SYMBOL(dev_unicast_delete);
3961
3962 /**
3963  *      dev_unicast_add         - add a secondary unicast address
3964  *      @dev: device
3965  *      @addr: address to add
3966  *
3967  *      Add a secondary unicast address to the device or increase
3968  *      the reference count if it already exists.
3969  *
3970  *      The caller must hold the rtnl_mutex.
3971  */
3972 int dev_unicast_add(struct net_device *dev, void *addr)
3973 {
3974         int err;
3975
3976         ASSERT_RTNL();
3977
3978         netif_addr_lock_bh(dev);
3979         err = __hw_addr_add(&dev->uc, addr, dev->addr_len,
3980                             NETDEV_HW_ADDR_T_UNICAST);
3981         if (!err)
3982                 __dev_set_rx_mode(dev);
3983         netif_addr_unlock_bh(dev);
3984         return err;
3985 }
3986 EXPORT_SYMBOL(dev_unicast_add);
3987
3988 int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
3989                     struct dev_addr_list **from, int *from_count)
3990 {
3991         struct dev_addr_list *da, *next;
3992         int err = 0;
3993
3994         da = *from;
3995         while (da != NULL) {
3996                 next = da->next;
3997                 if (!da->da_synced) {
3998                         err = __dev_addr_add(to, to_count,
3999                                              da->da_addr, da->da_addrlen, 0);
4000                         if (err < 0)
4001                                 break;
4002                         da->da_synced = 1;
4003                         da->da_users++;
4004                 } else if (da->da_users == 1) {
4005                         __dev_addr_delete(to, to_count,
4006                                           da->da_addr, da->da_addrlen, 0);
4007                         __dev_addr_delete(from, from_count,
4008                                           da->da_addr, da->da_addrlen, 0);
4009                 }
4010                 da = next;
4011         }
4012         return err;
4013 }
4014 EXPORT_SYMBOL_GPL(__dev_addr_sync);
4015
4016 void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
4017                        struct dev_addr_list **from, int *from_count)
4018 {
4019         struct dev_addr_list *da, *next;
4020
4021         da = *from;
4022         while (da != NULL) {
4023                 next = da->next;
4024                 if (da->da_synced) {
4025                         __dev_addr_delete(to, to_count,
4026                                           da->da_addr, da->da_addrlen, 0);
4027                         da->da_synced = 0;
4028                         __dev_addr_delete(from, from_count,
4029                                           da->da_addr, da->da_addrlen, 0);
4030                 }
4031                 da = next;
4032         }
4033 }
4034 EXPORT_SYMBOL_GPL(__dev_addr_unsync);
4035
4036 /**
4037  *      dev_unicast_sync - Synchronize device's unicast list to another device
4038  *      @to: destination device
4039  *      @from: source device
4040  *
4041  *      Add newly added addresses to the destination device and release
4042  *      addresses that have no users left. The source device must be
4043  *      locked by netif_tx_lock_bh.
4044  *
4045  *      This function is intended to be called from the dev->set_rx_mode
4046  *      function of layered software devices.
4047  */
4048 int dev_unicast_sync(struct net_device *to, struct net_device *from)
4049 {
4050         int err = 0;
4051
4052         if (to->addr_len != from->addr_len)
4053                 return -EINVAL;
4054
4055         netif_addr_lock_bh(to);
4056         err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len);
4057         if (!err)
4058                 __dev_set_rx_mode(to);
4059         netif_addr_unlock_bh(to);
4060         return err;
4061 }
4062 EXPORT_SYMBOL(dev_unicast_sync);
4063
4064 /**
4065  *      dev_unicast_unsync - Remove synchronized addresses from the destination device
4066  *      @to: destination device
4067  *      @from: source device
4068  *
4069  *      Remove all addresses that were added to the destination device by
4070  *      dev_unicast_sync(). This function is intended to be called from the
4071  *      dev->stop function of layered software devices.
4072  */
4073 void dev_unicast_unsync(struct net_device *to, struct net_device *from)
4074 {
4075         if (to->addr_len != from->addr_len)
4076                 return;
4077
4078         netif_addr_lock_bh(from);
4079         netif_addr_lock(to);
4080         __hw_addr_unsync(&to->uc, &from->uc, to->addr_len);
4081         __dev_set_rx_mode(to);
4082         netif_addr_unlock(to);
4083         netif_addr_unlock_bh(from);
4084 }
4085 EXPORT_SYMBOL(dev_unicast_unsync);
4086
4087 static void dev_unicast_flush(struct net_device *dev)
4088 {
4089         netif_addr_lock_bh(dev);
4090         __hw_addr_flush(&dev->uc);
4091         netif_addr_unlock_bh(dev);
4092 }
4093
4094 static void dev_unicast_init(struct net_device *dev)
4095 {
4096         __hw_addr_init(&dev->uc);
4097 }
4098
4099
4100 static void __dev_addr_discard(struct dev_addr_list **list)
4101 {
4102         struct dev_addr_list *tmp;
4103
4104         while (*list != NULL) {
4105                 tmp = *list;
4106                 *list = tmp->next;
4107                 if (tmp->da_users > tmp->da_gusers)
4108                         printk("__dev_addr_discard: address leakage! "
4109                                "da_users=%d\n", tmp->da_users);
4110                 kfree(tmp);
4111         }
4112 }
4113
4114 static void dev_addr_discard(struct net_device *dev)
4115 {
4116         netif_addr_lock_bh(dev);
4117
4118         __dev_addr_discard(&dev->mc_list);
4119         dev->mc_count = 0;
4120
4121         netif_addr_unlock_bh(dev);
4122 }
4123
4124 /**
4125  *      dev_get_flags - get flags reported to userspace
4126  *      @dev: device
4127  *
4128  *      Get the combination of flag bits exported through APIs to userspace.
4129  */
4130 unsigned dev_get_flags(const struct net_device *dev)
4131 {
4132         unsigned flags;
4133
4134         flags = (dev->flags & ~(IFF_PROMISC |
4135                                 IFF_ALLMULTI |
4136                                 IFF_RUNNING |
4137                                 IFF_LOWER_UP |
4138                                 IFF_DORMANT)) |
4139                 (dev->gflags & (IFF_PROMISC |
4140                                 IFF_ALLMULTI));
4141
4142         if (netif_running(dev)) {
4143                 if (netif_oper_up(dev))
4144                         flags |= IFF_RUNNING;
4145                 if (netif_carrier_ok(dev))
4146                         flags |= IFF_LOWER_UP;
4147                 if (netif_dormant(dev))
4148                         flags |= IFF_DORMANT;
4149         }
4150
4151         return flags;
4152 }
4153 EXPORT_SYMBOL(dev_get_flags);
4154
4155 /**
4156  *      dev_change_flags - change device settings
4157  *      @dev: device
4158  *      @flags: device state flags
4159  *
4160  *      Change settings on device based state flags. The flags are
4161  *      in the userspace exported format.
4162  */
4163 int dev_change_flags(struct net_device *dev, unsigned flags)
4164 {
4165         int ret, changes;
4166         int old_flags = dev->flags;
4167
4168         ASSERT_RTNL();
4169
4170         /*
4171          *      Set the flags on our device.
4172          */
4173
4174         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4175                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4176                                IFF_AUTOMEDIA)) |
4177                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4178                                     IFF_ALLMULTI));
4179
4180         /*
4181          *      Load in the correct multicast list now the flags have changed.
4182          */
4183
4184         if ((old_flags ^ flags) & IFF_MULTICAST)
4185                 dev_change_rx_flags(dev, IFF_MULTICAST);
4186
4187         dev_set_rx_mode(dev);
4188
4189         /*
4190          *      Have we downed the interface. We handle IFF_UP ourselves
4191          *      according to user attempts to set it, rather than blindly
4192          *      setting it.
4193          */
4194
4195         ret = 0;
4196         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4197                 ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
4198
4199                 if (!ret)
4200                         dev_set_rx_mode(dev);
4201         }
4202
4203         if (dev->flags & IFF_UP &&
4204             ((old_flags ^ dev->flags) & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
4205                                           IFF_VOLATILE)))
4206                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4207
4208         if ((flags ^ dev->gflags) & IFF_PROMISC) {
4209                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4210
4211                 dev->gflags ^= IFF_PROMISC;
4212                 dev_set_promiscuity(dev, inc);
4213         }
4214
4215         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4216            is important. Some (broken) drivers set IFF_PROMISC, when
4217            IFF_ALLMULTI is requested not asking us and not reporting.
4218          */
4219         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4220                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4221
4222                 dev->gflags ^= IFF_ALLMULTI;
4223                 dev_set_allmulti(dev, inc);
4224         }
4225
4226         /* Exclude state transition flags, already notified */
4227         changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING);
4228         if (changes)
4229                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4230
4231         return ret;
4232 }
4233 EXPORT_SYMBOL(dev_change_flags);
4234
4235 /**
4236  *      dev_set_mtu - Change maximum transfer unit
4237  *      @dev: device
4238  *      @new_mtu: new transfer unit
4239  *
4240  *      Change the maximum transfer size of the network device.
4241  */
4242 int dev_set_mtu(struct net_device *dev, int new_mtu)
4243 {
4244         const struct net_device_ops *ops = dev->netdev_ops;
4245         int err;
4246
4247         if (new_mtu == dev->mtu)
4248                 return 0;
4249
4250         /*      MTU must be positive.    */
4251         if (new_mtu < 0)
4252                 return -EINVAL;
4253
4254         if (!netif_device_present(dev))
4255                 return -ENODEV;
4256
4257         err = 0;
4258         if (ops->ndo_change_mtu)
4259                 err = ops->ndo_change_mtu(dev, new_mtu);
4260         else
4261                 dev->mtu = new_mtu;
4262
4263         if (!err && dev->flags & IFF_UP)
4264                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4265         return err;
4266 }
4267 EXPORT_SYMBOL(dev_set_mtu);
4268
4269 /**
4270  *      dev_set_mac_address - Change Media Access Control Address
4271  *      @dev: device
4272  *      @sa: new address
4273  *
4274  *      Change the hardware (MAC) address of the device
4275  */
4276 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4277 {
4278         const struct net_device_ops *ops = dev->netdev_ops;
4279         int err;
4280
4281         if (!ops->ndo_set_mac_address)
4282                 return -EOPNOTSUPP;
4283         if (sa->sa_family != dev->type)
4284                 return -EINVAL;
4285         if (!netif_device_present(dev))
4286                 return -ENODEV;
4287         err = ops->ndo_set_mac_address(dev, sa);
4288         if (!err)
4289                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4290         return err;
4291 }
4292 EXPORT_SYMBOL(dev_set_mac_address);
4293
4294 /*
4295  *      Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock)
4296  */
4297 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4298 {
4299         int err;
4300         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4301
4302         if (!dev)
4303                 return -ENODEV;
4304
4305         switch (cmd) {
4306         case SIOCGIFFLAGS:      /* Get interface flags */
4307                 ifr->ifr_flags = (short) dev_get_flags(dev);
4308                 return 0;
4309
4310         case SIOCGIFMETRIC:     /* Get the metric on the interface
4311                                    (currently unused) */
4312                 ifr->ifr_metric = 0;
4313                 return 0;
4314
4315         case SIOCGIFMTU:        /* Get the MTU of a device */
4316                 ifr->ifr_mtu = dev->mtu;
4317                 return 0;
4318
4319         case SIOCGIFHWADDR:
4320                 if (!dev->addr_len)
4321                         memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4322                 else
4323                         memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4324                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4325                 ifr->ifr_hwaddr.sa_family = dev->type;
4326                 return 0;
4327
4328         case SIOCGIFSLAVE:
4329                 err = -EINVAL;
4330                 break;
4331
4332         case SIOCGIFMAP:
4333                 ifr->ifr_map.mem_start = dev->mem_start;
4334                 ifr->ifr_map.mem_end   = dev->mem_end;
4335                 ifr->ifr_map.base_addr = dev->base_addr;
4336                 ifr->ifr_map.irq       = dev->irq;
4337                 ifr->ifr_map.dma       = dev->dma;
4338                 ifr->ifr_map.port      = dev->if_port;
4339                 return 0;
4340
4341         case SIOCGIFINDEX:
4342                 ifr->ifr_ifindex = dev->ifindex;
4343                 return 0;
4344
4345         case SIOCGIFTXQLEN:
4346                 ifr->ifr_qlen = dev->tx_queue_len;
4347                 return 0;
4348
4349         default:
4350                 /* dev_ioctl() should ensure this case
4351                  * is never reached
4352                  */
4353                 WARN_ON(1);
4354                 err = -EINVAL;
4355                 break;
4356
4357         }
4358         return err;
4359 }
4360
4361 /*
4362  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
4363  */
4364 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4365 {
4366         int err;
4367         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4368         const struct net_device_ops *ops;
4369
4370         if (!dev)
4371                 return -ENODEV;
4372
4373         ops = dev->netdev_ops;
4374
4375         switch (cmd) {
4376         case SIOCSIFFLAGS:      /* Set interface flags */
4377                 return dev_change_flags(dev, ifr->ifr_flags);
4378
4379         case SIOCSIFMETRIC:     /* Set the metric on the interface
4380                                    (currently unused) */
4381                 return -EOPNOTSUPP;
4382
4383         case SIOCSIFMTU:        /* Set the MTU of a device */
4384                 return dev_set_mtu(dev, ifr->ifr_mtu);
4385
4386         case SIOCSIFHWADDR:
4387                 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4388
4389         case SIOCSIFHWBROADCAST:
4390                 if (ifr->ifr_hwaddr.sa_family != dev->type)
4391                         return -EINVAL;
4392                 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4393                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4394                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4395                 return 0;
4396
4397         case SIOCSIFMAP:
4398                 if (ops->ndo_set_config) {
4399                         if (!netif_device_present(dev))
4400                                 return -ENODEV;
4401                         return ops->ndo_set_config(dev, &ifr->ifr_map);
4402                 }
4403                 return -EOPNOTSUPP;
4404
4405         case SIOCADDMULTI:
4406                 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4407                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4408                         return -EINVAL;
4409                 if (!netif_device_present(dev))
4410                         return -ENODEV;
4411                 return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
4412                                   dev->addr_len, 1);
4413
4414         case SIOCDELMULTI:
4415                 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4416                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4417                         return -EINVAL;
4418                 if (!netif_device_present(dev))
4419                         return -ENODEV;
4420                 return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
4421                                      dev->addr_len, 1);
4422
4423         case SIOCSIFTXQLEN:
4424                 if (ifr->ifr_qlen < 0)
4425                         return -EINVAL;
4426                 dev->tx_queue_len = ifr->ifr_qlen;
4427                 return 0;
4428
4429         case SIOCSIFNAME:
4430                 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4431                 return dev_change_name(dev, ifr->ifr_newname);
4432
4433         /*
4434          *      Unknown or private ioctl
4435          */
4436         default:
4437                 if ((cmd >= SIOCDEVPRIVATE &&
4438                     cmd <= SIOCDEVPRIVATE + 15) ||
4439                     cmd == SIOCBONDENSLAVE ||
4440                     cmd == SIOCBONDRELEASE ||
4441                     cmd == SIOCBONDSETHWADDR ||
4442                     cmd == SIOCBONDSLAVEINFOQUERY ||
4443                     cmd == SIOCBONDINFOQUERY ||
4444                     cmd == SIOCBONDCHANGEACTIVE ||
4445                     cmd == SIOCGMIIPHY ||
4446                     cmd == SIOCGMIIREG ||
4447                     cmd == SIOCSMIIREG ||
4448                     cmd == SIOCBRADDIF ||
4449                     cmd == SIOCBRDELIF ||
4450                     cmd == SIOCSHWTSTAMP ||
4451                     cmd == SIOCWANDEV) {
4452                         err = -EOPNOTSUPP;
4453                         if (ops->ndo_do_ioctl) {
4454                                 if (netif_device_present(dev))
4455                                         err = ops->ndo_do_ioctl(dev, ifr, cmd);
4456                                 else
4457                                         err = -ENODEV;
4458                         }
4459                 } else
4460                         err = -EINVAL;
4461
4462         }
4463         return err;
4464 }
4465
4466 /*
4467  *      This function handles all "interface"-type I/O control requests. The actual
4468  *      'doing' part of this is dev_ifsioc above.
4469  */
4470
4471 /**
4472  *      dev_ioctl       -       network device ioctl
4473  *      @net: the applicable net namespace
4474  *      @cmd: command to issue
4475  *      @arg: pointer to a struct ifreq in user space
4476  *
4477  *      Issue ioctl functions to devices. This is normally called by the
4478  *      user space syscall interfaces but can sometimes be useful for
4479  *      other purposes. The return value is the return from the syscall if
4480  *      positive or a negative errno code on error.
4481  */
4482
4483 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4484 {
4485         struct ifreq ifr;
4486         int ret;
4487         char *colon;
4488
4489         /* One special case: SIOCGIFCONF takes ifconf argument
4490            and requires shared lock, because it sleeps writing
4491            to user space.
4492          */
4493
4494         if (cmd == SIOCGIFCONF) {
4495                 rtnl_lock();
4496                 ret = dev_ifconf(net, (char __user *) arg);
4497                 rtnl_unlock();
4498                 return ret;
4499         }
4500         if (cmd == SIOCGIFNAME)
4501                 return dev_ifname(net, (struct ifreq __user *)arg);
4502
4503         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4504                 return -EFAULT;
4505
4506         ifr.ifr_name[IFNAMSIZ-1] = 0;
4507
4508         colon = strchr(ifr.ifr_name, ':');
4509         if (colon)
4510                 *colon = 0;
4511
4512         /*
4513          *      See which interface the caller is talking about.
4514          */
4515
4516         switch (cmd) {
4517         /*
4518          *      These ioctl calls:
4519          *      - can be done by all.
4520          *      - atomic and do not require locking.
4521          *      - return a value
4522          */
4523         case SIOCGIFFLAGS:
4524         case SIOCGIFMETRIC:
4525         case SIOCGIFMTU:
4526         case SIOCGIFHWADDR:
4527         case SIOCGIFSLAVE:
4528         case SIOCGIFMAP:
4529         case SIOCGIFINDEX:
4530         case SIOCGIFTXQLEN:
4531                 dev_load(net, ifr.ifr_name);
4532                 read_lock(&dev_base_lock);
4533                 ret = dev_ifsioc_locked(net, &ifr, cmd);
4534                 read_unlock(&dev_base_lock);
4535                 if (!ret) {
4536                         if (colon)
4537                                 *colon = ':';
4538                         if (copy_to_user(arg, &ifr,
4539                                          sizeof(struct ifreq)))
4540                                 ret = -EFAULT;
4541                 }
4542                 return ret;
4543
4544         case SIOCETHTOOL:
4545                 dev_load(net, ifr.ifr_name);
4546                 rtnl_lock();
4547                 ret = dev_ethtool(net, &ifr);
4548                 rtnl_unlock();
4549                 if (!ret) {
4550                         if (colon)
4551                                 *colon = ':';
4552                         if (copy_to_user(arg, &ifr,
4553                                          sizeof(struct ifreq)))
4554                                 ret = -EFAULT;
4555                 }
4556                 return ret;
4557
4558         /*
4559          *      These ioctl calls:
4560          *      - require superuser power.
4561          *      - require strict serialization.
4562          *      - return a value
4563          */
4564         case SIOCGMIIPHY:
4565         case SIOCGMIIREG:
4566         case SIOCSIFNAME:
4567                 if (!capable(CAP_NET_ADMIN))
4568                         return -EPERM;
4569                 dev_load(net, ifr.ifr_name);
4570                 rtnl_lock();
4571                 ret = dev_ifsioc(net, &ifr, cmd);
4572                 rtnl_unlock();
4573                 if (!ret) {
4574                         if (colon)
4575                                 *colon = ':';
4576                         if (copy_to_user(arg, &ifr,
4577                                          sizeof(struct ifreq)))
4578                                 ret = -EFAULT;
4579                 }
4580                 return ret;
4581
4582         /*
4583          *      These ioctl calls:
4584          *      - require superuser power.
4585          *      - require strict serialization.
4586          *      - do not return a value
4587          */
4588         case SIOCSIFFLAGS:
4589         case SIOCSIFMETRIC:
4590         case SIOCSIFMTU:
4591         case SIOCSIFMAP:
4592         case SIOCSIFHWADDR:
4593         case SIOCSIFSLAVE:
4594         case SIOCADDMULTI:
4595         case SIOCDELMULTI:
4596         case SIOCSIFHWBROADCAST:
4597         case SIOCSIFTXQLEN:
4598         case SIOCSMIIREG:
4599         case SIOCBONDENSLAVE:
4600         case SIOCBONDRELEASE:
4601         case SIOCBONDSETHWADDR:
4602         case SIOCBONDCHANGEACTIVE:
4603         case SIOCBRADDIF:
4604         case SIOCBRDELIF:
4605         case SIOCSHWTSTAMP:
4606                 if (!capable(CAP_NET_ADMIN))
4607                         return -EPERM;
4608                 /* fall through */
4609         case SIOCBONDSLAVEINFOQUERY:
4610         case SIOCBONDINFOQUERY:
4611                 dev_load(net, ifr.ifr_name);
4612                 rtnl_lock();
4613                 ret = dev_ifsioc(net, &ifr, cmd);
4614                 rtnl_unlock();
4615                 return ret;
4616
4617         case SIOCGIFMEM:
4618                 /* Get the per device memory space. We can add this but
4619                  * currently do not support it */
4620         case SIOCSIFMEM:
4621                 /* Set the per device memory buffer space.
4622                  * Not applicable in our case */
4623         case SIOCSIFLINK:
4624                 return -EINVAL;
4625
4626         /*
4627          *      Unknown or private ioctl.
4628          */
4629         default:
4630                 if (cmd == SIOCWANDEV ||
4631                     (cmd >= SIOCDEVPRIVATE &&
4632                      cmd <= SIOCDEVPRIVATE + 15)) {
4633                         dev_load(net, ifr.ifr_name);
4634                         rtnl_lock();
4635                         ret = dev_ifsioc(net, &ifr, cmd);
4636                         rtnl_unlock();
4637                         if (!ret && copy_to_user(arg, &ifr,
4638                                                  sizeof(struct ifreq)))
4639                                 ret = -EFAULT;
4640                         return ret;
4641                 }
4642                 /* Take care of Wireless Extensions */
4643                 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4644                         return wext_handle_ioctl(net, &ifr, cmd, arg);
4645                 return -EINVAL;
4646         }
4647 }
4648
4649
/**
 *      dev_new_index   -       allocate an ifindex
 *      @net: the applicable net namespace
 *
 *      Advances a function-local static counter until it lands on a
 *      positive value for which __dev_get_by_index() finds no device in
 *      @net, and returns that value.  The counter restarts at 1 whenever
 *      the increment leaves it at zero or below.  The caller must hold
 *      the rtnl semaphore or the dev_base_lock to be sure the result
 *      remains unique.
 */
static int dev_new_index(struct net *net)
{
        static int ifindex;

        do {
                if (++ifindex <= 0)
                        ifindex = 1;
        } while (__dev_get_by_index(net, ifindex));

        return ifindex;
}
4668
4669 /* Delayed registration/unregisteration */
4670 static LIST_HEAD(net_todo_list);
4671
4672 static void net_set_todo(struct net_device *dev)
4673 {
4674         list_add_tail(&dev->todo_list, &net_todo_list);
4675 }
4676
4677 static void rollback_registered_many(struct list_head *head)
4678 {
4679         struct net_device *dev;
4680
4681         BUG_ON(dev_boot_phase);
4682         ASSERT_RTNL();
4683
4684         list_for_each_entry(dev, head, unreg_list) {
4685                 /* Some devices call without registering
4686                  * for initialization unwind.
4687                  */
4688                 if (dev->reg_state == NETREG_UNINITIALIZED) {
4689                         pr_debug("unregister_netdevice: device %s/%p never "
4690                                  "was registered\n", dev->name, dev);
4691
4692                         WARN_ON(1);
4693                         return;
4694                 }
4695
4696                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
4697
4698                 /* If device is running, close it first. */
4699                 dev_close(dev);
4700
4701                 /* And unlink it from device chain. */
4702                 unlist_netdevice(dev);
4703
4704                 dev->reg_state = NETREG_UNREGISTERING;
4705         }
4706
4707         synchronize_net();
4708
4709         list_for_each_entry(dev, head, unreg_list) {
4710                 /* Shutdown queueing discipline. */
4711                 dev_shutdown(dev);
4712
4713
4714                 /* Notify protocols, that we are about to destroy
4715                    this device. They should clean all the things.
4716                 */
4717                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4718
4719                 /*
4720                  *      Flush the unicast and multicast chains
4721                  */
4722                 dev_unicast_flush(dev);
4723                 dev_addr_discard(dev);
4724
4725                 if (dev->netdev_ops->ndo_uninit)
4726                         dev->netdev_ops->ndo_uninit(dev);
4727
4728                 /* Notifier chain MUST detach us from master device. */
4729                 WARN_ON(dev->master);
4730
4731                 /* Remove entries from kobject tree */
4732                 netdev_unregister_kobject(dev);
4733         }
4734
4735         synchronize_net();
4736
4737         list_for_each_entry(dev, head, unreg_list)
4738                 dev_put(dev);
4739 }
4740
4741 static void rollback_registered(struct net_device *dev)
4742 {
4743         LIST_HEAD(single);
4744
4745         list_add(&dev->unreg_list, &single);
4746         rollback_registered_many(&single);
4747 }
4748
4749 static void __netdev_init_queue_locks_one(struct net_device *dev,
4750                                           struct netdev_queue *dev_queue,
4751                                           void *_unused)
4752 {
4753         spin_lock_init(&dev_queue->_xmit_lock);
4754         netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
4755         dev_queue->xmit_lock_owner = -1;
4756 }
4757
4758 static void netdev_init_queue_locks(struct net_device *dev)
4759 {
4760         netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
4761         __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
4762 }
4763
4764 unsigned long netdev_fix_features(unsigned long features, const char *name)
4765 {
4766         /* Fix illegal SG+CSUM combinations. */
4767         if ((features & NETIF_F_SG) &&
4768             !(features & NETIF_F_ALL_CSUM)) {
4769                 if (name)
4770                         printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4771                                "checksum feature.\n", name);
4772                 features &= ~NETIF_F_SG;
4773         }
4774
4775         /* TSO requires that SG is present as well. */
4776         if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4777                 if (name)
4778                         printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4779                                "SG feature.\n", name);
4780                 features &= ~NETIF_F_TSO;
4781         }
4782
4783         if (features & NETIF_F_UFO) {
4784                 if (!(features & NETIF_F_GEN_CSUM)) {
4785                         if (name)
4786                                 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4787                                        "since no NETIF_F_HW_CSUM feature.\n",
4788                                        name);
4789                         features &= ~NETIF_F_UFO;
4790                 }
4791
4792                 if (!(features & NETIF_F_SG)) {
4793                         if (name)
4794                                 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4795                                        "since no NETIF_F_SG feature.\n", name);
4796                         features &= ~NETIF_F_UFO;
4797                 }
4798         }
4799
4800         return features;
4801 }
4802 EXPORT_SYMBOL(netdev_fix_features);
4803
4804 /**
4805  *      register_netdevice      - register a network device
4806  *      @dev: device to register
4807  *
4808  *      Take a completed network device structure and add it to the kernel
4809  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4810  *      chain. 0 is returned on success. A negative errno code is returned
4811  *      on a failure to set up the device, or if the name is a duplicate.
4812  *
4813  *      Callers must hold the rtnl semaphore. You may want
4814  *      register_netdev() instead of this.
4815  *
4816  *      BUGS:
4817  *      The locking appears insufficient to guarantee two parallel registers
4818  *      will not get the same name.
4819  */
4820
4821 int register_netdevice(struct net_device *dev)
4822 {
4823         struct hlist_head *head;
4824         struct hlist_node *p;
4825         int ret;
4826         struct net *net = dev_net(dev);
4827
4828         BUG_ON(dev_boot_phase);
4829         ASSERT_RTNL();
4830
4831         might_sleep();
4832
4833         /* When net_device's are persistent, this will be fatal. */
4834         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
4835         BUG_ON(!net);
4836
4837         spin_lock_init(&dev->addr_list_lock);
4838         netdev_set_addr_lockdep_class(dev);
4839         netdev_init_queue_locks(dev);
4840
4841         dev->iflink = -1;
4842
4843         /* Init, if this function is available */
4844         if (dev->netdev_ops->ndo_init) {
4845                 ret = dev->netdev_ops->ndo_init(dev);
4846                 if (ret) {
4847                         if (ret > 0)
4848                                 ret = -EIO;
4849                         goto out;
4850                 }
4851         }
4852
4853         if (!dev_valid_name(dev->name)) {
4854                 ret = -EINVAL;
4855                 goto err_uninit;
4856         }
4857
4858         dev->ifindex = dev_new_index(net);
4859         if (dev->iflink == -1)
4860                 dev->iflink = dev->ifindex;
4861
4862         /* Check for existence of name */
4863         head = dev_name_hash(net, dev->name);
4864         hlist_for_each(p, head) {
4865                 struct net_device *d
4866                         = hlist_entry(p, struct net_device, name_hlist);
4867                 if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
4868                         ret = -EEXIST;
4869                         goto err_uninit;
4870                 }
4871         }
4872
4873         /* Fix illegal checksum combinations */
4874         if ((dev->features & NETIF_F_HW_CSUM) &&
4875             (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4876                 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
4877                        dev->name);
4878                 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
4879         }
4880
4881         if ((dev->features & NETIF_F_NO_CSUM) &&
4882             (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4883                 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
4884                        dev->name);
4885                 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
4886         }
4887
4888         dev->features = netdev_fix_features(dev->features, dev->name);
4889
4890         /* Enable software GSO if SG is supported. */
4891         if (dev->features & NETIF_F_SG)
4892                 dev->features |= NETIF_F_GSO;
4893
4894         netdev_initialize_kobject(dev);
4895
4896         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
4897         ret = notifier_to_errno(ret);
4898         if (ret)
4899                 goto err_uninit;
4900
4901         ret = netdev_register_kobject(dev);
4902         if (ret)
4903                 goto err_uninit;
4904         dev->reg_state = NETREG_REGISTERED;
4905
4906         /*
4907          *      Default initial state at registry is that the
4908          *      device is present.
4909          */
4910
4911         set_bit(__LINK_STATE_PRESENT, &dev->state);
4912
4913         dev_init_scheduler(dev);
4914         dev_hold(dev);
4915         list_netdevice(dev);
4916
4917         /* Notify protocols, that a new device appeared. */
4918         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
4919         ret = notifier_to_errno(ret);
4920         if (ret) {
4921                 rollback_registered(dev);
4922                 dev->reg_state = NETREG_UNREGISTERED;
4923         }
4924
4925 out:
4926         return ret;
4927
4928 err_uninit:
4929         if (dev->netdev_ops->ndo_uninit)
4930                 dev->netdev_ops->ndo_uninit(dev);
4931         goto out;
4932 }
4933 EXPORT_SYMBOL(register_netdevice);
4934
4935 /**
4936  *      init_dummy_netdev       - init a dummy network device for NAPI
4937  *      @dev: device to init
4938  *
4939  *      This takes a network device structure and initialize the minimum
4940  *      amount of fields so it can be used to schedule NAPI polls without
4941  *      registering a full blown interface. This is to be used by drivers
4942  *      that need to tie several hardware interfaces to a single NAPI
4943  *      poll scheduler due to HW limitations.
4944  */
4945 int init_dummy_netdev(struct net_device *dev)
4946 {
4947         /* Clear everything. Note we don't initialize spinlocks
4948          * are they aren't supposed to be taken by any of the
4949          * NAPI code and this dummy netdev is supposed to be
4950          * only ever used for NAPI polls
4951          */
4952         memset(dev, 0, sizeof(struct net_device));
4953
4954         /* make sure we BUG if trying to hit standard
4955          * register/unregister code path
4956          */
4957         dev->reg_state = NETREG_DUMMY;
4958
4959         /* initialize the ref count */
4960         atomic_set(&dev->refcnt, 1);
4961
4962         /* NAPI wants this */
4963         INIT_LIST_HEAD(&dev->napi_list);
4964
4965         /* a dummy interface is started by default */
4966         set_bit(__LINK_STATE_PRESENT, &dev->state);
4967         set_bit(__LINK_STATE_START, &dev->state);
4968
4969         return 0;
4970 }
4971 EXPORT_SYMBOL_GPL(init_dummy_netdev);
4972
4973
4974 /**
4975  *      register_netdev - register a network device
4976  *      @dev: device to register
4977  *
4978  *      Take a completed network device structure and add it to the kernel
4979  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4980  *      chain. 0 is returned on success. A negative errno code is returned
4981  *      on a failure to set up the device, or if the name is a duplicate.
4982  *
4983  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
4984  *      and expands the device name if you passed a format string to
4985  *      alloc_netdev.
4986  */
4987 int register_netdev(struct net_device *dev)
4988 {
4989         int err;
4990
4991         rtnl_lock();
4992
4993         /*
4994          * If the name is a format string the caller wants us to do a
4995          * name allocation.
4996          */
4997         if (strchr(dev->name, '%')) {
4998                 err = dev_alloc_name(dev, dev->name);
4999                 if (err < 0)
5000                         goto out;
5001         }
5002
5003         err = register_netdevice(dev);
5004 out:
5005         rtnl_unlock();
5006         return err;
5007 }
5008 EXPORT_SYMBOL(register_netdev);
5009
5010 /*
5011  * netdev_wait_allrefs - wait until all references are gone.
5012  *
5013  * This is called when unregistering network devices.
5014  *
5015  * Any protocol or device that holds a reference should register
5016  * for netdevice notification, and cleanup and put back the
5017  * reference if they receive an UNREGISTER event.
5018  * We can get stuck here if buggy protocols don't correctly
5019  * call dev_put.
5020  */
5021 static void netdev_wait_allrefs(struct net_device *dev)
5022 {
5023         unsigned long rebroadcast_time, warning_time;
5024
5025         rebroadcast_time = warning_time = jiffies;
5026         while (atomic_read(&dev->refcnt) != 0) {
5027                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5028                         rtnl_lock();
5029
5030                         /* Rebroadcast unregister notification */
5031                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5032
5033                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5034                                      &dev->state)) {
5035                                 /* We must not have linkwatch events
5036                                  * pending on unregister. If this
5037                                  * happens, we simply run the queue
5038                                  * unscheduled, resulting in a noop
5039                                  * for this device.
5040                                  */
5041                                 linkwatch_run_queue();
5042                         }
5043
5044                         __rtnl_unlock();
5045
5046                         rebroadcast_time = jiffies;
5047                 }
5048
5049                 msleep(250);
5050
5051                 if (time_after(jiffies, warning_time + 10 * HZ)) {
5052                         printk(KERN_EMERG "unregister_netdevice: "
5053                                "waiting for %s to become free. Usage "
5054                                "count = %d\n",
5055                                dev->name, atomic_read(&dev->refcnt));
5056                         warning_time = jiffies;
5057                 }
5058         }
5059 }
5060
5061 /* The sequence is:
5062  *
5063  *      rtnl_lock();
5064  *      ...
5065  *      register_netdevice(x1);
5066  *      register_netdevice(x2);
5067  *      ...
5068  *      unregister_netdevice(y1);
5069  *      unregister_netdevice(y2);
5070  *      ...
5071  *      rtnl_unlock();
5072  *      free_netdev(y1);
5073  *      free_netdev(y2);
5074  *
5075  * We are invoked by rtnl_unlock().
5076  * This allows us to deal with problems:
5077  * 1) We can delete sysfs objects which invoke hotplug
5078  *    without deadlocking with linkwatch via keventd.
5079  * 2) Since we run with the RTNL semaphore not held, we can sleep
5080  *    safely in order to wait for the netdev refcnt to drop to zero.
5081  *
5082  * We must not return until all unregister events added during
5083  * the interval the lock was held have been completed.
5084  */
5085 void netdev_run_todo(void)
5086 {
5087         struct list_head list;
5088
5089         /* Snapshot list, allow later requests */
5090         list_replace_init(&net_todo_list, &list);
5091
5092         __rtnl_unlock();
5093
5094         while (!list_empty(&list)) {
5095                 struct net_device *dev
5096                         = list_entry(list.next, struct net_device, todo_list);
5097                 list_del(&dev->todo_list);
5098
5099                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5100                         printk(KERN_ERR "network todo '%s' but state %d\n",
5101                                dev->name, dev->reg_state);
5102                         dump_stack();
5103                         continue;
5104                 }
5105
5106                 dev->reg_state = NETREG_UNREGISTERED;
5107
5108                 on_each_cpu(flush_backlog, dev, 1);
5109
5110                 netdev_wait_allrefs(dev);
5111
5112                 /* paranoia */
5113                 BUG_ON(atomic_read(&dev->refcnt));
5114                 WARN_ON(dev->ip_ptr);
5115                 WARN_ON(dev->ip6_ptr);
5116                 WARN_ON(dev->dn_ptr);
5117
5118                 if (dev->destructor)
5119                         dev->destructor(dev);
5120
5121                 /* Free network device */
5122                 kobject_put(&dev->dev.kobj);
5123         }
5124 }
5125
5126 /**
5127  *      dev_get_stats   - get network device statistics
5128  *      @dev: device to get statistics from
5129  *
5130  *      Get network statistics from device. The device driver may provide
5131  *      its own method by setting dev->netdev_ops->get_stats; otherwise
5132  *      the internal statistics structure is used.
5133  */
5134 const struct net_device_stats *dev_get_stats(struct net_device *dev)
5135 {
5136         const struct net_device_ops *ops = dev->netdev_ops;
5137
5138         if (ops->ndo_get_stats)
5139                 return ops->ndo_get_stats(dev);
5140         else {
5141                 unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
5142                 struct net_device_stats *stats = &dev->stats;
5143                 unsigned int i;
5144                 struct netdev_queue *txq;
5145
5146                 for (i = 0; i < dev->num_tx_queues; i++) {
5147                         txq = netdev_get_tx_queue(dev, i);
5148                         tx_bytes   += txq->tx_bytes;
5149                         tx_packets += txq->tx_packets;
5150                         tx_dropped += txq->tx_dropped;
5151                 }
5152                 if (tx_bytes || tx_packets || tx_dropped) {
5153                         stats->tx_bytes   = tx_bytes;
5154                         stats->tx_packets = tx_packets;
5155                         stats->tx_dropped = tx_dropped;
5156                 }
5157                 return stats;
5158         }
5159 }
5160 EXPORT_SYMBOL(dev_get_stats);
5161
5162 static void netdev_init_one_queue(struct net_device *dev,
5163                                   struct netdev_queue *queue,
5164                                   void *_unused)
5165 {
5166         queue->dev = dev;
5167 }
5168
5169 static void netdev_init_queues(struct net_device *dev)
5170 {
5171         netdev_init_one_queue(dev, &dev->rx_queue, NULL);
5172         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5173         spin_lock_init(&dev->tx_global_lock);
5174 }
5175
5176 /**
5177  *      alloc_netdev_mq - allocate network device
5178  *      @sizeof_priv:   size of private data to allocate space for
5179  *      @name:          device name format string
5180  *      @setup:         callback to initialize device
5181  *      @queue_count:   the number of subqueues to allocate
5182  *
5183  *      Allocates a struct net_device with private data area for driver use
5184  *      and performs basic initialization.  Also allocates subquue structs
5185  *      for each queue on the device at the end of the netdevice.
5186  */
5187 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5188                 void (*setup)(struct net_device *), unsigned int queue_count)
5189 {
5190         struct netdev_queue *tx;
5191         struct net_device *dev;
5192         size_t alloc_size;
5193         struct net_device *p;
5194
5195         BUG_ON(strlen(name) >= sizeof(dev->name));
5196
5197         alloc_size = sizeof(struct net_device);
5198         if (sizeof_priv) {
5199                 /* ensure 32-byte alignment of private area */
5200                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5201                 alloc_size += sizeof_priv;
5202         }
5203         /* ensure 32-byte alignment of whole construct */
5204         alloc_size += NETDEV_ALIGN - 1;
5205
5206         p = kzalloc(alloc_size, GFP_KERNEL);
5207         if (!p) {
5208                 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5209                 return NULL;
5210         }
5211
5212         tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
5213         if (!tx) {
5214                 printk(KERN_ERR "alloc_netdev: Unable to allocate "
5215                        "tx qdiscs.\n");
5216                 goto free_p;
5217         }
5218
5219         dev = PTR_ALIGN(p, NETDEV_ALIGN);
5220         dev->padded = (char *)dev - (char *)p;
5221
5222         if (dev_addr_init(dev))
5223                 goto free_tx;
5224
5225         dev_unicast_init(dev);
5226
5227         dev_net_set(dev, &init_net);
5228
5229         dev->_tx = tx;
5230         dev->num_tx_queues = queue_count;
5231         dev->real_num_tx_queues = queue_count;
5232
5233         dev->gso_max_size = GSO_MAX_SIZE;
5234
5235         netdev_init_queues(dev);
5236
5237         INIT_LIST_HEAD(&dev->napi_list);
5238         dev->priv_flags = IFF_XMIT_DST_RELEASE;
5239         setup(dev);
5240         strcpy(dev->name, name);
5241         return dev;
5242
5243 free_tx:
5244         kfree(tx);
5245
5246 free_p:
5247         kfree(p);
5248         return NULL;
5249 }
5250 EXPORT_SYMBOL(alloc_netdev_mq);
5251
5252 /**
5253  *      free_netdev - free network device
5254  *      @dev: device
5255  *
5256  *      This function does the last stage of destroying an allocated device
5257  *      interface. The reference to the device object is released.
5258  *      If this is the last reference then it will be freed.
5259  */
5260 void free_netdev(struct net_device *dev)
5261 {
5262         struct napi_struct *p, *n;
5263
5264         release_net(dev_net(dev));
5265
5266         kfree(dev->_tx);
5267
5268         /* Flush device addresses */
5269         dev_addr_flush(dev);
5270
5271         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5272                 netif_napi_del(p);
5273
5274         /*  Compatibility with error handling in drivers */
5275         if (dev->reg_state == NETREG_UNINITIALIZED) {
5276                 kfree((char *)dev - dev->padded);
5277                 return;
5278         }
5279
5280         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5281         dev->reg_state = NETREG_RELEASED;
5282
5283         /* will free via device release */
5284         put_device(&dev->dev);
5285 }
5286 EXPORT_SYMBOL(free_netdev);
5287
/**
 *      synchronize_net - wait for in-flight packet receive processing
 *
 *      Blocks until packets that are currently being received have been
 *      processed.  Packets whose processing starts afterwards are not
 *      held back.  May sleep.
 */
void synchronize_net(void)
{
        might_sleep();
        synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);
5300
5301 /**
5302  *      unregister_netdevice_queue - remove device from the kernel
5303  *      @dev: device
5304  *      @head: list
5305
5306  *      This function shuts down a device interface and removes it
5307  *      from the kernel tables.
5308  *      If head not NULL, device is queued to be unregistered later.
5309  *
5310  *      Callers must hold the rtnl semaphore.  You may want
5311  *      unregister_netdev() instead of this.
5312  */
5313
5314 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5315 {
5316         ASSERT_RTNL();
5317
5318         if (head) {
5319                 list_add_tail(&dev->unreg_list, head);
5320         } else {
5321                 rollback_registered(dev);
5322                 /* Finish processing unregister after unlock */
5323                 net_set_todo(dev);
5324         }
5325 }
5326 EXPORT_SYMBOL(unregister_netdevice_queue);
5327
5328 /**
5329  *      unregister_netdevice_many - unregister many devices
5330  *      @head: list of devices
5331  *
5332  */
5333 void unregister_netdevice_many(struct list_head *head)
5334 {
5335         struct net_device *dev;
5336
5337         if (!list_empty(head)) {
5338                 rollback_registered_many(head);
5339                 list_for_each_entry(dev, head, unreg_list)
5340                         net_set_todo(dev);
5341         }
5342 }
5343 EXPORT_SYMBOL(unregister_netdevice_many);
5344
/**
 *      unregister_netdev - remove device from the kernel
 *      @dev: device
 *
 *      Shuts down a device interface and removes it from the kernel
 *      tables.
 *
 *      This simply calls unregister_netdevice() with the rtnl semaphore
 *      held.  In general this is the function to use rather than
 *      unregister_netdevice().
 */
void unregister_netdev(struct net_device *dev)
{
        rtnl_lock();
        unregister_netdevice(dev);
        rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
5363
5364 /**
5365  *      dev_change_net_namespace - move device to different nethost namespace
5366  *      @dev: device
5367  *      @net: network namespace
5368  *      @pat: If not NULL name pattern to try if the current device name
5369  *            is already taken in the destination network namespace.
5370  *
5371  *      This function shuts down a device interface and moves it
5372  *      to a new network namespace. On success 0 is returned, on
5373  *      a failure a netagive errno code is returned.
5374  *
5375  *      Callers must hold the rtnl semaphore.
5376  */
5377
5378 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5379 {
5380         char buf[IFNAMSIZ];
5381         const char *destname;
5382         int err;
5383
5384         ASSERT_RTNL();
5385
5386         /* Don't allow namespace local devices to be moved. */
5387         err = -EINVAL;
5388         if (dev->features & NETIF_F_NETNS_LOCAL)
5389                 goto out;
5390
5391 #ifdef CONFIG_SYSFS
5392         /* Don't allow real devices to be moved when sysfs
5393          * is enabled.
5394          */
5395         err = -EINVAL;
5396         if (dev->dev.parent)
5397                 goto out;
5398 #endif
5399
5400         /* Ensure the device has been registrered */
5401         err = -EINVAL;
5402         if (dev->reg_state != NETREG_REGISTERED)
5403                 goto out;
5404
5405         /* Get out if there is nothing todo */
5406         err = 0;
5407         if (net_eq(dev_net(dev), net))
5408                 goto out;
5409
5410         /* Pick the destination device name, and ensure
5411          * we can use it in the destination network namespace.
5412          */
5413         err = -EEXIST;
5414         destname = dev->name;
5415         if (__dev_get_by_name(net, destname)) {
5416                 /* We get here if we can't use the current device name */
5417                 if (!pat)
5418                         goto out;
5419                 if (!dev_valid_name(pat))
5420                         goto out;
5421                 if (strchr(pat, '%')) {
5422                         if (__dev_alloc_name(net, pat, buf) < 0)
5423                                 goto out;
5424                         destname = buf;
5425                 } else
5426                         destname = pat;
5427                 if (__dev_get_by_name(net, destname))
5428                         goto out;
5429         }
5430
5431         /*
5432          * And now a mini version of register_netdevice unregister_netdevice.
5433          */
5434
5435         /* If device is running close it first. */
5436         dev_close(dev);
5437
5438         /* And unlink it from device chain */
5439         err = -ENODEV;
5440         unlist_netdevice(dev);
5441
5442         synchronize_net();
5443
5444         /* Shutdown queueing discipline. */
5445         dev_shutdown(dev);
5446
5447         /* Notify protocols, that we are about to destroy
5448            this device. They should clean all the things.
5449         */
5450         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5451
5452         /*
5453          *      Flush the unicast and multicast chains
5454          */
5455         dev_unicast_flush(dev);
5456         dev_addr_discard(dev);
5457
5458         netdev_unregister_kobject(dev);
5459
5460         /* Actually switch the network namespace */
5461         dev_net_set(dev, net);
5462
5463         /* Assign the new device name */
5464         if (destname != dev->name)
5465                 strcpy(dev->name, destname);
5466
5467         /* If there is an ifindex conflict assign a new one */
5468         if (__dev_get_by_index(net, dev->ifindex)) {
5469                 int iflink = (dev->iflink == dev->ifindex);
5470                 dev->ifindex = dev_new_index(net);
5471                 if (iflink)
5472                         dev->iflink = dev->ifindex;
5473         }
5474
5475         /* Fixup kobjects */
5476         err = netdev_register_kobject(dev);
5477         WARN_ON(err);
5478
5479         /* Add the device back in the hashes */
5480         list_netdevice(dev);
5481
5482         /* Notify protocols, that a new device appeared. */
5483         call_netdevice_notifiers(NETDEV_REGISTER, dev);
5484
5485         synchronize_net();
5486         err = 0;
5487 out:
5488         return err;
5489 }
5490 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
5491
5492 static int dev_cpu_callback(struct notifier_block *nfb,
5493                             unsigned long action,
5494                             void *ocpu)
5495 {
5496         struct sk_buff **list_skb;
5497         struct Qdisc **list_net;
5498         struct sk_buff *skb;
5499         unsigned int cpu, oldcpu = (unsigned long)ocpu;
5500         struct softnet_data *sd, *oldsd;
5501
5502         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
5503                 return NOTIFY_OK;
5504
5505         local_irq_disable();
5506         cpu = smp_processor_id();
5507         sd = &per_cpu(softnet_data, cpu);
5508         oldsd = &per_cpu(softnet_data, oldcpu);
5509
5510         /* Find end of our completion_queue. */
5511         list_skb = &sd->completion_queue;
5512         while (*list_skb)
5513                 list_skb = &(*list_skb)->next;
5514         /* Append completion queue from offline CPU. */
5515         *list_skb = oldsd->completion_queue;
5516         oldsd->completion_queue = NULL;
5517
5518         /* Find end of our output_queue. */
5519         list_net = &sd->output_queue;
5520         while (*list_net)
5521                 list_net = &(*list_net)->next_sched;
5522         /* Append output queue from offline CPU. */
5523         *list_net = oldsd->output_queue;
5524         oldsd->output_queue = NULL;
5525
5526         raise_softirq_irqoff(NET_TX_SOFTIRQ);
5527         local_irq_enable();
5528
5529         /* Process offline CPU's input_pkt_queue */
5530         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
5531                 netif_rx(skb);
5532
5533         return NOTIFY_OK;
5534 }
5535
5536
5537 /**
5538  *      netdev_increment_features - increment feature set by one
5539  *      @all: current feature set
5540  *      @one: new feature set
5541  *      @mask: mask feature set
5542  *
5543  *      Computes a new feature set after adding a device with feature set
5544  *      @one to the master device with current feature set @all.  Will not
5545  *      enable anything that is off in @mask. Returns the new feature set.
5546  */
5547 unsigned long netdev_increment_features(unsigned long all, unsigned long one,
5548                                         unsigned long mask)
5549 {
5550         /* If device needs checksumming, downgrade to it. */
5551         if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
5552                 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
5553         else if (mask & NETIF_F_ALL_CSUM) {
5554                 /* If one device supports v4/v6 checksumming, set for all. */
5555                 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
5556                     !(all & NETIF_F_GEN_CSUM)) {
5557                         all &= ~NETIF_F_ALL_CSUM;
5558                         all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
5559                 }
5560
5561                 /* If one device supports hw checksumming, set for all. */
5562                 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
5563                         all &= ~NETIF_F_ALL_CSUM;
5564                         all |= NETIF_F_HW_CSUM;
5565                 }
5566         }
5567
5568         one |= NETIF_F_ALL_CSUM;
5569
5570         one |= all & NETIF_F_ONE_FOR_ALL;
5571         all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
5572         all |= one & mask & NETIF_F_ONE_FOR_ALL;
5573
5574         return all;
5575 }
5576 EXPORT_SYMBOL(netdev_increment_features);
5577
5578 static struct hlist_head *netdev_create_hash(void)
5579 {
5580         int i;
5581         struct hlist_head *hash;
5582
5583         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
5584         if (hash != NULL)
5585                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
5586                         INIT_HLIST_HEAD(&hash[i]);
5587
5588         return hash;
5589 }
5590
5591 /* Initialize per network namespace state */
5592 static int __net_init netdev_init(struct net *net)
5593 {
5594         INIT_LIST_HEAD(&net->dev_base_head);
5595
5596         net->dev_name_head = netdev_create_hash();
5597         if (net->dev_name_head == NULL)
5598                 goto err_name;
5599
5600         net->dev_index_head = netdev_create_hash();
5601         if (net->dev_index_head == NULL)
5602                 goto err_idx;
5603
5604         return 0;
5605
5606 err_idx:
5607         kfree(net->dev_name_head);
5608 err_name:
5609         return -ENOMEM;
5610 }
5611
5612 /**
5613  *      netdev_drivername - network driver for the device
5614  *      @dev: network device
5615  *      @buffer: buffer for resulting name
5616  *      @len: size of buffer
5617  *
5618  *      Determine network driver for device.
5619  */
5620 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
5621 {
5622         const struct device_driver *driver;
5623         const struct device *parent;
5624
5625         if (len <= 0 || !buffer)
5626                 return buffer;
5627         buffer[0] = 0;
5628
5629         parent = dev->dev.parent;
5630
5631         if (!parent)
5632                 return buffer;
5633
5634         driver = parent->driver;
5635         if (driver && driver->name)
5636                 strlcpy(buffer, driver->name, len);
5637         return buffer;
5638 }
5639
5640 static void __net_exit netdev_exit(struct net *net)
5641 {
5642         kfree(net->dev_name_head);
5643         kfree(net->dev_index_head);
5644 }
5645
/*
 * Per network namespace operations for the core device state; handed
 * to register_pernet_subsys() by net_dev_init().
 */
static struct pernet_operations __net_initdata netdev_net_ops = {
        .init = netdev_init,
        .exit = netdev_exit,
};
5650
5651 static void __net_exit default_device_exit(struct net *net)
5652 {
5653         struct net_device *dev;
5654         /*
5655          * Push all migratable of the network devices back to the
5656          * initial network namespace
5657          */
5658         rtnl_lock();
5659 restart:
5660         for_each_netdev(net, dev) {
5661                 int err;
5662                 char fb_name[IFNAMSIZ];
5663
5664                 /* Ignore unmoveable devices (i.e. loopback) */
5665                 if (dev->features & NETIF_F_NETNS_LOCAL)
5666                         continue;
5667
5668                 /* Delete virtual devices */
5669                 if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) {
5670                         dev->rtnl_link_ops->dellink(dev, NULL);
5671                         goto restart;
5672                 }
5673
5674                 /* Push remaing network devices to init_net */
5675                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
5676                 err = dev_change_net_namespace(dev, &init_net, fb_name);
5677                 if (err) {
5678                         printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
5679                                 __func__, dev->name, err);
5680                         BUG();
5681                 }
5682                 goto restart;
5683         }
5684         rtnl_unlock();
5685 }
5686
/*
 * Per network namespace operations that only provide an exit hook;
 * handed to register_pernet_device() by net_dev_init().
 */
static struct pernet_operations __net_initdata default_device_ops = {
        .exit = default_device_exit,
};
5690
5691 /*
5692  *      Initialize the DEV module. At boot time this walks the device list and
5693  *      unhooks any devices that fail to initialise (normally hardware not
5694  *      present) and leaves us with a valid list of present and active devices.
5695  *
5696  */
5697
5698 /*
5699  *       This is called single threaded during boot, so no need
5700  *       to take the rtnl semaphore.
5701  */
5702 static int __init net_dev_init(void)
5703 {
5704         int i, rc = -ENOMEM;
5705
5706         BUG_ON(!dev_boot_phase);
5707
5708         if (dev_proc_init())
5709                 goto out;
5710
5711         if (netdev_kobject_init())
5712                 goto out;
5713
5714         INIT_LIST_HEAD(&ptype_all);
5715         for (i = 0; i < PTYPE_HASH_SIZE; i++)
5716                 INIT_LIST_HEAD(&ptype_base[i]);
5717
5718         if (register_pernet_subsys(&netdev_net_ops))
5719                 goto out;
5720
5721         /*
5722          *      Initialise the packet receive queues.
5723          */
5724
5725         for_each_possible_cpu(i) {
5726                 struct softnet_data *queue;
5727
5728                 queue = &per_cpu(softnet_data, i);
5729                 skb_queue_head_init(&queue->input_pkt_queue);
5730                 queue->completion_queue = NULL;
5731                 INIT_LIST_HEAD(&queue->poll_list);
5732
5733                 queue->backlog.poll = process_backlog;
5734                 queue->backlog.weight = weight_p;
5735                 queue->backlog.gro_list = NULL;
5736                 queue->backlog.gro_count = 0;
5737         }
5738
5739         dev_boot_phase = 0;
5740
5741         /* The loopback device is special if any other network devices
5742          * is present in a network namespace the loopback device must
5743          * be present. Since we now dynamically allocate and free the
5744          * loopback device ensure this invariant is maintained by
5745          * keeping the loopback device as the first device on the
5746          * list of network devices.  Ensuring the loopback devices
5747          * is the first device that appears and the last network device
5748          * that disappears.
5749          */
5750         if (register_pernet_device(&loopback_net_ops))
5751                 goto out;
5752
5753         if (register_pernet_device(&default_device_ops))
5754                 goto out;
5755
5756         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
5757         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
5758
5759         hotcpu_notifier(dev_cpu_callback, 0);
5760         dst_init();
5761         dev_mcast_init();
5762         rc = 0;
5763 out:
5764         return rc;
5765 }
5766
5767 subsys_initcall(net_dev_init);
5768
5769 static int __init initialize_hashrnd(void)
5770 {
5771         get_random_bytes(&skb_tx_hashrnd, sizeof(skb_tx_hashrnd));
5772         return 0;
5773 }
5774
5775 late_initcall_sync(initialize_hashrnd);
5776