net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <asm/system.h>
  77 #include <linux/bitops.h>
  78 #include <linux/capability.h>
  79 #include <linux/cpu.h>
  80 #include <linux/types.h>
  81 #include <linux/kernel.h>
  82 #include <linux/hash.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <net/net_namespace.h>
  98 #include <net/sock.h>
  99 #include <linux/rtnetlink.h>
 100 #include <linux/proc_fs.h>
 101 #include <linux/seq_file.h>
 102 #include <linux/stat.h>
 103 #include <linux/if_bridge.h>
 104 #include <linux/if_macvlan.h>
 105 #include <net/dst.h>
 106 #include <net/pkt_sched.h>
 107 #include <net/checksum.h>
 108 #include <linux/highmem.h>
 109 #include <linux/init.h>
 110 #include <linux/kmod.h>
 111 #include <linux/module.h>
 112 #include <linux/netpoll.h>
 113 #include <linux/rcupdate.h>
 114 #include <linux/delay.h>
 115 #include <net/wext.h>
 116 #include <net/iw_handler.h>
 117 #include <asm/current.h>
 118 #include <linux/audit.h>
 119 #include <linux/dmaengine.h>
 120 #include <linux/err.h>
 121 #include <linux/ctype.h>
 122 #include <linux/if_arp.h>
 123 #include <linux/if_vlan.h>
 124 #include <linux/ip.h>
 125 #include <net/ip.h>
 126 #include <linux/ipv6.h>
 127 #include <linux/in.h>
 128 #include <linux/jhash.h>
 129 #include <linux/random.h>
 130 #include <trace/events/napi.h>
 131
 132 #include "net-sysfs.h"
 133
/*
 * Instead of increasing this, you should create a hash table.
 * NOTE(review): presumably the cap on packets held for GRO merging at
 * once; the users are outside this part of the file -- confirm.
 */
#define MAX_GRO_SKBS 8

/*
 * MAX_HEADER plus 128 bytes.
 * This should be increased if a protocol with a bigger head is added.
 */
#define GRO_MAX_HEAD (MAX_HEADER + 128)
 139
 140 /*
 141  *      The list of packet types we will receive (as opposed to discard)
 142  *      and the routines to invoke.
 143  *
 144  *      Why 16. Because with 16 the only overlap we get on a hash of the
 145  *      low nibble of the protocol value is RARP/SNAP/X.25.
 146  *
 147  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 148  *             sure which should go first, but I bet it won't make much
 149  *             difference if we are running VLANs.  The good news is that
 150  *             this protocol won't be in the list unless compiled in, so
 151  *             the average user (w/out VLANs) will not be adversely affected.
 152  *             --BLG
 153  *
 154  *              0800    IP
 155  *              8100    802.1Q VLAN
 156  *              0001    802.3
 157  *              0002    AX.25
 158  *              0004    802.2
 159  *              8035    RARP
 160  *              0005    SNAP
 161  *              0805    X.25
 162  *              0806    ARP
 163  *              8137    IPX
 164  *              0009    Localtalk
 165  *              86DD    IPv6
 166  */
 167
/*
 * Number of buckets in ptype_base.  PTYPE_HASH_MASK is applied with '&'
 * to the host-order protocol number, so the size must stay a power of two.
 */
#define PTYPE_HASH_SIZE (16)
#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)

/* Taken (with BHs disabled) around every update of ptype_base/ptype_all. */
static DEFINE_SPINLOCK(ptype_lock);
/* Handlers for one specific protocol, bucketed by ntohs(type) & PTYPE_HASH_MASK. */
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
/* Handlers registered for ETH_P_ALL. */
static struct list_head ptype_all __read_mostly;        /* Taps */
 174
/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock().
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * For example usages see register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);
 196
 197 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 198 {
 199         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 200         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 201 }
 202
 203 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 204 {
 205         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 206 }
 207
 208 /* Device list insertion */
 209 static int list_netdevice(struct net_device *dev)
 210 {
 211         struct net *net = dev_net(dev);
 212
 213         ASSERT_RTNL();
 214
 215         write_lock_bh(&dev_base_lock);
 216         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 217         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 218         hlist_add_head_rcu(&dev->index_hlist,
 219                            dev_index_hash(net, dev->ifindex));
 220         write_unlock_bh(&dev_base_lock);
 221         return 0;
 222 }
 223
/*
 * Device list removal.
 *
 * Unlinks @dev from the device list and from the name and ifindex hash
 * chains.  The caller must hold RTNL.  The entries are removed with the
 * RCU list primitives, so the caller must respect an RCU grace period
 * before freeing/reusing dev.
 */
static void unlist_netdevice(struct net_device *dev)
{
        ASSERT_RTNL();

        /* Unlink dev from the device chain */
        write_lock_bh(&dev_base_lock);
        list_del_rcu(&dev->dev_list);
        hlist_del_rcu(&dev->name_hlist);
        hlist_del_rcu(&dev->index_hlist);
        write_unlock_bh(&dev_base_lock);
}
 238
/*
 *      Our notifier list
 *
 *      NOTE(review): declared as a raw notifier head; presumably the
 *      callers provide the serialisation (RTNL?) -- confirm at the
 *      register/call sites, which are outside this part of the file.
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *      Device drivers call our routines to queue packets here. We empty the
 *      queue in the local softnet handler.
 *
 *      One instance exists per CPU, and the symbol is exported.
 */

DEFINE_PER_CPU(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);
 252
 253 #ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 *
 * netdev_lock_type[] and netdev_lock_name[] are parallel tables: the
 * index at which a device type is found in netdev_lock_type[] selects
 * both the lock class name and the lock class key.  The final entry is
 * the one used for device types that are not listed.
 */
static const unsigned short netdev_lock_type[] =
        {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
         ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
         ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
         ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
         ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
         ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
         ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
         ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
         ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
         ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
         ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
         ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
         ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
         ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
         ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
         ARPHRD_VOID, ARPHRD_NONE};

/* Lock class names; entry i names the class for netdev_lock_type[i]. */
static const char *const netdev_lock_name[] =
        {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
         "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
         "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
         "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
         "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
         "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
         "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
         "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
         "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
         "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
         "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
         "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
         "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
         "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
         "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
         "_xmit_VOID", "_xmit_NONE"};

/* One lock class key per table entry, separately for xmit and address-list locks. */
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 296
 297 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 298 {
 299         int i;
 300
 301         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 302                 if (netdev_lock_type[i] == dev_type)
 303                         return i;
 304         /* the last key is used by default */
 305         return ARRAY_SIZE(netdev_lock_type) - 1;
 306 }
 307
 308 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 309                                                  unsigned short dev_type)
 310 {
 311         int i;
 312
 313         i = netdev_lock_pos(dev_type);
 314         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 315                                    netdev_lock_name[i]);
 316 }
 317
 318 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 319 {
 320         int i;
 321
 322         i = netdev_lock_pos(dev->type);
 323         lockdep_set_class_and_name(&dev->addr_list_lock,
 324                                    &netdev_addr_lock_key[i],
 325                                    netdev_lock_name[i]);
 326 }
 327 #else
/* Without CONFIG_LOCKDEP there is no lock class to set: no-op. */
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
                                                 unsigned short dev_type)
{
}
/* Without CONFIG_LOCKDEP there is no lock class to set: no-op. */
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
 335 #endif
 336
 337 /*******************************************************************************
 338
 339                 Protocol management and registration routines
 340
 341 *******************************************************************************/
 342
/*
 *      Add a protocol ID to the list. Now that the input handler is
 *      smarter we can dispense with all the messy stuff that used to be
 *      here.
 *
 *      BEWARE!!! Protocol handlers that mangle input packets
 *      MUST BE last in their hash buckets, and the search for protocol
 *      handlers MUST start from the promiscuous ptype_all chain in net_bh.
 *      This is true now; do not change it.
 *      Explanation: if a protocol handler that mangles packets is
 *      first on the list, it cannot tell that the packet is cloned
 *      and should be copied-on-write, so it will change the packet
 *      and subsequent readers will get a broken packet.
 *                                                      --ANK (980803)
 */
 358
 359 /**
 360  *      dev_add_pack - add packet handler
 361  *      @pt: packet type declaration
 362  *
 363  *      Add a protocol handler to the networking stack. The passed &packet_type
 364  *      is linked into kernel lists and may not be freed until it has been
 365  *      removed from the kernel lists.
 366  *
 367  *      This call does not sleep therefore it can not
 368  *      guarantee all CPU's that are in middle of receiving packets
 369  *      will see the new packet type (until the next received packet).
 370  */
 371
 372 void dev_add_pack(struct packet_type *pt)
 373 {
 374         int hash;
 375
 376         spin_lock_bh(&ptype_lock);
 377         if (pt->type == htons(ETH_P_ALL))
 378                 list_add_rcu(&pt->list, &ptype_all);
 379         else {
 380                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
 381                 list_add_rcu(&pt->list, &ptype_base[hash]);
 382         }
 383         spin_unlock_bh(&ptype_lock);
 384 }
 385 EXPORT_SYMBOL(dev_add_pack);
 386
 387 /**
 388  *      __dev_remove_pack        - remove packet handler
 389  *      @pt: packet type declaration
 390  *
 391  *      Remove a protocol handler that was previously added to the kernel
 392  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 393  *      from the kernel lists and can be freed or reused once this function
 394  *      returns.
 395  *
 396  *      The packet type might still be in use by receivers
 397  *      and must not be freed until after all the CPU's have gone
 398  *      through a quiescent state.
 399  */
 400 void __dev_remove_pack(struct packet_type *pt)
 401 {
 402         struct list_head *head;
 403         struct packet_type *pt1;
 404
 405         spin_lock_bh(&ptype_lock);
 406
 407         if (pt->type == htons(ETH_P_ALL))
 408                 head = &ptype_all;
 409         else
 410                 head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 411
 412         list_for_each_entry(pt1, head, list) {
 413                 if (pt == pt1) {
 414                         list_del_rcu(&pt->list);
 415                         goto out;
 416                 }
 417         }
 418
 419         printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 420 out:
 421         spin_unlock_bh(&ptype_lock);
 422 }
 423 EXPORT_SYMBOL(__dev_remove_pack);
 424
/**
 *      dev_remove_pack  - remove packet handler
 *      @pt: packet type declaration
 *
 *      Remove a protocol handler that was previously added to the kernel
 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *      from the kernel lists and can be freed or reused once this function
 *      returns.
 *
 *      This call sleeps to guarantee that no CPU is looking at the packet
 *      type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
        /* Unlink the handler from its list ... */
        __dev_remove_pack(pt);

        /* ... then wait until no receiver can still be using it. */
        synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
 444
 445 /******************************************************************************
 446
 447                       Device Boot-time Settings Routines
 448
 449 *******************************************************************************/
 450
/*
 * Boot time configuration table.
 * A slot whose name starts with '\0' or ' ' is treated as unused.
 */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 453
 454 /**
 455  *      netdev_boot_setup_add   - add new setup entry
 456  *      @name: name of the device
 457  *      @map: configured settings for the device
 458  *
 459  *      Adds new setup entry to the dev_boot_setup list.  The function
 460  *      returns 0 on error and 1 on success.  This is a generic routine to
 461  *      all netdevices.
 462  */
 463 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 464 {
 465         struct netdev_boot_setup *s;
 466         int i;
 467
 468         s = dev_boot_setup;
 469         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 470                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 471                         memset(s[i].name, 0, sizeof(s[i].name));
 472                         strlcpy(s[i].name, name, IFNAMSIZ);
 473                         memcpy(&s[i].map, map, sizeof(s[i].map));
 474                         break;
 475                 }
 476         }
 477
 478         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 479 }
 480
 481 /**
 482  *      netdev_boot_setup_check - check boot time settings
 483  *      @dev: the netdevice
 484  *
 485  *      Check boot time settings for the device.
 486  *      The found settings are set for the device to be used
 487  *      later in the device probing.
 488  *      Returns 0 if no settings found, 1 if they are.
 489  */
 490 int netdev_boot_setup_check(struct net_device *dev)
 491 {
 492         struct netdev_boot_setup *s = dev_boot_setup;
 493         int i;
 494
 495         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 496                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 497                     !strcmp(dev->name, s[i].name)) {
 498                         dev->irq        = s[i].map.irq;
 499                         dev->base_addr  = s[i].map.base_addr;
 500                         dev->mem_start  = s[i].map.mem_start;
 501                         dev->mem_end    = s[i].map.mem_end;
 502                         return 1;
 503                 }
 504         }
 505         return 0;
 506 }
 507 EXPORT_SYMBOL(netdev_boot_setup_check);
 508
 509
 510 /**
 511  *      netdev_boot_base        - get address from boot time settings
 512  *      @prefix: prefix for network device
 513  *      @unit: id for network device
 514  *
 515  *      Check boot time settings for the base address of device.
 516  *      The found settings are set for the device to be used
 517  *      later in the device probing.
 518  *      Returns 0 if no settings found.
 519  */
 520 unsigned long netdev_boot_base(const char *prefix, int unit)
 521 {
 522         const struct netdev_boot_setup *s = dev_boot_setup;
 523         char name[IFNAMSIZ];
 524         int i;
 525
 526         sprintf(name, "%s%d", prefix, unit);
 527
 528         /*
 529          * If device already registered then return base of 1
 530          * to indicate not to probe for this interface
 531          */
 532         if (__dev_get_by_name(&init_net, name))
 533                 return 1;
 534
 535         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 536                 if (!strcmp(name, s[i].name))
 537                         return s[i].map.base_addr;
 538         return 0;
 539 }
 540
 541 /*
 542  * Saves at boot time configured settings for any netdevice.
 543  */
 544 int __init netdev_boot_setup(char *str)
 545 {
 546         int ints[5];
 547         struct ifmap map;
 548
 549         str = get_options(str, ARRAY_SIZE(ints), ints);
 550         if (!str || !*str)
 551                 return 0;
 552
 553         /* Save settings */
 554         memset(&map, 0, sizeof(map));
 555         if (ints[0] > 0)
 556                 map.irq = ints[1];
 557         if (ints[0] > 1)
 558                 map.base_addr = ints[2];
 559         if (ints[0] > 2)
 560                 map.mem_start = ints[3];
 561         if (ints[0] > 3)
 562                 map.mem_end = ints[4];
 563
 564         /* Add new entry to the list */
 565         return netdev_boot_setup_add(str, &map);
 566 }
 567
 568 __setup("netdev=", netdev_boot_setup);
 569
 570 /*******************************************************************************
 571
 572                             Device Interface Subroutines
 573
 574 *******************************************************************************/
 575
 576 /**
 577  *      __dev_get_by_name       - find a device by its name
 578  *      @net: the applicable net namespace
 579  *      @name: name to find
 580  *
 581  *      Find an interface by name. Must be called under RTNL semaphore
 582  *      or @dev_base_lock. If the name is found a pointer to the device
 583  *      is returned. If the name is not found then %NULL is returned. The
 584  *      reference counters are not incremented so the caller must be
 585  *      careful with locks.
 586  */
 587
 588 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 589 {
 590         struct hlist_node *p;
 591         struct net_device *dev;
 592         struct hlist_head *head = dev_name_hash(net, name);
 593
 594         hlist_for_each_entry(dev, p, head, name_hlist)
 595                 if (!strncmp(dev->name, name, IFNAMSIZ))
 596                         return dev;
 597
 598         return NULL;
 599 }
 600 EXPORT_SYMBOL(__dev_get_by_name);
 601
 602 /**
 603  *      dev_get_by_name_rcu     - find a device by its name
 604  *      @net: the applicable net namespace
 605  *      @name: name to find
 606  *
 607  *      Find an interface by name.
 608  *      If the name is found a pointer to the device is returned.
 609  *      If the name is not found then %NULL is returned.
 610  *      The reference counters are not incremented so the caller must be
 611  *      careful with locks. The caller must hold RCU lock.
 612  */
 613
 614 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 615 {
 616         struct hlist_node *p;
 617         struct net_device *dev;
 618         struct hlist_head *head = dev_name_hash(net, name);
 619
 620         hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 621                 if (!strncmp(dev->name, name, IFNAMSIZ))
 622                         return dev;
 623
 624         return NULL;
 625 }
 626 EXPORT_SYMBOL(dev_get_by_name_rcu);
 627
 628 /**
 629  *      dev_get_by_name         - find a device by its name
 630  *      @net: the applicable net namespace
 631  *      @name: name to find
 632  *
 633  *      Find an interface by name. This can be called from any
 634  *      context and does its own locking. The returned handle has
 635  *      the usage count incremented and the caller must use dev_put() to
 636  *      release it when it is no longer needed. %NULL is returned if no
 637  *      matching device is found.
 638  */
 639
 640 struct net_device *dev_get_by_name(struct net *net, const char *name)
 641 {
 642         struct net_device *dev;
 643
 644         rcu_read_lock();
 645         dev = dev_get_by_name_rcu(net, name);
 646         if (dev)
 647                 dev_hold(dev);
 648         rcu_read_unlock();
 649         return dev;
 650 }
 651 EXPORT_SYMBOL(dev_get_by_name);
 652
 653 /**
 654  *      __dev_get_by_index - find a device by its ifindex
 655  *      @net: the applicable net namespace
 656  *      @ifindex: index of device
 657  *
 658  *      Search for an interface by index. Returns %NULL if the device
 659  *      is not found or a pointer to the device. The device has not
 660  *      had its reference counter increased so the caller must be careful
 661  *      about locking. The caller must hold either the RTNL semaphore
 662  *      or @dev_base_lock.
 663  */
 664
 665 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 666 {
 667         struct hlist_node *p;
 668         struct net_device *dev;
 669         struct hlist_head *head = dev_index_hash(net, ifindex);
 670
 671         hlist_for_each_entry(dev, p, head, index_hlist)
 672                 if (dev->ifindex == ifindex)
 673                         return dev;
 674
 675         return NULL;
 676 }
 677 EXPORT_SYMBOL(__dev_get_by_index);
 678
 679 /**
 680  *      dev_get_by_index_rcu - find a device by its ifindex
 681  *      @net: the applicable net namespace
 682  *      @ifindex: index of device
 683  *
 684  *      Search for an interface by index. Returns %NULL if the device
 685  *      is not found or a pointer to the device. The device has not
 686  *      had its reference counter increased so the caller must be careful
 687  *      about locking. The caller must hold RCU lock.
 688  */
 689
 690 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 691 {
 692         struct hlist_node *p;
 693         struct net_device *dev;
 694         struct hlist_head *head = dev_index_hash(net, ifindex);
 695
 696         hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 697                 if (dev->ifindex == ifindex)
 698                         return dev;
 699
 700         return NULL;
 701 }
 702 EXPORT_SYMBOL(dev_get_by_index_rcu);
 703
 704
 705 /**
 706  *      dev_get_by_index - find a device by its ifindex
 707  *      @net: the applicable net namespace
 708  *      @ifindex: index of device
 709  *
 710  *      Search for an interface by index. Returns NULL if the device
 711  *      is not found or a pointer to the device. The device returned has
 712  *      had a reference added and the pointer is safe until the user calls
 713  *      dev_put to indicate they have finished with it.
 714  */
 715
 716 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 717 {
 718         struct net_device *dev;
 719
 720         rcu_read_lock();
 721         dev = dev_get_by_index_rcu(net, ifindex);
 722         if (dev)
 723                 dev_hold(dev);
 724         rcu_read_unlock();
 725         return dev;
 726 }
 727 EXPORT_SYMBOL(dev_get_by_index);
 728
 729 /**
 730  *      dev_getbyhwaddr - find a device by its hardware address
 731  *      @net: the applicable net namespace
 732  *      @type: media type of device
 733  *      @ha: hardware address
 734  *
 735  *      Search for an interface by MAC address. Returns NULL if the device
 736  *      is not found or a pointer to the device. The caller must hold the
 737  *      rtnl semaphore. The returned device has not had its ref count increased
 738  *      and the caller must therefore be careful about locking
 739  *
 740  *      BUGS:
 741  *      If the API was consistent this would be __dev_get_by_hwaddr
 742  */
 743
 744 struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
 745 {
 746         struct net_device *dev;
 747
 748         ASSERT_RTNL();
 749
 750         for_each_netdev(net, dev)
 751                 if (dev->type == type &&
 752                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 753                         return dev;
 754
 755         return NULL;
 756 }
 757 EXPORT_SYMBOL(dev_getbyhwaddr);
 758
 759 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 760 {
 761         struct net_device *dev;
 762
 763         ASSERT_RTNL();
 764         for_each_netdev(net, dev)
 765                 if (dev->type == type)
 766                         return dev;
 767
 768         return NULL;
 769 }
 770 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 771
 772 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 773 {
 774         struct net_device *dev;
 775
 776         rtnl_lock();
 777         dev = __dev_getfirstbyhwtype(net, type);
 778         if (dev)
 779                 dev_hold(dev);
 780         rtnl_unlock();
 781         return dev;
 782 }
 783 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 784
 785 /**
 786  *      dev_get_by_flags - find any device with given flags
 787  *      @net: the applicable net namespace
 788  *      @if_flags: IFF_* values
 789  *      @mask: bitmask of bits in if_flags to check
 790  *
 791  *      Search for any interface with the given flags. Returns NULL if a device
 792  *      is not found or a pointer to the device. The device returned has
 793  *      had a reference added and the pointer is safe until the user calls
 794  *      dev_put to indicate they have finished with it.
 795  */
 796
 797 struct net_device *dev_get_by_flags(struct net *net, unsigned short if_flags,
 798                                     unsigned short mask)
 799 {
 800         struct net_device *dev, *ret;
 801
 802         ret = NULL;
 803         rcu_read_lock();
 804         for_each_netdev_rcu(net, dev) {
 805                 if (((dev->flags ^ if_flags) & mask) == 0) {
 806                         dev_hold(dev);
 807                         ret = dev;
 808                         break;
 809                 }
 810         }
 811         rcu_read_unlock();
 812         return ret;
 813 }
 814 EXPORT_SYMBOL(dev_get_by_flags);
 815
 816 /**
 817  *      dev_valid_name - check if name is okay for network device
 818  *      @name: name string
 819  *
 820  *      Network device names need to be valid file names to
 821  *      to allow sysfs to work.  We also disallow any kind of
 822  *      whitespace.
 823  */
 824 int dev_valid_name(const char *name)
 825 {
 826         if (*name == '\0')
 827                 return 0;
 828         if (strlen(name) >= IFNAMSIZ)
 829                 return 0;
 830         if (!strcmp(name, ".") || !strcmp(name, ".."))
 831                 return 0;
 832
 833         while (*name) {
 834                 if (*name == '/' || isspace(*name))
 835                         return 0;
 836                 name++;
 837         }
 838         return 1;
 839 }
 840 EXPORT_SYMBOL(dev_valid_name);
 841
 842 /**
 843  *      __dev_alloc_name - allocate a name for a device
 844  *      @net: network namespace to allocate the device name in
 845  *      @name: name format string
 846  *      @buf:  scratch buffer and result name string
 847  *
 848  *      Passed a format string - eg "lt%d" it will try and find a suitable
 849  *      id. It scans list of devices to build up a free map, then chooses
 850  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 851  *      while allocating the name and adding the device in order to avoid
 852  *      duplicates.
 853  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 854  *      Returns the number of the unit assigned or a negative errno code.
 855  */
 856
 857 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 858 {
 859         int i = 0;
 860         const char *p;
 861         const int max_netdevices = 8*PAGE_SIZE;
 862         unsigned long *inuse;
 863         struct net_device *d;
 864
 865         p = strnchr(name, IFNAMSIZ-1, '%');
 866         if (p) {
 867                 /*
 868                  * Verify the string as this thing may have come from
 869                  * the user.  There must be either one "%d" and no other "%"
 870                  * characters.
 871                  */
 872                 if (p[1] != 'd' || strchr(p + 2, '%'))
 873                         return -EINVAL;
 874
 875                 /* Use one page as a bit array of possible slots */
 876                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 877                 if (!inuse)
 878                         return -ENOMEM;
 879
 880                 for_each_netdev(net, d) {
 881                         if (!sscanf(d->name, name, &i))
 882                                 continue;
 883                         if (i < 0 || i >= max_netdevices)
 884                                 continue;
 885
 886                         /*  avoid cases where sscanf is not exact inverse of printf */
 887                         snprintf(buf, IFNAMSIZ, name, i);
 888                         if (!strncmp(buf, d->name, IFNAMSIZ))
 889                                 set_bit(i, inuse);
 890                 }
 891
 892                 i = find_first_zero_bit(inuse, max_netdevices);
 893                 free_page((unsigned long) inuse);
 894         }
 895
 896         snprintf(buf, IFNAMSIZ, name, i);
 897         if (!__dev_get_by_name(net, buf))
 898                 return i;
 899
 900         /* It is possible to run out of possible slots
 901          * when the name is long and there isn't enough space left
 902          * for the digits, or if all bits are used.
 903          */
 904         return -ENFILE;
 905 }
 906
 907 /**
 908  *      dev_alloc_name - allocate a name for a device
 909  *      @dev: device
 910  *      @name: name format string
 911  *
 912  *      Passed a format string - eg "lt%d" it will try and find a suitable
 913  *      id. It scans list of devices to build up a free map, then chooses
 914  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 915  *      while allocating the name and adding the device in order to avoid
 916  *      duplicates.
 917  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 918  *      Returns the number of the unit assigned or a negative errno code.
 919  */
 920
 921 int dev_alloc_name(struct net_device *dev, const char *name)
 922 {
 923         char buf[IFNAMSIZ];
 924         struct net *net;
 925         int ret;
 926
 927         BUG_ON(!dev_net(dev));
 928         net = dev_net(dev);
 929         ret = __dev_alloc_name(net, name, buf);
 930         if (ret >= 0)
 931                 strlcpy(dev->name, buf, IFNAMSIZ);
 932         return ret;
 933 }
 934 EXPORT_SYMBOL(dev_alloc_name);
 935
 936
 937 /**
 938  *      dev_change_name - change name of a device
 939  *      @dev: device
 940  *      @newname: name (or format string) must be at least IFNAMSIZ
 941  *
 942  *      Change name of a device, can pass format strings "eth%d".
 943  *      for wildcarding.
 944  */
 945 int dev_change_name(struct net_device *dev, const char *newname)
 946 {
 947         char oldname[IFNAMSIZ];
 948         int err = 0;
 949         int ret;
 950         struct net *net;
 951
 952         ASSERT_RTNL();
 953         BUG_ON(!dev_net(dev));
 954
 955         net = dev_net(dev);
 956         if (dev->flags & IFF_UP)
 957                 return -EBUSY;
 958
 959         if (!dev_valid_name(newname))
 960                 return -EINVAL;
 961
 962         if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
 963                 return 0;
 964
 965         memcpy(oldname, dev->name, IFNAMSIZ);
 966
 967         if (strchr(newname, '%')) {
 968                 err = dev_alloc_name(dev, newname);
 969                 if (err < 0)
 970                         return err;
 971         } else if (__dev_get_by_name(net, newname))
 972                 return -EEXIST;
 973         else
 974                 strlcpy(dev->name, newname, IFNAMSIZ);
 975
 976 rollback:
 977         /* For now only devices in the initial network namespace
 978          * are in sysfs.
 979          */
 980         if (net == &init_net) {
 981                 ret = device_rename(&dev->dev, dev->name);
 982                 if (ret) {
 983                         memcpy(dev->name, oldname, IFNAMSIZ);
 984                         return ret;
 985                 }
 986         }
 987
 988         write_lock_bh(&dev_base_lock);
 989         hlist_del(&dev->name_hlist);
 990         write_unlock_bh(&dev_base_lock);
 991
 992         synchronize_rcu();
 993
 994         write_lock_bh(&dev_base_lock);
 995         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 996         write_unlock_bh(&dev_base_lock);
 997
 998         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
 999         ret = notifier_to_errno(ret);
1000
1001         if (ret) {
1002                 if (err) {
1003                         printk(KERN_ERR
1004                                "%s: name change rollback failed: %d.\n",
1005                                dev->name, ret);
1006                 } else {
1007                         err = ret;
1008                         memcpy(dev->name, oldname, IFNAMSIZ);
1009                         goto rollback;
1010                 }
1011         }
1012
1013         return err;
1014 }
1015
1016 /**
1017  *      dev_set_alias - change ifalias of a device
1018  *      @dev: device
1019  *      @alias: name up to IFALIASZ
1020  *      @len: limit of bytes to copy from info
1021  *
1022  *      Set ifalias for a device,
1023  */
1024 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1025 {
1026         ASSERT_RTNL();
1027
1028         if (len >= IFALIASZ)
1029                 return -EINVAL;
1030
1031         if (!len) {
1032                 if (dev->ifalias) {
1033                         kfree(dev->ifalias);
1034                         dev->ifalias = NULL;
1035                 }
1036                 return 0;
1037         }
1038
1039         dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1040         if (!dev->ifalias)
1041                 return -ENOMEM;
1042
1043         strlcpy(dev->ifalias, alias, len+1);
1044         return len;
1045 }
1046
1047
/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features. Raises a
 *	%NETDEV_FEAT_CHANGE event for @dev on the netdevice notifier
 *	chain; the notifiers' return value is discarded.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);
1059
1060 /**
1061  *      netdev_state_change - device changes state
1062  *      @dev: device to cause notification
1063  *
1064  *      Called to indicate a device has changed state. This function calls
1065  *      the notifier chains for netdev_chain and sends a NEWLINK message
1066  *      to the routing socket.
1067  */
1068 void netdev_state_change(struct net_device *dev)
1069 {
1070         if (dev->flags & IFF_UP) {
1071                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1072                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1073         }
1074 }
1075 EXPORT_SYMBOL(netdev_state_change);
1076
/**
 *	netdev_bonding_change - raise a caller-chosen netdevice event
 *	@dev: device the event is about
 *	@event: notifier event value, passed through unmodified
 *
 *	Calls the netdevice notifier chain with @event for @dev. The
 *	notifiers' return value is discarded.
 */
void netdev_bonding_change(struct net_device *dev, unsigned long event)
{
	call_netdevice_notifiers(event, dev);
}
EXPORT_SYMBOL(netdev_bonding_change);
1082
1083 /**
1084  *      dev_load        - load a network module
1085  *      @net: the applicable net namespace
1086  *      @name: name of interface
1087  *
1088  *      If a network interface is not present and the process has suitable
1089  *      privileges this function loads the module. If module loading is not
1090  *      available in this kernel then it becomes a nop.
1091  */
1092
1093 void dev_load(struct net *net, const char *name)
1094 {
1095         struct net_device *dev;
1096
1097         rcu_read_lock();
1098         dev = dev_get_by_name_rcu(net, name);
1099         rcu_read_unlock();
1100
1101         if (!dev && capable(CAP_NET_ADMIN))
1102                 request_module("%s", name);
1103 }
1104 EXPORT_SYMBOL(dev_load);
1105
1106 /**
1107  *      dev_open        - prepare an interface for use.
1108  *      @dev:   device to open
1109  *
1110  *      Takes a device from down to up state. The device's private open
1111  *      function is invoked and then the multicast lists are loaded. Finally
1112  *      the device is moved into the up state and a %NETDEV_UP message is
1113  *      sent to the netdev notifier chain.
1114  *
1115  *      Calling this function on an active interface is a nop. On a failure
1116  *      a negative errno code is returned.
1117  */
1118 int dev_open(struct net_device *dev)
1119 {
1120         const struct net_device_ops *ops = dev->netdev_ops;
1121         int ret;
1122
1123         ASSERT_RTNL();
1124
1125         /*
1126          *      Is it already up?
1127          */
1128
1129         if (dev->flags & IFF_UP)
1130                 return 0;
1131
1132         /*
1133          *      Is it even present?
1134          */
1135         if (!netif_device_present(dev))
1136                 return -ENODEV;
1137
1138         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1139         ret = notifier_to_errno(ret);
1140         if (ret)
1141                 return ret;
1142
1143         /*
1144          *      Call device private open method
1145          */
1146         set_bit(__LINK_STATE_START, &dev->state);
1147
1148         if (ops->ndo_validate_addr)
1149                 ret = ops->ndo_validate_addr(dev);
1150
1151         if (!ret && ops->ndo_open)
1152                 ret = ops->ndo_open(dev);
1153
1154         /*
1155          *      If it went open OK then:
1156          */
1157
1158         if (ret)
1159                 clear_bit(__LINK_STATE_START, &dev->state);
1160         else {
1161                 /*
1162                  *      Set the flags.
1163                  */
1164                 dev->flags |= IFF_UP;
1165
1166                 /*
1167                  *      Enable NET_DMA
1168                  */
1169                 net_dmaengine_get();
1170
1171                 /*
1172                  *      Initialize multicasting status
1173                  */
1174                 dev_set_rx_mode(dev);
1175
1176                 /*
1177                  *      Wakeup transmit queue engine
1178                  */
1179                 dev_activate(dev);
1180
1181                 /*
1182                  *      ... and announce new interface.
1183                  */
1184                 call_netdevice_notifiers(NETDEV_UP, dev);
1185         }
1186
1187         return ret;
1188 }
1189 EXPORT_SYMBOL(dev_open);
1190
1191 /**
1192  *      dev_close - shutdown an interface.
1193  *      @dev: device to shutdown
1194  *
1195  *      This function moves an active device into down state. A
1196  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1197  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1198  *      chain.
1199  */
1200 int dev_close(struct net_device *dev)
1201 {
1202         const struct net_device_ops *ops = dev->netdev_ops;
1203         ASSERT_RTNL();
1204
1205         might_sleep();
1206
1207         if (!(dev->flags & IFF_UP))
1208                 return 0;
1209
1210         /*
1211          *      Tell people we are going down, so that they can
1212          *      prepare to death, when device is still operating.
1213          */
1214         call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1215
1216         clear_bit(__LINK_STATE_START, &dev->state);
1217
1218         /* Synchronize to scheduled poll. We cannot touch poll list,
1219          * it can be even on different cpu. So just clear netif_running().
1220          *
1221          * dev->stop() will invoke napi_disable() on all of it's
1222          * napi_struct instances on this device.
1223          */
1224         smp_mb__after_clear_bit(); /* Commit netif_running(). */
1225
1226         dev_deactivate(dev);
1227
1228         /*
1229          *      Call the device specific close. This cannot fail.
1230          *      Only if device is UP
1231          *
1232          *      We allow it to be called even after a DETACH hot-plug
1233          *      event.
1234          */
1235         if (ops->ndo_stop)
1236                 ops->ndo_stop(dev);
1237
1238         /*
1239          *      Device is now down.
1240          */
1241
1242         dev->flags &= ~IFF_UP;
1243
1244         /*
1245          * Tell people we are down
1246          */
1247         call_netdevice_notifiers(NETDEV_DOWN, dev);
1248
1249         /*
1250          *      Shutdown NET_DMA
1251          */
1252         net_dmaengine_put();
1253
1254         return 0;
1255 }
1256 EXPORT_SYMBOL(dev_close);
1257
1258
1259 /**
1260  *      dev_disable_lro - disable Large Receive Offload on a device
1261  *      @dev: device
1262  *
1263  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1264  *      called under RTNL.  This is needed if received packets may be
1265  *      forwarded to another interface.
1266  */
1267 void dev_disable_lro(struct net_device *dev)
1268 {
1269         if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1270             dev->ethtool_ops->set_flags) {
1271                 u32 flags = dev->ethtool_ops->get_flags(dev);
1272                 if (flags & ETH_FLAG_LRO) {
1273                         flags &= ~ETH_FLAG_LRO;
1274                         dev->ethtool_ops->set_flags(dev, flags);
1275                 }
1276         }
1277         WARN_ON(dev->features & NETIF_F_LRO);
1278 }
1279 EXPORT_SYMBOL(dev_disable_lro);
1280
1281
/*
 * Starts out as 1. While it is non-zero, register_netdevice_notifier()
 * only links the notifier and does not replay events for existing devices.
 */
static int dev_boot_phase = 1;
1283
1284 /*
1285  *      Device change register/unregister. These are not inline or static
1286  *      as we export them to the world.
1287  */
1288
1289 /**
1290  *      register_netdevice_notifier - register a network notifier block
1291  *      @nb: notifier
1292  *
1293  *      Register a notifier to be called when network device events occur.
1294  *      The notifier passed is linked into the kernel structures and must
1295  *      not be reused until it has been unregistered. A negative errno code
1296  *      is returned on a failure.
1297  *
1298  *      When registered all registration and up events are replayed
1299  *      to the new notifier to allow device to have a race free
1300  *      view of the network device list.
1301  */
1302
1303 int register_netdevice_notifier(struct notifier_block *nb)
1304 {
1305         struct net_device *dev;
1306         struct net_device *last;
1307         struct net *net;
1308         int err;
1309
1310         rtnl_lock();
1311         err = raw_notifier_chain_register(&netdev_chain, nb);
1312         if (err)
1313                 goto unlock;
1314         if (dev_boot_phase)
1315                 goto unlock;
1316         for_each_net(net) {
1317                 for_each_netdev(net, dev) {
1318                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1319                         err = notifier_to_errno(err);
1320                         if (err)
1321                                 goto rollback;
1322
1323                         if (!(dev->flags & IFF_UP))
1324                                 continue;
1325
1326                         nb->notifier_call(nb, NETDEV_UP, dev);
1327                 }
1328         }
1329
1330 unlock:
1331         rtnl_unlock();
1332         return err;
1333
1334 rollback:
1335         last = dev;
1336         for_each_net(net) {
1337                 for_each_netdev(net, dev) {
1338                         if (dev == last)
1339                                 break;
1340
1341                         if (dev->flags & IFF_UP) {
1342                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1343                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1344                         }
1345                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1346                 }
1347         }
1348
1349         raw_notifier_chain_unregister(&netdev_chain, nb);
1350         goto unlock;
1351 }
1352 EXPORT_SYMBOL(register_netdevice_notifier);
1353
1354 /**
1355  *      unregister_netdevice_notifier - unregister a network notifier block
1356  *      @nb: notifier
1357  *
1358  *      Unregister a notifier previously registered by
1359  *      register_netdevice_notifier(). The notifier is unlinked into the
1360  *      kernel structures and may then be reused. A negative errno code
1361  *      is returned on a failure.
1362  */
1363
1364 int unregister_netdevice_notifier(struct notifier_block *nb)
1365 {
1366         int err;
1367
1368         rtnl_lock();
1369         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1370         rtnl_unlock();
1371         return err;
1372 }
1373 EXPORT_SYMBOL(unregister_netdevice_notifier);
1374
/**
 *	call_netdevice_notifiers - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *
 *	Call all network notifier blocks on netdev_chain.  Parameters and
 *	return value are as for raw_notifier_call_chain(); callers in this
 *	file that care about failure convert the result with
 *	notifier_to_errno().
 */
int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	return raw_notifier_call_chain(&netdev_chain, val, dev);
}
1388
/*
 * When > 0 there are consumers of rx skb time stamps. Raised by
 * net_enable_timestamp(), lowered by net_disable_timestamp(), and read by
 * net_timestamp() to decide whether a packet gets stamped.
 */
static atomic_t netstamp_needed = ATOMIC_INIT(0);
1391
/*
 * Register one more consumer of skb time stamps by incrementing
 * netstamp_needed. Each call is expected to be paired with a later
 * net_disable_timestamp().
 */
void net_enable_timestamp(void)
{
	atomic_inc(&netstamp_needed);
}
EXPORT_SYMBOL(net_enable_timestamp);
1397
/*
 * Drop one consumer of skb time stamps by decrementing netstamp_needed;
 * the counterpart of net_enable_timestamp().
 */
void net_disable_timestamp(void)
{
	atomic_dec(&netstamp_needed);
}
EXPORT_SYMBOL(net_disable_timestamp);
1403
1404 static inline void net_timestamp(struct sk_buff *skb)
1405 {
1406         if (atomic_read(&netstamp_needed))
1407                 __net_timestamp(skb);
1408         else
1409                 skb->tstamp.tv64 = 0;
1410 }
1411
1412 /*
1413  *      Support routine. Sends outgoing frames to any network
1414  *      taps currently in use.
1415  */
1416
1417 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1418 {
1419         struct packet_type *ptype;
1420
1421 #ifdef CONFIG_NET_CLS_ACT
1422         if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
1423                 net_timestamp(skb);
1424 #else
1425         net_timestamp(skb);
1426 #endif
1427
1428         rcu_read_lock();
1429         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1430                 /* Never send packets back to the socket
1431                  * they originated from - MvS (miquels@drinkel.ow.org)
1432                  */
1433                 if ((ptype->dev == dev || !ptype->dev) &&
1434                     (ptype->af_packet_priv == NULL ||
1435                      (struct sock *)ptype->af_packet_priv != skb->sk)) {
1436                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1437                         if (!skb2)
1438                                 break;
1439
1440                         /* skb->nh should be correctly
1441                            set by sender, so that the second statement is
1442                            just protection against buggy protocols.
1443                          */
1444                         skb_reset_mac_header(skb2);
1445
1446                         if (skb_network_header(skb2) < skb2->data ||
1447                             skb2->network_header > skb2->tail) {
1448                                 if (net_ratelimit())
1449                                         printk(KERN_CRIT "protocol %04x is "
1450                                                "buggy, dev %s\n",
1451                                                skb2->protocol, dev->name);
1452                                 skb_reset_network_header(skb2);
1453                         }
1454
1455                         skb2->transport_header = skb2->network_header;
1456                         skb2->pkt_type = PACKET_OUTGOING;
1457                         ptype->func(skb2, skb->dev, ptype, skb->dev);
1458                 }
1459         }
1460         rcu_read_unlock();
1461 }
1462
1463
1464 static inline void __netif_reschedule(struct Qdisc *q)
1465 {
1466         struct softnet_data *sd;
1467         unsigned long flags;
1468
1469         local_irq_save(flags);
1470         sd = &__get_cpu_var(softnet_data);
1471         q->next_sched = sd->output_queue;
1472         sd->output_queue = q;
1473         raise_softirq_irqoff(NET_TX_SOFTIRQ);
1474         local_irq_restore(flags);
1475 }
1476
1477 void __netif_schedule(struct Qdisc *q)
1478 {
1479         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1480                 __netif_reschedule(q);
1481 }
1482 EXPORT_SYMBOL(__netif_schedule);
1483
1484 void dev_kfree_skb_irq(struct sk_buff *skb)
1485 {
1486         if (atomic_dec_and_test(&skb->users)) {
1487                 struct softnet_data *sd;
1488                 unsigned long flags;
1489
1490                 local_irq_save(flags);
1491                 sd = &__get_cpu_var(softnet_data);
1492                 skb->next = sd->completion_queue;
1493                 sd->completion_queue = skb;
1494                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1495                 local_irq_restore(flags);
1496         }
1497 }
1498 EXPORT_SYMBOL(dev_kfree_skb_irq);
1499
/*
 * Release @skb from any context: uses dev_kfree_skb_irq() when running in
 * hard interrupt context or with interrupts disabled, and dev_kfree_skb()
 * otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (!in_irq() && !irqs_disabled())
		dev_kfree_skb(skb);
	else
		dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1508
1509
1510 /**
1511  * netif_device_detach - mark device as removed
1512  * @dev: network device
1513  *
1514  * Mark device as removed from system and therefore no longer available.
1515  */
1516 void netif_device_detach(struct net_device *dev)
1517 {
1518         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1519             netif_running(dev)) {
1520                 netif_tx_stop_all_queues(dev);
1521         }
1522 }
1523 EXPORT_SYMBOL(netif_device_detach);
1524
1525 /**
1526  * netif_device_attach - mark device as attached
1527  * @dev: network device
1528  *
1529  * Mark device as attached from system and restart if needed.
1530  */
1531 void netif_device_attach(struct net_device *dev)
1532 {
1533         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1534             netif_running(dev)) {
1535                 netif_tx_wake_all_queues(dev);
1536                 __netdev_watchdog_up(dev);
1537         }
1538 }
1539 EXPORT_SYMBOL(netif_device_attach);
1540
1541 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1542 {
1543         return ((features & NETIF_F_GEN_CSUM) ||
1544                 ((features & NETIF_F_IP_CSUM) &&
1545                  protocol == htons(ETH_P_IP)) ||
1546                 ((features & NETIF_F_IPV6_CSUM) &&
1547                  protocol == htons(ETH_P_IPV6)) ||
1548                 ((features & NETIF_F_FCOE_CRC) &&
1549                  protocol == htons(ETH_P_FCOE)));
1550 }
1551
1552 static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1553 {
1554         if (can_checksum_protocol(dev->features, skb->protocol))
1555                 return true;
1556
1557         if (skb->protocol == htons(ETH_P_8021Q)) {
1558                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1559                 if (can_checksum_protocol(dev->features & dev->vlan_features,
1560                                           veh->h_vlan_encapsulated_proto))
1561                         return true;
1562         }
1563
1564         return false;
1565 }
1566
1567 /*
1568  * Invalidate hardware checksum when packet is to be mangled, and
1569  * complete checksum manually on outgoing path.
1570  */
1571 int skb_checksum_help(struct sk_buff *skb)
1572 {
1573         __wsum csum;
1574         int ret = 0, offset;
1575
1576         if (skb->ip_summed == CHECKSUM_COMPLETE)
1577                 goto out_set_summed;
1578
1579         if (unlikely(skb_shinfo(skb)->gso_size)) {
1580                 /* Let GSO fix up the checksum. */
1581                 goto out_set_summed;
1582         }
1583
1584         offset = skb->csum_start - skb_headroom(skb);
1585         BUG_ON(offset >= skb_headlen(skb));
1586         csum = skb_checksum(skb, offset, skb->len - offset, 0);
1587
1588         offset += skb->csum_offset;
1589         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1590
1591         if (skb_cloned(skb) &&
1592             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1593                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1594                 if (ret)
1595                         goto out;
1596         }
1597
1598         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1599 out_set_summed:
1600         skb->ip_summed = CHECKSUM_NONE;
1601 out:
1602         return ret;
1603 }
1604 EXPORT_SYMBOL(skb_checksum_help);
1605
/**
 *	skb_gso_segment - Perform segmentation on skb.
 *	@skb: buffer to segment
 *	@features: features for the output path (see dev->features)
 *
 *	This function segments the given skb and returns a list of segments.
 *
 *	It may return NULL if the skb requires no segmentation.  This is
 *	only possible when GSO is used for verifying header integrity.
 *
 *	An ERR_PTR() is returned on failure; it is -EPROTONOSUPPORT when no
 *	registered packet type offers a gso_segment handler for the skb's
 *	protocol.
 */
struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
{
	/* Result if no handler below matches the protocol. */
	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
	struct packet_type *ptype;
	__be16 type = skb->protocol;
	int err;

	/*
	 * Treat the current data pointer as the MAC header, record the
	 * MAC header length and step over it to the network header.
	 */
	skb_reset_mac_header(skb);
	skb->mac_len = skb->network_header - skb->mac_header;
	__skb_pull(skb, skb->mac_len);

	/* Unexpected checksum state: complain loudly, naming the driver. */
	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
		struct net_device *dev = skb->dev;
		struct ethtool_drvinfo info = {};

		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
			dev->ethtool_ops->get_drvinfo(dev, &info);

		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
			"ip_summed=%d",
		     info.driver, dev ? dev->features : 0L,
		     skb->sk ? skb->sk->sk_route_caps : 0L,
		     skb->len, skb->data_len, skb->ip_summed);

		/*
		 * Get a private header if it is shared with a clone.
		 * NOTE(review): this early return happens after the
		 * __skb_pull() above with no matching __skb_push() -
		 * confirm callers do not rely on skb->data afterwards.
		 */
		if (skb_header_cloned(skb) &&
		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
			return ERR_PTR(err);
	}

	rcu_read_lock();
	/* Only handlers not bound to a device that provide gso_segment. */
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
				/*
				 * Stop here if the check fails, or if it
				 * passes and skb_gso_ok() says no software
				 * segmentation is needed; segs then carries
				 * the check's result.
				 */
				err = ptype->gso_send_check(skb);
				segs = ERR_PTR(err);
				if (err || skb_gso_ok(skb, features))
					break;
				/* Move data back to the network header. */
				__skb_push(skb, (skb->data -
						 skb_network_header(skb)));
			}
			segs = ptype->gso_segment(skb, features);
			break;
		}
	}
	rcu_read_unlock();

	/* Restore skb->data to the MAC header for the caller. */
	__skb_push(skb, skb->data - skb_mac_header(skb));

	return segs;
}
EXPORT_SYMBOL(skb_gso_segment);
1668
/*
 * Take action when hardware reception checksum errors are detected:
 * log a rate-limited error naming the device (or "<unknown>" when @dev
 * is NULL) and dump the stack.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (!net_ratelimit())
		return;

	printk(KERN_ERR "%s: hw csum failure.\n",
		dev ? dev->name : "<unknown>");
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
1681
/*
 * Returns 1 when @skb has a page fragment in high memory but @dev does
 * not advertise NETIF_F_HIGHDMA, 0 otherwise. Without CONFIG_HIGHMEM the
 * answer is always 0.
 *
 * Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 */
static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	int frag;

	if (dev->features & NETIF_F_HIGHDMA)
		return 0;

	for (frag = 0; frag < skb_shinfo(skb)->nr_frags; frag++) {
		if (PageHighMem(skb_shinfo(skb)->frags[frag].page))
			return 1;
	}

#endif
	return 0;
}
1702
/*
 * State kept in skb->cb while a list of GSO segments hangs off an skb:
 * the destructor that dev_gso_segment() displaced, so that
 * dev_gso_skb_destructor() can still call it.
 */
struct dev_gso_cb {
	void (*destructor)(struct sk_buff *skb);
};

/* View an skb's control buffer (skb->cb) as a struct dev_gso_cb. */
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1708
1709 static void dev_gso_skb_destructor(struct sk_buff *skb)
1710 {
1711         struct dev_gso_cb *cb;
1712
1713         do {
1714                 struct sk_buff *nskb = skb->next;
1715
1716                 skb->next = nskb->next;
1717                 nskb->next = NULL;
1718                 kfree_skb(nskb);
1719         } while (skb->next);
1720
1721         cb = DEV_GSO_CB(skb);
1722         if (cb->destructor)
1723                 cb->destructor(skb);
1724 }
1725
1726 /**
1727  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
1728  *      @skb: buffer to segment
1729  *
1730  *      This function segments the given skb and stores the list of segments
1731  *      in skb->next.
1732  */
1733 static int dev_gso_segment(struct sk_buff *skb)
1734 {
1735         struct net_device *dev = skb->dev;
1736         struct sk_buff *segs;
1737         int features = dev->features & ~(illegal_highdma(dev, skb) ?
1738                                          NETIF_F_SG : 0);
1739
1740         segs = skb_gso_segment(skb, features);
1741
1742         /* Verifying header integrity only. */
1743         if (!segs)
1744                 return 0;
1745
1746         if (IS_ERR(segs))
1747                 return PTR_ERR(segs);
1748
1749         skb->next = segs;
1750         DEV_GSO_CB(skb)->destructor = skb->destructor;
1751         skb->destructor = dev_gso_skb_destructor;
1752
1753         return 0;
1754 }
1755
1756 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1757                         struct netdev_queue *txq)
1758 {
1759         const struct net_device_ops *ops = dev->netdev_ops;
1760         int rc = NETDEV_TX_OK;
1761
1762         if (likely(!skb->next)) {
1763                 if (!list_empty(&ptype_all))
1764                         dev_queue_xmit_nit(skb, dev);
1765
1766                 if (netif_needs_gso(dev, skb)) {
1767                         if (unlikely(dev_gso_segment(skb)))
1768                                 goto out_kfree_skb;
1769                         if (skb->next)
1770                                 goto gso;
1771                 }
1772
1773                 /*
1774                  * If device doesnt need skb->dst, release it right now while
1775                  * its hot in this cpu cache
1776                  */
1777                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1778                         skb_dst_drop(skb);
1779
1780                 rc = ops->ndo_start_xmit(skb, dev);
1781                 if (rc == NETDEV_TX_OK)
1782                         txq_trans_update(txq);
1783                 /*
1784                  * TODO: if skb_orphan() was called by
1785                  * dev->hard_start_xmit() (for example, the unmodified
1786                  * igb driver does that; bnx2 doesn't), then
1787                  * skb_tx_software_timestamp() will be unable to send
1788                  * back the time stamp.
1789                  *
1790                  * How can this be prevented? Always create another
1791                  * reference to the socket before calling
1792                  * dev->hard_start_xmit()? Prevent that skb_orphan()
1793                  * does anything in dev->hard_start_xmit() by clearing
1794                  * the skb destructor before the call and restoring it
1795                  * afterwards, then doing the skb_orphan() ourselves?
1796                  */
1797                 return rc;
1798         }
1799
1800 gso:
1801         do {
1802                 struct sk_buff *nskb = skb->next;
1803
1804                 skb->next = nskb->next;
1805                 nskb->next = NULL;
1806                 rc = ops->ndo_start_xmit(nskb, dev);
1807                 if (unlikely(rc != NETDEV_TX_OK)) {
1808                         if (rc & ~NETDEV_TX_MASK)
1809                                 goto out_kfree_gso_skb;
1810                         nskb->next = skb->next;
1811                         skb->next = nskb;
1812                         return rc;
1813                 }
1814                 txq_trans_update(txq);
1815                 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
1816                         return NETDEV_TX_BUSY;
1817         } while (skb->next);
1818
1819 out_kfree_gso_skb:
1820         if (likely(skb->next == NULL))
1821                 skb->destructor = DEV_GSO_CB(skb)->destructor;
1822 out_kfree_skb:
1823         kfree_skb(skb);
1824         return rc;
1825 }
1826
1827 static u32 skb_tx_hashrnd;
1828
1829 u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
1830 {
1831         u32 hash;
1832
1833         if (skb_rx_queue_recorded(skb)) {
1834                 hash = skb_get_rx_queue(skb);
1835                 while (unlikely(hash >= dev->real_num_tx_queues))
1836                         hash -= dev->real_num_tx_queues;
1837                 return hash;
1838         }
1839
1840         if (skb->sk && skb->sk->sk_hash)
1841                 hash = skb->sk->sk_hash;
1842         else
1843                 hash = skb->protocol;
1844
1845         hash = jhash_1word(hash, skb_tx_hashrnd);
1846
1847         return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
1848 }
1849 EXPORT_SYMBOL(skb_tx_hash);
1850
1851 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
1852 {
1853         if (unlikely(queue_index >= dev->real_num_tx_queues)) {
1854                 if (net_ratelimit()) {
1855                         WARN(1, "%s selects TX queue %d, but "
1856                              "real number of TX queues is %d\n",
1857                              dev->name, queue_index,
1858                              dev->real_num_tx_queues);
1859                 }
1860                 return 0;
1861         }
1862         return queue_index;
1863 }
1864
1865 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
1866                                         struct sk_buff *skb)
1867 {
1868         u16 queue_index;
1869         struct sock *sk = skb->sk;
1870
1871         if (sk_tx_queue_recorded(sk)) {
1872                 queue_index = sk_tx_queue_get(sk);
1873         } else {
1874                 const struct net_device_ops *ops = dev->netdev_ops;
1875
1876                 if (ops->ndo_select_queue) {
1877                         queue_index = ops->ndo_select_queue(dev, skb);
1878                         queue_index = dev_cap_txqueue(dev, queue_index);
1879                 } else {
1880                         queue_index = 0;
1881                         if (dev->real_num_tx_queues > 1)
1882                                 queue_index = skb_tx_hash(dev, skb);
1883
1884                         if (sk && sk->sk_dst_cache)
1885                                 sk_tx_queue_set(sk, queue_index);
1886                 }
1887         }
1888
1889         skb_set_queue_mapping(skb, queue_index);
1890         return netdev_get_tx_queue(dev, queue_index);
1891 }
1892
1893 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
1894                                  struct net_device *dev,
1895                                  struct netdev_queue *txq)
1896 {
1897         spinlock_t *root_lock = qdisc_lock(q);
1898         int rc;
1899
1900         spin_lock(root_lock);
1901         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
1902                 kfree_skb(skb);
1903                 rc = NET_XMIT_DROP;
1904         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
1905                    !test_and_set_bit(__QDISC_STATE_RUNNING, &q->state)) {
1906                 /*
1907                  * This is a work-conserving queue; there are no old skbs
1908                  * waiting to be sent out; and the qdisc is not running -
1909                  * xmit the skb directly.
1910                  */
1911                 __qdisc_update_bstats(q, skb->len);
1912                 if (sch_direct_xmit(skb, q, dev, txq, root_lock))
1913                         __qdisc_run(q);
1914                 else
1915                         clear_bit(__QDISC_STATE_RUNNING, &q->state);
1916
1917                 rc = NET_XMIT_SUCCESS;
1918         } else {
1919                 rc = qdisc_enqueue_root(skb, q);
1920                 qdisc_run(q);
1921         }
1922         spin_unlock(root_lock);
1923
1924         return rc;
1925 }
1926
1927 /**
1928  *      dev_queue_xmit - transmit a buffer
1929  *      @skb: buffer to transmit
1930  *
1931  *      Queue a buffer for transmission to a network device. The caller must
1932  *      have set the device and priority and built the buffer before calling
1933  *      this function. The function can be called from an interrupt.
1934  *
1935  *      A negative errno code is returned on a failure. A success does not
1936  *      guarantee the frame will be transmitted as it may be dropped due
1937  *      to congestion or traffic shaping.
1938  *
1939  * -----------------------------------------------------------------------------------
1940  *      I notice this method can also return errors from the queue disciplines,
1941  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
1942  *      be positive.
1943  *
1944  *      Regardless of the return value, the skb is consumed, so it is currently
1945  *      difficult to retry a send to this method.  (You can bump the ref count
1946  *      before sending to hold a reference for retry if you are careful.)
1947  *
1948  *      When calling this method, interrupts MUST be enabled.  This is because
1949  *      the BH enable code must have IRQs enabled so that it will not deadlock.
1950  *          --BLG
1951  */
1952 int dev_queue_xmit(struct sk_buff *skb)
1953 {
1954         struct net_device *dev = skb->dev;
1955         struct netdev_queue *txq;
1956         struct Qdisc *q;
1957         int rc = -ENOMEM;
1958
1959         /* GSO will handle the following emulations directly. */
1960         if (netif_needs_gso(dev, skb))
1961                 goto gso;
1962
1963         if (skb_has_frags(skb) &&
1964             !(dev->features & NETIF_F_FRAGLIST) &&
1965             __skb_linearize(skb))
1966                 goto out_kfree_skb;
1967
1968         /* Fragmented skb is linearized if device does not support SG,
1969          * or if at least one of fragments is in highmem and device
1970          * does not support DMA from it.
1971          */
1972         if (skb_shinfo(skb)->nr_frags &&
1973             (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
1974             __skb_linearize(skb))
1975                 goto out_kfree_skb;
1976
1977         /* If packet is not checksummed and device does not support
1978          * checksumming for this protocol, complete checksumming here.
1979          */
1980         if (skb->ip_summed == CHECKSUM_PARTIAL) {
1981                 skb_set_transport_header(skb, skb->csum_start -
1982                                               skb_headroom(skb));
1983                 if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
1984                         goto out_kfree_skb;
1985         }
1986
1987 gso:
1988         /* Disable soft irqs for various locks below. Also
1989          * stops preemption for RCU.
1990          */
1991         rcu_read_lock_bh();
1992
1993         txq = dev_pick_tx(dev, skb);
1994         q = rcu_dereference(txq->qdisc);
1995
1996 #ifdef CONFIG_NET_CLS_ACT
1997         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
1998 #endif
1999         if (q->enqueue) {
2000                 rc = __dev_xmit_skb(skb, q, dev, txq);
2001                 goto out;
2002         }
2003
2004         /* The device has no queue. Common case for software devices:
2005            loopback, all the sorts of tunnels...
2006
2007            Really, it is unlikely that netif_tx_lock protection is necessary
2008            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2009            counters.)
2010            However, it is possible, that they rely on protection
2011            made by us here.
2012
2013            Check this and shot the lock. It is not prone from deadlocks.
2014            Either shot noqueue qdisc, it is even simpler 8)
2015          */
2016         if (dev->flags & IFF_UP) {
2017                 int cpu = smp_processor_id(); /* ok because BHs are off */
2018
2019                 if (txq->xmit_lock_owner != cpu) {
2020
2021                         HARD_TX_LOCK(dev, txq, cpu);
2022
2023                         if (!netif_tx_queue_stopped(txq)) {
2024                                 rc = dev_hard_start_xmit(skb, dev, txq);
2025                                 if (dev_xmit_complete(rc)) {
2026                                         HARD_TX_UNLOCK(dev, txq);
2027                                         goto out;
2028                                 }
2029                         }
2030                         HARD_TX_UNLOCK(dev, txq);
2031                         if (net_ratelimit())
2032                                 printk(KERN_CRIT "Virtual device %s asks to "
2033                                        "queue packet!\n", dev->name);
2034                 } else {
2035                         /* Recursion is detected! It is possible,
2036                          * unfortunately */
2037                         if (net_ratelimit())
2038                                 printk(KERN_CRIT "Dead loop on virtual device "
2039                                        "%s, fix it urgently!\n", dev->name);
2040                 }
2041         }
2042
2043         rc = -ENETDOWN;
2044         rcu_read_unlock_bh();
2045
2046 out_kfree_skb:
2047         kfree_skb(skb);
2048         return rc;
2049 out:
2050         rcu_read_unlock_bh();
2051         return rc;
2052 }
2053 EXPORT_SYMBOL(dev_queue_xmit);
2054
2055
2056 /*=======================================================================
2057                         Receiver routines
2058   =======================================================================*/
2059
2060 int netdev_max_backlog __read_mostly = 1000;
2061 int netdev_budget __read_mostly = 300;
2062 int weight_p __read_mostly = 64;            /* old backlog weight */
2063
2064 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
2065
2066
2067 /**
2068  *      netif_rx        -       post buffer to the network code
2069  *      @skb: buffer to post
2070  *
2071  *      This function receives a packet from a device driver and queues it for
2072  *      the upper (protocol) levels to process.  It always succeeds. The buffer
2073  *      may be dropped during processing for congestion control or by the
2074  *      protocol layers.
2075  *
2076  *      return values:
2077  *      NET_RX_SUCCESS  (no congestion)
2078  *      NET_RX_DROP     (packet was dropped)
2079  *
2080  */
2081
2082 int netif_rx(struct sk_buff *skb)
2083 {
2084         struct softnet_data *queue;
2085         unsigned long flags;
2086
2087         /* if netpoll wants it, pretend we never saw it */
2088         if (netpoll_rx(skb))
2089                 return NET_RX_DROP;
2090
2091         if (!skb->tstamp.tv64)
2092                 net_timestamp(skb);
2093
2094         /*
2095          * The code is rearranged so that the path is the most
2096          * short when CPU is congested, but is still operating.
2097          */
2098         local_irq_save(flags);
2099         queue = &__get_cpu_var(softnet_data);
2100
2101         __get_cpu_var(netdev_rx_stat).total++;
2102         if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
2103                 if (queue->input_pkt_queue.qlen) {
2104 enqueue:
2105                         __skb_queue_tail(&queue->input_pkt_queue, skb);
2106                         local_irq_restore(flags);
2107                         return NET_RX_SUCCESS;
2108                 }
2109
2110                 napi_schedule(&queue->backlog);
2111                 goto enqueue;
2112         }
2113
2114         __get_cpu_var(netdev_rx_stat).dropped++;
2115         local_irq_restore(flags);
2116
2117         kfree_skb(skb);
2118         return NET_RX_DROP;
2119 }
2120 EXPORT_SYMBOL(netif_rx);
2121
/*
 * Call netif_rx() with preemption disabled and run any softirq that is
 * pending afterwards, before preemption is enabled again.  Returns what
 * netif_rx() returned.
 */
int netif_rx_ni(struct sk_buff *skb)
{
        int ret;

        preempt_disable();

        ret = netif_rx(skb);
        if (local_softirq_pending())
                do_softirq();

        preempt_enable();
        return ret;
}
EXPORT_SYMBOL(netif_rx_ni);
2135
2136 static void net_tx_action(struct softirq_action *h)
2137 {
2138         struct softnet_data *sd = &__get_cpu_var(softnet_data);
2139
2140         if (sd->completion_queue) {
2141                 struct sk_buff *clist;
2142
2143                 local_irq_disable();
2144                 clist = sd->completion_queue;
2145                 sd->completion_queue = NULL;
2146                 local_irq_enable();
2147
2148                 while (clist) {
2149                         struct sk_buff *skb = clist;
2150                         clist = clist->next;
2151
2152                         WARN_ON(atomic_read(&skb->users));
2153                         __kfree_skb(skb);
2154                 }
2155         }
2156
2157         if (sd->output_queue) {
2158                 struct Qdisc *head;
2159
2160                 local_irq_disable();
2161                 head = sd->output_queue;
2162                 sd->output_queue = NULL;
2163                 local_irq_enable();
2164
2165                 while (head) {
2166                         struct Qdisc *q = head;
2167                         spinlock_t *root_lock;
2168
2169                         head = head->next_sched;
2170
2171                         root_lock = qdisc_lock(q);
2172                         if (spin_trylock(root_lock)) {
2173                                 smp_mb__before_clear_bit();
2174                                 clear_bit(__QDISC_STATE_SCHED,
2175                                           &q->state);
2176                                 qdisc_run(q);
2177                                 spin_unlock(root_lock);
2178                         } else {
2179                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
2180                                               &q->state)) {
2181                                         __netif_reschedule(q);
2182                                 } else {
2183                                         smp_mb__before_clear_bit();
2184                                         clear_bit(__QDISC_STATE_SCHED,
2185                                                   &q->state);
2186                                 }
2187                         }
2188                 }
2189         }
2190 }
2191
2192 static inline int deliver_skb(struct sk_buff *skb,
2193                               struct packet_type *pt_prev,
2194                               struct net_device *orig_dev)
2195 {
2196         atomic_inc(&skb->users);
2197         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2198 }
2199
#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)

#if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
/* This hook is defined here for ATM LANE. */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
                             unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif

/*
 * Bridging hook, called by handle_bridge() for frames that arrive on a
 * bridge port.  It returns NULL if the packet was consumed.
 */
struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
                                        struct sk_buff *skb) __read_mostly;
EXPORT_SYMBOL_GPL(br_handle_frame_hook);

/*
 * Give @skb to the bridging hook when its device is a bridge port.
 *
 * Loopback packets and packets whose device has no bridge port are
 * returned untouched.  Otherwise a handler pending in *@pt_prev is
 * delivered to first (its result is stored in *@ret and *@pt_prev is
 * cleared), and the value of br_handle_frame_hook() is returned.
 */
static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
                                            struct packet_type **pt_prev, int *ret,
                                            struct net_device *orig_dev)
{
        struct net_bridge_port *port;

        if (skb->pkt_type == PACKET_LOOPBACK)
                return skb;

        port = rcu_dereference(skb->dev->br_port);
        if (port == NULL)
                return skb;

        if (*pt_prev) {
                *ret = deliver_skb(skb, *pt_prev, orig_dev);
                *pt_prev = NULL;
        }

        return br_handle_frame_hook(port, skb);
}
#else
#define handle_bridge(skb, pt_prev, ret, orig_dev)      (skb)
#endif
2237
#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
/* macvlan receive hook, called by handle_macvlan(). */
struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);

/*
 * Give @skb to the macvlan hook when its device has a macvlan port.
 *
 * Without a macvlan port the skb is returned untouched.  Otherwise a
 * handler pending in *@pt_prev is delivered to first (its result is
 * stored in *@ret and *@pt_prev is cleared), and the value of
 * macvlan_handle_frame_hook() is returned.
 */
static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
                                             struct packet_type **pt_prev,
                                             int *ret,
                                             struct net_device *orig_dev)
{
        if (!skb->dev->macvlan_port)
                return skb;

        if (*pt_prev) {
                *ret = deliver_skb(skb, *pt_prev, orig_dev);
                *pt_prev = NULL;
        }

        return macvlan_handle_frame_hook(skb);
}
#else
#define handle_macvlan(skb, pt_prev, ret, orig_dev)     (skb)
#endif
2259
2260 #ifdef CONFIG_NET_CLS_ACT
2261 /* TODO: Maybe we should just force sch_ingress to be compiled in
2262  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
2263  * a compare and 2 stores extra right now if we dont have it on
2264  * but have CONFIG_NET_CLS_ACT
2265  * NOTE: This doesnt stop any functionality; if you dont have
2266  * the ingress scheduler, you just cant add policies on ingress.
2267  *
2268  */
2269 static int ing_filter(struct sk_buff *skb)
2270 {
2271         struct net_device *dev = skb->dev;
2272         u32 ttl = G_TC_RTTL(skb->tc_verd);
2273         struct netdev_queue *rxq;
2274         int result = TC_ACT_OK;
2275         struct Qdisc *q;
2276
2277         if (MAX_RED_LOOP < ttl++) {
2278                 printk(KERN_WARNING
2279                        "Redir loop detected Dropping packet (%d->%d)\n",
2280                        skb->iif, dev->ifindex);
2281                 return TC_ACT_SHOT;
2282         }
2283
2284         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2285         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2286
2287         rxq = &dev->rx_queue;
2288
2289         q = rxq->qdisc;
2290         if (q != &noop_qdisc) {
2291                 spin_lock(qdisc_lock(q));
2292                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2293                         result = qdisc_enqueue_root(skb, q);
2294                 spin_unlock(qdisc_lock(q));
2295         }
2296
2297         return result;
2298 }
2299
2300 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2301                                          struct packet_type **pt_prev,
2302                                          int *ret, struct net_device *orig_dev)
2303 {
2304         if (skb->dev->rx_queue.qdisc == &noop_qdisc)
2305                 goto out;
2306
2307         if (*pt_prev) {
2308                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2309                 *pt_prev = NULL;
2310         } else {
2311                 /* Huh? Why does turning on AF_PACKET affect this? */
2312                 skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2313         }
2314
2315         switch (ing_filter(skb)) {
2316         case TC_ACT_SHOT:
2317         case TC_ACT_STOLEN:
2318                 kfree_skb(skb);
2319                 return NULL;
2320         }
2321
2322 out:
2323         skb->tc_verd = 0;
2324         return skb;
2325 }
2326 #endif
2327
2328 /*
2329  *      netif_nit_deliver - deliver received packets to network taps
2330  *      @skb: buffer
2331  *
2332  *      This function is used to deliver incoming packets to network
2333  *      taps. It should be used when the normal netif_receive_skb path
2334  *      is bypassed, for example because of VLAN acceleration.
2335  */
2336 void netif_nit_deliver(struct sk_buff *skb)
2337 {
2338         struct packet_type *ptype;
2339
2340         if (list_empty(&ptype_all))
2341                 return;
2342
2343         skb_reset_network_header(skb);
2344         skb_reset_transport_header(skb);
2345         skb->mac_len = skb->network_header - skb->mac_header;
2346
2347         rcu_read_lock();
2348         list_for_each_entry_rcu(ptype, &ptype_all, list) {
2349                 if (!ptype->dev || ptype->dev == skb->dev)
2350                         deliver_skb(skb, ptype, skb->dev);
2351         }
2352         rcu_read_unlock();
2353 }
2354
2355 /**
2356  *      netif_receive_skb - process receive buffer from network
2357  *      @skb: buffer to process
2358  *
2359  *      netif_receive_skb() is the main receive data processing function.
2360  *      It always succeeds. The buffer may be dropped during processing
2361  *      for congestion control or by the protocol layers.
2362  *
2363  *      This function may only be called from softirq context and interrupts
2364  *      should be enabled.
2365  *
2366  *      Return values (usually ignored):
2367  *      NET_RX_SUCCESS: no congestion
2368  *      NET_RX_DROP: packet was dropped
2369  */
2370 int netif_receive_skb(struct sk_buff *skb)
2371 {
2372         struct packet_type *ptype, *pt_prev;
2373         struct net_device *orig_dev;
2374         struct net_device *null_or_orig;
2375         int ret = NET_RX_DROP;
2376         __be16 type;
2377
2378         if (!skb->tstamp.tv64)
2379                 net_timestamp(skb);
2380
2381         if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
2382                 return NET_RX_SUCCESS;
2383
2384         /* if we've gotten here through NAPI, check netpoll */
2385         if (netpoll_receive_skb(skb))
2386                 return NET_RX_DROP;
2387
2388         if (!skb->iif)
2389                 skb->iif = skb->dev->ifindex;
2390
2391         null_or_orig = NULL;
2392         orig_dev = skb->dev;
2393         if (orig_dev->master) {
2394                 if (skb_bond_should_drop(skb))
2395                         null_or_orig = orig_dev; /* deliver only exact match */
2396                 else
2397                         skb->dev = orig_dev->master;
2398         }
2399
2400         __get_cpu_var(netdev_rx_stat).total++;
2401
2402         skb_reset_network_header(skb);
2403         skb_reset_transport_header(skb);
2404         skb->mac_len = skb->network_header - skb->mac_header;
2405
2406         pt_prev = NULL;
2407
2408         rcu_read_lock();
2409
2410 #ifdef CONFIG_NET_CLS_ACT
2411         if (skb->tc_verd & TC_NCLS) {
2412                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2413                 goto ncls;
2414         }
2415 #endif
2416
2417         list_for_each_entry_rcu(ptype, &ptype_all, list) {
2418                 if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2419                     ptype->dev == orig_dev) {
2420                         if (pt_prev)
2421                                 ret = deliver_skb(skb, pt_prev, orig_dev);
2422                         pt_prev = ptype;
2423                 }
2424         }
2425
2426 #ifdef CONFIG_NET_CLS_ACT
2427         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2428         if (!skb)
2429                 goto out;
2430 ncls:
2431 #endif
2432
2433         skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
2434         if (!skb)
2435                 goto out;
2436         skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
2437         if (!skb)
2438                 goto out;
2439
2440         type = skb->protocol;
2441         list_for_each_entry_rcu(ptype,
2442                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2443                 if (ptype->type == type &&
2444                     (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2445                      ptype->dev == orig_dev)) {
2446                         if (pt_prev)
2447                                 ret = deliver_skb(skb, pt_prev, orig_dev);
2448                         pt_prev = ptype;
2449                 }
2450         }
2451
2452         if (pt_prev) {
2453                 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2454         } else {
2455                 kfree_skb(skb);
2456                 /* Jamal, now you will not able to escape explaining
2457                  * me how you were going to use this. :-)
2458                  */
2459                 ret = NET_RX_DROP;
2460         }
2461
2462 out:
2463         rcu_read_unlock();
2464         return ret;
2465 }
2466 EXPORT_SYMBOL(netif_receive_skb);
2467
2468 /* Network device is going away, flush any packets still pending  */
2469 static void flush_backlog(void *arg)
2470 {
2471         struct net_device *dev = arg;
2472         struct softnet_data *queue = &__get_cpu_var(softnet_data);
2473         struct sk_buff *skb, *tmp;
2474
2475         skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp)
2476                 if (skb->dev == dev) {
2477                         __skb_unlink(skb, &queue->input_pkt_queue);
2478                         kfree_skb(skb);
2479                 }
2480 }
2481
2482 static int napi_gro_complete(struct sk_buff *skb)
2483 {
2484         struct packet_type *ptype;
2485         __be16 type = skb->protocol;
2486         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2487         int err = -ENOENT;
2488
2489         if (NAPI_GRO_CB(skb)->count == 1) {
2490                 skb_shinfo(skb)->gso_size = 0;
2491                 goto out;
2492         }
2493
2494         rcu_read_lock();
2495         list_for_each_entry_rcu(ptype, head, list) {
2496                 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
2497                         continue;
2498
2499                 err = ptype->gro_complete(skb);
2500                 break;
2501         }
2502         rcu_read_unlock();
2503
2504         if (err) {
2505                 WARN_ON(&ptype->list == head);
2506                 kfree_skb(skb);
2507                 return NET_RX_SUCCESS;
2508         }
2509
2510 out:
2511         return netif_receive_skb(skb);
2512 }
2513
2514 void napi_gro_flush(struct napi_struct *napi)
2515 {
2516         struct sk_buff *skb, *next;
2517
2518         for (skb = napi->gro_list; skb; skb = next) {
2519                 next = skb->next;
2520                 skb->next = NULL;
2521                 napi_gro_complete(skb);
2522         }
2523
2524         napi->gro_count = 0;
2525         napi->gro_list = NULL;
2526 }
2527 EXPORT_SYMBOL(napi_gro_flush);
2528
2529 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2530 {
2531         struct sk_buff **pp = NULL;
2532         struct packet_type *ptype;
2533         __be16 type = skb->protocol;
2534         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2535         int same_flow;
2536         int mac_len;
2537         enum gro_result ret;
2538
2539         if (!(skb->dev->features & NETIF_F_GRO))
2540                 goto normal;
2541
2542         if (skb_is_gso(skb) || skb_has_frags(skb))
2543                 goto normal;
2544
2545         rcu_read_lock();
2546         list_for_each_entry_rcu(ptype, head, list) {
2547                 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
2548                         continue;
2549
2550                 skb_set_network_header(skb, skb_gro_offset(skb));
2551                 mac_len = skb->network_header - skb->mac_header;
2552                 skb->mac_len = mac_len;
2553                 NAPI_GRO_CB(skb)->same_flow = 0;
2554                 NAPI_GRO_CB(skb)->flush = 0;
2555                 NAPI_GRO_CB(skb)->free = 0;
2556
2557                 pp = ptype->gro_receive(&napi->gro_list, skb);
2558                 break;
2559         }
2560         rcu_read_unlock();
2561
2562         if (&ptype->list == head)
2563                 goto normal;
2564
2565         same_flow = NAPI_GRO_CB(skb)->same_flow;
2566         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
2567
2568         if (pp) {
2569                 struct sk_buff *nskb = *pp;
2570
2571                 *pp = nskb->next;
2572                 nskb->next = NULL;
2573                 napi_gro_complete(nskb);
2574                 napi->gro_count--;
2575         }
2576
2577         if (same_flow)
2578                 goto ok;
2579
2580         if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
2581                 goto normal;
2582
2583         napi->gro_count++;
2584         NAPI_GRO_CB(skb)->count = 1;
2585         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
2586         skb->next = napi->gro_list;
2587         napi->gro_list = skb;
2588         ret = GRO_HELD;
2589
2590 pull:
2591         if (skb_headlen(skb) < skb_gro_offset(skb)) {
2592                 int grow = skb_gro_offset(skb) - skb_headlen(skb);
2593
2594                 BUG_ON(skb->end - skb->tail < grow);
2595
2596                 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
2597
2598                 skb->tail += grow;
2599                 skb->data_len -= grow;
2600
2601                 skb_shinfo(skb)->frags[0].page_offset += grow;
2602                 skb_shinfo(skb)->frags[0].size -= grow;
2603
2604                 if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
2605                         put_page(skb_shinfo(skb)->frags[0].page);
2606                         memmove(skb_shinfo(skb)->frags,
2607                                 skb_shinfo(skb)->frags + 1,
2608                                 --skb_shinfo(skb)->nr_frags);
2609                 }
2610         }
2611
2612 ok:
2613         return ret;
2614
2615 normal:
2616         ret = GRO_NORMAL;
2617         goto pull;
2618 }
2619 EXPORT_SYMBOL(dev_gro_receive);
2620
2621 static gro_result_t
2622 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2623 {
2624         struct sk_buff *p;
2625
2626         if (netpoll_rx_on(skb))
2627                 return GRO_NORMAL;
2628
2629         for (p = napi->gro_list; p; p = p->next) {
2630                 NAPI_GRO_CB(p)->same_flow = (p->dev == skb->dev)
2631                         && !compare_ether_header(skb_mac_header(p),
2632                                                  skb_gro_mac_header(skb));
2633                 NAPI_GRO_CB(p)->flush = 0;
2634         }
2635
2636         return dev_gro_receive(napi, skb);
2637 }
2638
2639 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
2640 {
2641         switch (ret) {
2642         case GRO_NORMAL:
2643                 if (netif_receive_skb(skb))
2644                         ret = GRO_DROP;
2645                 break;
2646
2647         case GRO_DROP:
2648         case GRO_MERGED_FREE:
2649                 kfree_skb(skb);
2650                 break;
2651
2652         case GRO_HELD:
2653         case GRO_MERGED:
2654                 break;
2655         }
2656
2657         return ret;
2658 }
2659 EXPORT_SYMBOL(napi_skb_finish);
2660
2661 void skb_gro_reset_offset(struct sk_buff *skb)
2662 {
2663         NAPI_GRO_CB(skb)->data_offset = 0;
2664         NAPI_GRO_CB(skb)->frag0 = NULL;
2665         NAPI_GRO_CB(skb)->frag0_len = 0;
2666
2667         if (skb->mac_header == skb->tail &&
2668             !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
2669                 NAPI_GRO_CB(skb)->frag0 =
2670                         page_address(skb_shinfo(skb)->frags[0].page) +
2671                         skb_shinfo(skb)->frags[0].page_offset;
2672                 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
2673         }
2674 }
2675 EXPORT_SYMBOL(skb_gro_reset_offset);
2676
2677 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2678 {
2679         skb_gro_reset_offset(skb);
2680
2681         return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
2682 }
2683 EXPORT_SYMBOL(napi_gro_receive);
2684
2685 void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
2686 {
2687         __skb_pull(skb, skb_headlen(skb));
2688         skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
2689
2690         napi->skb = skb;
2691 }
2692 EXPORT_SYMBOL(napi_reuse_skb);
2693
2694 struct sk_buff *napi_get_frags(struct napi_struct *napi)
2695 {
2696         struct sk_buff *skb = napi->skb;
2697
2698         if (!skb) {
2699                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
2700                 if (skb)
2701                         napi->skb = skb;
2702         }
2703         return skb;
2704 }
2705 EXPORT_SYMBOL(napi_get_frags);
2706
2707 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
2708                                gro_result_t ret)
2709 {
2710         switch (ret) {
2711         case GRO_NORMAL:
2712         case GRO_HELD:
2713                 skb->protocol = eth_type_trans(skb, napi->dev);
2714
2715                 if (ret == GRO_HELD)
2716                         skb_gro_pull(skb, -ETH_HLEN);
2717                 else if (netif_receive_skb(skb))
2718                         ret = GRO_DROP;
2719                 break;
2720
2721         case GRO_DROP:
2722         case GRO_MERGED_FREE:
2723                 napi_reuse_skb(napi, skb);
2724                 break;
2725
2726         case GRO_MERGED:
2727                 break;
2728         }
2729
2730         return ret;
2731 }
2732 EXPORT_SYMBOL(napi_frags_finish);
2733
2734 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
2735 {
2736         struct sk_buff *skb = napi->skb;
2737         struct ethhdr *eth;
2738         unsigned int hlen;
2739         unsigned int off;
2740
2741         napi->skb = NULL;
2742
2743         skb_reset_mac_header(skb);
2744         skb_gro_reset_offset(skb);
2745
2746         off = skb_gro_offset(skb);
2747         hlen = off + sizeof(*eth);
2748         eth = skb_gro_header_fast(skb, off);
2749         if (skb_gro_header_hard(skb, hlen)) {
2750                 eth = skb_gro_header_slow(skb, hlen, off);
2751                 if (unlikely(!eth)) {
2752                         napi_reuse_skb(napi, skb);
2753                         skb = NULL;
2754                         goto out;
2755                 }
2756         }
2757
2758         skb_gro_pull(skb, sizeof(*eth));
2759
2760         /*
2761          * This works because the only protocols we care about don't require
2762          * special handling.  We'll fix it up properly at the end.
2763          */
2764         skb->protocol = eth->h_proto;
2765
2766 out:
2767         return skb;
2768 }
2769 EXPORT_SYMBOL(napi_frags_skb);
2770
2771 gro_result_t napi_gro_frags(struct napi_struct *napi)
2772 {
2773         struct sk_buff *skb = napi_frags_skb(napi);
2774
2775         if (!skb)
2776                 return GRO_DROP;
2777
2778         return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
2779 }
2780 EXPORT_SYMBOL(napi_gro_frags);
2781
2782 static int process_backlog(struct napi_struct *napi, int quota)
2783 {
2784         int work = 0;
2785         struct softnet_data *queue = &__get_cpu_var(softnet_data);
2786         unsigned long start_time = jiffies;
2787
2788         napi->weight = weight_p;
2789         do {
2790                 struct sk_buff *skb;
2791
2792                 local_irq_disable();
2793                 skb = __skb_dequeue(&queue->input_pkt_queue);
2794                 if (!skb) {
2795                         __napi_complete(napi);
2796                         local_irq_enable();
2797                         break;
2798                 }
2799                 local_irq_enable();
2800
2801                 netif_receive_skb(skb);
2802         } while (++work < quota && jiffies == start_time);
2803
2804         return work;
2805 }
2806
2807 /**
2808  * __napi_schedule - schedule for receive
2809  * @n: entry to schedule
2810  *
2811  * The entry's receive function will be scheduled to run
2812  */
2813 void __napi_schedule(struct napi_struct *n)
2814 {
2815         unsigned long flags;
2816
2817         local_irq_save(flags);
2818         list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
2819         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2820         local_irq_restore(flags);
2821 }
2822 EXPORT_SYMBOL(__napi_schedule);
2823
2824 void __napi_complete(struct napi_struct *n)
2825 {
2826         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
2827         BUG_ON(n->gro_list);
2828
2829         list_del(&n->poll_list);
2830         smp_mb__before_clear_bit();
2831         clear_bit(NAPI_STATE_SCHED, &n->state);
2832 }
2833 EXPORT_SYMBOL(__napi_complete);
2834
2835 void napi_complete(struct napi_struct *n)
2836 {
2837         unsigned long flags;
2838
2839         /*
2840          * don't let napi dequeue from the cpu poll list
2841          * just in case its running on a different cpu
2842          */
2843         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
2844                 return;
2845
2846         napi_gro_flush(n);
2847         local_irq_save(flags);
2848         __napi_complete(n);
2849         local_irq_restore(flags);
2850 }
2851 EXPORT_SYMBOL(napi_complete);
2852
2853 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
2854                     int (*poll)(struct napi_struct *, int), int weight)
2855 {
2856         INIT_LIST_HEAD(&napi->poll_list);
2857         napi->gro_count = 0;
2858         napi->gro_list = NULL;
2859         napi->skb = NULL;
2860         napi->poll = poll;
2861         napi->weight = weight;
2862         list_add(&napi->dev_list, &dev->napi_list);
2863         napi->dev = dev;
2864 #ifdef CONFIG_NETPOLL
2865         spin_lock_init(&napi->poll_lock);
2866         napi->poll_owner = -1;
2867 #endif
2868         set_bit(NAPI_STATE_SCHED, &napi->state);
2869 }
2870 EXPORT_SYMBOL(netif_napi_add);
2871
2872 void netif_napi_del(struct napi_struct *napi)
2873 {
2874         struct sk_buff *skb, *next;
2875
2876         list_del_init(&napi->dev_list);
2877         napi_free_frags(napi);
2878
2879         for (skb = napi->gro_list; skb; skb = next) {
2880                 next = skb->next;
2881                 skb->next = NULL;
2882                 kfree_skb(skb);
2883         }
2884
2885         napi->gro_list = NULL;
2886         napi->gro_count = 0;
2887 }
2888 EXPORT_SYMBOL(netif_napi_del);
2889
2890
2891 static void net_rx_action(struct softirq_action *h)
2892 {
2893         struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
2894         unsigned long time_limit = jiffies + 2;
2895         int budget = netdev_budget;
2896         void *have;
2897
2898         local_irq_disable();
2899
2900         while (!list_empty(list)) {
2901                 struct napi_struct *n;
2902                 int work, weight;
2903
2904                 /* If softirq window is exhuasted then punt.
2905                  * Allow this to run for 2 jiffies since which will allow
2906                  * an average latency of 1.5/HZ.
2907                  */
2908                 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
2909                         goto softnet_break;
2910
2911                 local_irq_enable();
2912
2913                 /* Even though interrupts have been re-enabled, this
2914                  * access is safe because interrupts can only add new
2915                  * entries to the tail of this list, and only ->poll()
2916                  * calls can remove this head entry from the list.
2917                  */
2918                 n = list_entry(list->next, struct napi_struct, poll_list);
2919
2920                 have = netpoll_poll_lock(n);
2921
2922                 weight = n->weight;
2923
2924                 /* This NAPI_STATE_SCHED test is for avoiding a race
2925                  * with netpoll's poll_napi().  Only the entity which
2926                  * obtains the lock and sees NAPI_STATE_SCHED set will
2927                  * actually make the ->poll() call.  Therefore we avoid
2928                  * accidently calling ->poll() when NAPI is not scheduled.
2929                  */
2930                 work = 0;
2931                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
2932                         work = n->poll(n, weight);
2933                         trace_napi_poll(n);
2934                 }
2935
2936                 WARN_ON_ONCE(work > weight);
2937
2938                 budget -= work;
2939
2940                 local_irq_disable();
2941
2942                 /* Drivers must not modify the NAPI state if they
2943                  * consume the entire weight.  In such cases this code
2944                  * still "owns" the NAPI instance and therefore can
2945                  * move the instance around on the list at-will.
2946                  */
2947                 if (unlikely(work == weight)) {
2948                         if (unlikely(napi_disable_pending(n))) {
2949                                 local_irq_enable();
2950                                 napi_complete(n);
2951                                 local_irq_disable();
2952                         } else
2953                                 list_move_tail(&n->poll_list, list);
2954                 }
2955
2956                 netpoll_poll_unlock(have);
2957         }
2958 out:
2959         local_irq_enable();
2960
2961 #ifdef CONFIG_NET_DMA
2962         /*
2963          * There may not be any more sk_buffs coming right now, so push
2964          * any pending DMA copies to hardware
2965          */
2966         dma_issue_pending_all();
2967 #endif
2968
2969         return;
2970
2971 softnet_break:
2972         __get_cpu_var(netdev_rx_stat).time_squeeze++;
2973         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2974         goto out;
2975 }
2976
2977 static gifconf_func_t *gifconf_list[NPROTO];
2978
2979 /**
2980  *      register_gifconf        -       register a SIOCGIF handler
2981  *      @family: Address family
2982  *      @gifconf: Function handler
2983  *
2984  *      Register protocol dependent address dumping routines. The handler
2985  *      that is passed must not be freed or reused until it has been replaced
2986  *      by another handler.
2987  */
2988 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
2989 {
2990         if (family >= NPROTO)
2991                 return -EINVAL;
2992         gifconf_list[family] = gifconf;
2993         return 0;
2994 }
2995 EXPORT_SYMBOL(register_gifconf);
2996
2997
2998 /*
2999  *      Map an interface index to its name (SIOCGIFNAME)
3000  */
3001
3002 /*
3003  *      We need this ioctl for efficient implementation of the
3004  *      if_indextoname() function required by the IPv6 API.  Without
3005  *      it, we would have to search all the interfaces to find a
3006  *      match.  --pb
3007  */
3008
3009 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3010 {
3011         struct net_device *dev;
3012         struct ifreq ifr;
3013
3014         /*
3015          *      Fetch the caller's info block.
3016          */
3017
3018         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3019                 return -EFAULT;
3020
3021         rcu_read_lock();
3022         dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3023         if (!dev) {
3024                 rcu_read_unlock();
3025                 return -ENODEV;
3026         }
3027
3028         strcpy(ifr.ifr_name, dev->name);
3029         rcu_read_unlock();
3030
3031         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3032                 return -EFAULT;
3033         return 0;
3034 }
3035
3036 /*
3037  *      Perform a SIOCGIFCONF call. This structure will change
3038  *      size eventually, and there is nothing I can do about it.
3039  *      Thus we will need a 'compatibility mode'.
3040  */
3041
3042 static int dev_ifconf(struct net *net, char __user *arg)
3043 {
3044         struct ifconf ifc;
3045         struct net_device *dev;
3046         char __user *pos;
3047         int len;
3048         int total;
3049         int i;
3050
3051         /*
3052          *      Fetch the caller's info block.
3053          */
3054
3055         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3056                 return -EFAULT;
3057
3058         pos = ifc.ifc_buf;
3059         len = ifc.ifc_len;
3060
3061         /*
3062          *      Loop over the interfaces, and write an info block for each.
3063          */
3064
3065         total = 0;
3066         for_each_netdev(net, dev) {
3067                 for (i = 0; i < NPROTO; i++) {
3068                         if (gifconf_list[i]) {
3069                                 int done;
3070                                 if (!pos)
3071                                         done = gifconf_list[i](dev, NULL, 0);
3072                                 else
3073                                         done = gifconf_list[i](dev, pos + total,
3074                                                                len - total);
3075                                 if (done < 0)
3076                                         return -EFAULT;
3077                                 total += done;
3078                         }
3079                 }
3080         }
3081
3082         /*
3083          *      All done.  Write the updated control block back to the caller.
3084          */
3085         ifc.ifc_len = total;
3086
3087         /*
3088          *      Both BSD and Solaris return 0 here, so we do too.
3089          */
3090         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3091 }
3092
3093 #ifdef CONFIG_PROC_FS
3094 /*
3095  *      This is invoked by the /proc filesystem handler to display a device
3096  *      in detail.
3097  */
3098 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3099         __acquires(RCU)
3100 {
3101         struct net *net = seq_file_net(seq);
3102         loff_t off;
3103         struct net_device *dev;
3104
3105         rcu_read_lock();
3106         if (!*pos)
3107                 return SEQ_START_TOKEN;
3108
3109         off = 1;
3110         for_each_netdev_rcu(net, dev)
3111                 if (off++ == *pos)
3112                         return dev;
3113
3114         return NULL;
3115 }
3116
3117 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3118 {
3119         struct net_device *dev = (v == SEQ_START_TOKEN) ?
3120                                   first_net_device(seq_file_net(seq)) :
3121                                   next_net_device((struct net_device *)v);
3122
3123         ++*pos;
3124         return rcu_dereference(dev);
3125 }
3126
3127 void dev_seq_stop(struct seq_file *seq, void *v)
3128         __releases(RCU)
3129 {
3130         rcu_read_unlock();
3131 }
3132
3133 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3134 {
3135         const struct net_device_stats *stats = dev_get_stats(dev);
3136
3137         seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
3138                    "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
3139                    dev->name, stats->rx_bytes, stats->rx_packets,
3140                    stats->rx_errors,
3141                    stats->rx_dropped + stats->rx_missed_errors,
3142                    stats->rx_fifo_errors,
3143                    stats->rx_length_errors + stats->rx_over_errors +
3144                     stats->rx_crc_errors + stats->rx_frame_errors,
3145                    stats->rx_compressed, stats->multicast,
3146                    stats->tx_bytes, stats->tx_packets,
3147                    stats->tx_errors, stats->tx_dropped,
3148                    stats->tx_fifo_errors, stats->collisions,
3149                    stats->tx_carrier_errors +
3150                     stats->tx_aborted_errors +
3151                     stats->tx_window_errors +
3152                     stats->tx_heartbeat_errors,
3153                    stats->tx_compressed);
3154 }
3155
3156 /*
3157  *      Called from the PROCfs module. This now uses the new arbitrary sized
3158  *      /proc/net interface to create /proc/net/dev
3159  */
3160 static int dev_seq_show(struct seq_file *seq, void *v)
3161 {
3162         if (v == SEQ_START_TOKEN)
3163                 seq_puts(seq, "Inter-|   Receive                            "
3164                               "                    |  Transmit\n"
3165                               " face |bytes    packets errs drop fifo frame "
3166                               "compressed multicast|bytes    packets errs "
3167                               "drop fifo colls carrier compressed\n");
3168         else
3169                 dev_seq_printf_stats(seq, v);
3170         return 0;
3171 }
3172
3173 static struct netif_rx_stats *softnet_get_online(loff_t *pos)
3174 {
3175         struct netif_rx_stats *rc = NULL;
3176
3177         while (*pos < nr_cpu_ids)
3178                 if (cpu_online(*pos)) {
3179                         rc = &per_cpu(netdev_rx_stat, *pos);
3180                         break;
3181                 } else
3182                         ++*pos;
3183         return rc;
3184 }
3185
3186 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3187 {
3188         return softnet_get_online(pos);
3189 }
3190
3191 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3192 {
3193         ++*pos;
3194         return softnet_get_online(pos);
3195 }
3196
/* Nothing was acquired in softnet_seq_start(), so nothing to release. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
        return;
}
3200
3201 static int softnet_seq_show(struct seq_file *seq, void *v)
3202 {
3203         struct netif_rx_stats *s = v;
3204
3205         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
3206                    s->total, s->dropped, s->time_squeeze, 0,
3207                    0, 0, 0, 0, /* was fastroute */
3208                    s->cpu_collision);
3209         return 0;
3210 }
3211
3212 static const struct seq_operations dev_seq_ops = {
3213         .start = dev_seq_start,
3214         .next  = dev_seq_next,
3215         .stop  = dev_seq_stop,
3216         .show  = dev_seq_show,
3217 };
3218
3219 static int dev_seq_open(struct inode *inode, struct file *file)
3220 {
3221         return seq_open_net(inode, file, &dev_seq_ops,
3222                             sizeof(struct seq_net_private));
3223 }
3224
3225 static const struct file_operations dev_seq_fops = {
3226         .owner   = THIS_MODULE,
3227         .open    = dev_seq_open,
3228         .read    = seq_read,
3229         .llseek  = seq_lseek,
3230         .release = seq_release_net,
3231 };
3232
3233 static const struct seq_operations softnet_seq_ops = {
3234         .start = softnet_seq_start,
3235         .next  = softnet_seq_next,
3236         .stop  = softnet_seq_stop,
3237         .show  = softnet_seq_show,
3238 };
3239
3240 static int softnet_seq_open(struct inode *inode, struct file *file)
3241 {
3242         return seq_open(file, &softnet_seq_ops);
3243 }
3244
3245 static const struct file_operations softnet_seq_fops = {
3246         .owner   = THIS_MODULE,
3247         .open    = softnet_seq_open,
3248         .read    = seq_read,
3249         .llseek  = seq_lseek,
3250         .release = seq_release,
3251 };
3252
3253 static void *ptype_get_idx(loff_t pos)
3254 {
3255         struct packet_type *pt = NULL;
3256         loff_t i = 0;
3257         int t;
3258
3259         list_for_each_entry_rcu(pt, &ptype_all, list) {
3260                 if (i == pos)
3261                         return pt;
3262                 ++i;
3263         }
3264
3265         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
3266                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3267                         if (i == pos)
3268                                 return pt;
3269                         ++i;
3270                 }
3271         }
3272         return NULL;
3273 }
3274
3275 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
3276         __acquires(RCU)
3277 {
3278         rcu_read_lock();
3279         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3280 }
3281
3282 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3283 {
3284         struct packet_type *pt;
3285         struct list_head *nxt;
3286         int hash;
3287
3288         ++*pos;
3289         if (v == SEQ_START_TOKEN)
3290                 return ptype_get_idx(0);
3291
3292         pt = v;
3293         nxt = pt->list.next;
3294         if (pt->type == htons(ETH_P_ALL)) {
3295                 if (nxt != &ptype_all)
3296                         goto found;
3297                 hash = 0;
3298                 nxt = ptype_base[0].next;
3299         } else
3300                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
3301
3302         while (nxt == &ptype_base[hash]) {
3303                 if (++hash >= PTYPE_HASH_SIZE)
3304                         return NULL;
3305                 nxt = ptype_base[hash].next;
3306         }
3307 found:
3308         return list_entry(nxt, struct packet_type, list);
3309 }
3310
3311 static void ptype_seq_stop(struct seq_file *seq, void *v)
3312         __releases(RCU)
3313 {
3314         rcu_read_unlock();
3315 }
3316
3317 static int ptype_seq_show(struct seq_file *seq, void *v)
3318 {
3319         struct packet_type *pt = v;
3320
3321         if (v == SEQ_START_TOKEN)
3322                 seq_puts(seq, "Type Device      Function\n");
3323         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
3324                 if (pt->type == htons(ETH_P_ALL))
3325                         seq_puts(seq, "ALL ");
3326                 else
3327                         seq_printf(seq, "%04x", ntohs(pt->type));
3328
3329                 seq_printf(seq, " %-8s %pF\n",
3330                            pt->dev ? pt->dev->name : "", pt->func);
3331         }
3332
3333         return 0;
3334 }
3335
3336 static const struct seq_operations ptype_seq_ops = {
3337         .start = ptype_seq_start,
3338         .next  = ptype_seq_next,
3339         .stop  = ptype_seq_stop,
3340         .show  = ptype_seq_show,
3341 };
3342
3343 static int ptype_seq_open(struct inode *inode, struct file *file)
3344 {
3345         return seq_open_net(inode, file, &ptype_seq_ops,
3346                         sizeof(struct seq_net_private));
3347 }
3348
3349 static const struct file_operations ptype_seq_fops = {
3350         .owner   = THIS_MODULE,
3351         .open    = ptype_seq_open,
3352         .read    = seq_read,
3353         .llseek  = seq_lseek,
3354         .release = seq_release_net,
3355 };
3356
3357
3358 static int __net_init dev_proc_net_init(struct net *net)
3359 {
3360         int rc = -ENOMEM;
3361
3362         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
3363                 goto out;
3364         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
3365                 goto out_dev;
3366         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
3367                 goto out_softnet;
3368
3369         if (wext_proc_init(net))
3370                 goto out_ptype;
3371         rc = 0;
3372 out:
3373         return rc;
3374 out_ptype:
3375         proc_net_remove(net, "ptype");
3376 out_softnet:
3377         proc_net_remove(net, "softnet_stat");
3378 out_dev:
3379         proc_net_remove(net, "dev");
3380         goto out;
3381 }
3382
3383 static void __net_exit dev_proc_net_exit(struct net *net)
3384 {
3385         wext_proc_exit(net);
3386
3387         proc_net_remove(net, "ptype");
3388         proc_net_remove(net, "softnet_stat");
3389         proc_net_remove(net, "dev");
3390 }
3391
3392 static struct pernet_operations __net_initdata dev_proc_ops = {
3393         .init = dev_proc_net_init,
3394         .exit = dev_proc_net_exit,
3395 };
3396
3397 static int __init dev_proc_init(void)
3398 {
3399         return register_pernet_subsys(&dev_proc_ops);
3400 }
3401 #else
3402 #define dev_proc_init() 0
3403 #endif  /* CONFIG_PROC_FS */
3404
3405
3406 /**
3407  *      netdev_set_master       -       set up master/slave pair
3408  *      @slave: slave device
3409  *      @master: new master device
3410  *
3411  *      Changes the master device of the slave. Pass %NULL to break the
3412  *      bonding. The caller must hold the RTNL semaphore. On a failure
3413  *      a negative errno code is returned. On success the reference counts
3414  *      are adjusted, %RTM_NEWLINK is sent to the routing socket and the
3415  *      function returns zero.
3416  */
3417 int netdev_set_master(struct net_device *slave, struct net_device *master)
3418 {
3419         struct net_device *old = slave->master;
3420
3421         ASSERT_RTNL();
3422
3423         if (master) {
3424                 if (old)
3425                         return -EBUSY;
3426                 dev_hold(master);
3427         }
3428
3429         slave->master = master;
3430
3431         synchronize_net();
3432
3433         if (old)
3434                 dev_put(old);
3435
3436         if (master)
3437                 slave->flags |= IFF_SLAVE;
3438         else
3439                 slave->flags &= ~IFF_SLAVE;
3440
3441         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
3442         return 0;
3443 }
3444 EXPORT_SYMBOL(netdev_set_master);
3445
3446 static void dev_change_rx_flags(struct net_device *dev, int flags)
3447 {
3448         const struct net_device_ops *ops = dev->netdev_ops;
3449
3450         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
3451                 ops->ndo_change_rx_flags(dev, flags);
3452 }
3453
3454 static int __dev_set_promiscuity(struct net_device *dev, int inc)
3455 {
3456         unsigned short old_flags = dev->flags;
3457         uid_t uid;
3458         gid_t gid;
3459
3460         ASSERT_RTNL();
3461
3462         dev->flags |= IFF_PROMISC;
3463         dev->promiscuity += inc;
3464         if (dev->promiscuity == 0) {
3465                 /*
3466                  * Avoid overflow.
3467                  * If inc causes overflow, untouch promisc and return error.
3468                  */
3469                 if (inc < 0)
3470                         dev->flags &= ~IFF_PROMISC;
3471                 else {
3472                         dev->promiscuity -= inc;
3473                         printk(KERN_WARNING "%s: promiscuity touches roof, "
3474                                 "set promiscuity failed, promiscuity feature "
3475                                 "of device might be broken.\n", dev->name);
3476                         return -EOVERFLOW;
3477                 }
3478         }
3479         if (dev->flags != old_flags) {
3480                 printk(KERN_INFO "device %s %s promiscuous mode\n",
3481                        dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
3482                                                                "left");
3483                 if (audit_enabled) {
3484                         current_uid_gid(&uid, &gid);
3485                         audit_log(current->audit_context, GFP_ATOMIC,
3486                                 AUDIT_ANOM_PROMISCUOUS,
3487                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
3488                                 dev->name, (dev->flags & IFF_PROMISC),
3489                                 (old_flags & IFF_PROMISC),
3490                                 audit_get_loginuid(current),
3491                                 uid, gid,
3492                                 audit_get_sessionid(current));
3493                 }
3494
3495                 dev_change_rx_flags(dev, IFF_PROMISC);
3496         }
3497         return 0;
3498 }
3499
3500 /**
3501  *      dev_set_promiscuity     - update promiscuity count on a device
3502  *      @dev: device
3503  *      @inc: modifier
3504  *
3505  *      Add or remove promiscuity from a device. While the count in the device
3506  *      remains above zero the interface remains promiscuous. Once it hits zero
3507  *      the device reverts back to normal filtering operation. A negative inc
3508  *      value is used to drop promiscuity on the device.
3509  *      Return 0 if successful or a negative errno code on error.
3510  */
3511 int dev_set_promiscuity(struct net_device *dev, int inc)
3512 {
3513         unsigned short old_flags = dev->flags;
3514         int err;
3515
3516         err = __dev_set_promiscuity(dev, inc);
3517         if (err < 0)
3518                 return err;
3519         if (dev->flags != old_flags)
3520                 dev_set_rx_mode(dev);
3521         return err;
3522 }
3523 EXPORT_SYMBOL(dev_set_promiscuity);
3524
3525 /**
3526  *      dev_set_allmulti        - update allmulti count on a device
3527  *      @dev: device
3528  *      @inc: modifier
3529  *
3530  *      Add or remove reception of all multicast frames to a device. While the
3531  *      count in the device remains above zero the interface remains listening
3532  *      to all interfaces. Once it hits zero the device reverts back to normal
3533  *      filtering operation. A negative @inc value is used to drop the counter
3534  *      when releasing a resource needing all multicasts.
3535  *      Return 0 if successful or a negative errno code on error.
3536  */
3537
3538 int dev_set_allmulti(struct net_device *dev, int inc)
3539 {
3540         unsigned short old_flags = dev->flags;
3541
3542         ASSERT_RTNL();
3543
3544         dev->flags |= IFF_ALLMULTI;
3545         dev->allmulti += inc;
3546         if (dev->allmulti == 0) {
3547                 /*
3548                  * Avoid overflow.
3549                  * If inc causes overflow, untouch allmulti and return error.
3550                  */
3551                 if (inc < 0)
3552                         dev->flags &= ~IFF_ALLMULTI;
3553                 else {
3554                         dev->allmulti -= inc;
3555                         printk(KERN_WARNING "%s: allmulti touches roof, "
3556                                 "set allmulti failed, allmulti feature of "
3557                                 "device might be broken.\n", dev->name);
3558                         return -EOVERFLOW;
3559                 }
3560         }
3561         if (dev->flags ^ old_flags) {
3562                 dev_change_rx_flags(dev, IFF_ALLMULTI);
3563                 dev_set_rx_mode(dev);
3564         }
3565         return 0;
3566 }
3567 EXPORT_SYMBOL(dev_set_allmulti);
3568
3569 /*
3570  *      Upload unicast and multicast address lists to device and
3571  *      configure RX filtering. When the device doesn't support unicast
3572  *      filtering it is put in promiscuous mode while unicast addresses
3573  *      are present.
3574  */
3575 void __dev_set_rx_mode(struct net_device *dev)
3576 {
3577         const struct net_device_ops *ops = dev->netdev_ops;
3578
3579         /* dev_open will call this function so the list will stay sane. */
3580         if (!(dev->flags&IFF_UP))
3581                 return;
3582
3583         if (!netif_device_present(dev))
3584                 return;
3585
3586         if (ops->ndo_set_rx_mode)
3587                 ops->ndo_set_rx_mode(dev);
3588         else {
3589                 /* Unicast addresses changes may only happen under the rtnl,
3590                  * therefore calling __dev_set_promiscuity here is safe.
3591                  */
3592                 if (dev->uc.count > 0 && !dev->uc_promisc) {
3593                         __dev_set_promiscuity(dev, 1);
3594                         dev->uc_promisc = 1;
3595                 } else if (dev->uc.count == 0 && dev->uc_promisc) {
3596                         __dev_set_promiscuity(dev, -1);
3597                         dev->uc_promisc = 0;
3598                 }
3599
3600                 if (ops->ndo_set_multicast_list)
3601                         ops->ndo_set_multicast_list(dev);
3602         }
3603 }
3604
/*
 *	Locked wrapper: runs __dev_set_rx_mode() with the device's address
 *	lock held and bottom halves disabled.
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}
3611
3612 /* hw addresses list handling functions */
3613
3614 static int __hw_addr_add(struct netdev_hw_addr_list *list, unsigned char *addr,
3615                          int addr_len, unsigned char addr_type)
3616 {
3617         struct netdev_hw_addr *ha;
3618         int alloc_size;
3619
3620         if (addr_len > MAX_ADDR_LEN)
3621                 return -EINVAL;
3622
3623         list_for_each_entry(ha, &list->list, list) {
3624                 if (!memcmp(ha->addr, addr, addr_len) &&
3625                     ha->type == addr_type) {
3626                         ha->refcount++;
3627                         return 0;
3628                 }
3629         }
3630
3631
3632         alloc_size = sizeof(*ha);
3633         if (alloc_size < L1_CACHE_BYTES)
3634                 alloc_size = L1_CACHE_BYTES;
3635         ha = kmalloc(alloc_size, GFP_ATOMIC);
3636         if (!ha)
3637                 return -ENOMEM;
3638         memcpy(ha->addr, addr, addr_len);
3639         ha->type = addr_type;
3640         ha->refcount = 1;
3641         ha->synced = false;
3642         list_add_tail_rcu(&ha->list, &list->list);
3643         list->count++;
3644         return 0;
3645 }
3646
3647 static void ha_rcu_free(struct rcu_head *head)
3648 {
3649         struct netdev_hw_addr *ha;
3650
3651         ha = container_of(head, struct netdev_hw_addr, rcu_head);
3652         kfree(ha);
3653 }
3654
3655 static int __hw_addr_del(struct netdev_hw_addr_list *list, unsigned char *addr,
3656                          int addr_len, unsigned char addr_type)
3657 {
3658         struct netdev_hw_addr *ha;
3659
3660         list_for_each_entry(ha, &list->list, list) {
3661                 if (!memcmp(ha->addr, addr, addr_len) &&
3662                     (ha->type == addr_type || !addr_type)) {
3663                         if (--ha->refcount)
3664                                 return 0;
3665                         list_del_rcu(&ha->list);
3666                         call_rcu(&ha->rcu_head, ha_rcu_free);
3667                         list->count--;
3668                         return 0;
3669                 }
3670         }
3671         return -ENOENT;
3672 }
3673
3674 static int __hw_addr_add_multiple(struct netdev_hw_addr_list *to_list,
3675                                   struct netdev_hw_addr_list *from_list,
3676                                   int addr_len,
3677                                   unsigned char addr_type)
3678 {
3679         int err;
3680         struct netdev_hw_addr *ha, *ha2;
3681         unsigned char type;
3682
3683         list_for_each_entry(ha, &from_list->list, list) {
3684                 type = addr_type ? addr_type : ha->type;
3685                 err = __hw_addr_add(to_list, ha->addr, addr_len, type);
3686                 if (err)
3687                         goto unroll;
3688         }
3689         return 0;
3690
3691 unroll:
3692         list_for_each_entry(ha2, &from_list->list, list) {
3693                 if (ha2 == ha)
3694                         break;
3695                 type = addr_type ? addr_type : ha2->type;
3696                 __hw_addr_del(to_list, ha2->addr, addr_len, type);
3697         }
3698         return err;
3699 }
3700
3701 static void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list,
3702                                    struct netdev_hw_addr_list *from_list,
3703                                    int addr_len,
3704                                    unsigned char addr_type)
3705 {
3706         struct netdev_hw_addr *ha;
3707         unsigned char type;
3708
3709         list_for_each_entry(ha, &from_list->list, list) {
3710                 type = addr_type ? addr_type : ha->type;
3711                 __hw_addr_del(to_list, ha->addr, addr_len, addr_type);
3712         }
3713 }
3714
3715 static int __hw_addr_sync(struct netdev_hw_addr_list *to_list,
3716                           struct netdev_hw_addr_list *from_list,
3717                           int addr_len)
3718 {
3719         int err = 0;
3720         struct netdev_hw_addr *ha, *tmp;
3721
3722         list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
3723                 if (!ha->synced) {
3724                         err = __hw_addr_add(to_list, ha->addr,
3725                                             addr_len, ha->type);
3726                         if (err)
3727                                 break;
3728                         ha->synced = true;
3729                         ha->refcount++;
3730                 } else if (ha->refcount == 1) {
3731                         __hw_addr_del(to_list, ha->addr, addr_len, ha->type);
3732                         __hw_addr_del(from_list, ha->addr, addr_len, ha->type);
3733                 }
3734         }
3735         return err;
3736 }
3737
3738 static void __hw_addr_unsync(struct netdev_hw_addr_list *to_list,
3739                              struct netdev_hw_addr_list *from_list,
3740                              int addr_len)
3741 {
3742         struct netdev_hw_addr *ha, *tmp;
3743
3744         list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
3745                 if (ha->synced) {
3746                         __hw_addr_del(to_list, ha->addr,
3747                                       addr_len, ha->type);
3748                         ha->synced = false;
3749                         __hw_addr_del(from_list, ha->addr,
3750                                       addr_len, ha->type);
3751                 }
3752         }
3753 }
3754
3755 static void __hw_addr_flush(struct netdev_hw_addr_list *list)
3756 {
3757         struct netdev_hw_addr *ha, *tmp;
3758
3759         list_for_each_entry_safe(ha, tmp, &list->list, list) {
3760                 list_del_rcu(&ha->list);
3761                 call_rcu(&ha->rcu_head, ha_rcu_free);
3762         }
3763         list->count = 0;
3764 }
3765
3766 static void __hw_addr_init(struct netdev_hw_addr_list *list)
3767 {
3768         INIT_LIST_HEAD(&list->list);
3769         list->count = 0;
3770 }
3771
3772 /* Device addresses handling functions */
3773
3774 static void dev_addr_flush(struct net_device *dev)
3775 {
3776         /* rtnl_mutex must be held here */
3777
3778         __hw_addr_flush(&dev->dev_addrs);
3779         dev->dev_addr = NULL;
3780 }
3781
3782 static int dev_addr_init(struct net_device *dev)
3783 {
3784         unsigned char addr[MAX_ADDR_LEN];
3785         struct netdev_hw_addr *ha;
3786         int err;
3787
3788         /* rtnl_mutex must be held here */
3789
3790         __hw_addr_init(&dev->dev_addrs);
3791         memset(addr, 0, sizeof(addr));
3792         err = __hw_addr_add(&dev->dev_addrs, addr, sizeof(addr),
3793                             NETDEV_HW_ADDR_T_LAN);
3794         if (!err) {
3795                 /*
3796                  * Get the first (previously created) address from the list
3797                  * and set dev_addr pointer to this location.
3798                  */
3799                 ha = list_first_entry(&dev->dev_addrs.list,
3800                                       struct netdev_hw_addr, list);
3801                 dev->dev_addr = ha->addr;
3802         }
3803         return err;
3804 }
3805
3806 /**
3807  *      dev_addr_add    - Add a device address
3808  *      @dev: device
3809  *      @addr: address to add
3810  *      @addr_type: address type
3811  *
3812  *      Add a device address to the device or increase the reference count if
3813  *      it already exists.
3814  *
3815  *      The caller must hold the rtnl_mutex.
3816  */
3817 int dev_addr_add(struct net_device *dev, unsigned char *addr,
3818                  unsigned char addr_type)
3819 {
3820         int err;
3821
3822         ASSERT_RTNL();
3823
3824         err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type);
3825         if (!err)
3826                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3827         return err;
3828 }
3829 EXPORT_SYMBOL(dev_addr_add);
3830
3831 /**
3832  *      dev_addr_del    - Release a device address.
3833  *      @dev: device
3834  *      @addr: address to delete
3835  *      @addr_type: address type
3836  *
3837  *      Release reference to a device address and remove it from the device
3838  *      if the reference count drops to zero.
3839  *
3840  *      The caller must hold the rtnl_mutex.
3841  */
3842 int dev_addr_del(struct net_device *dev, unsigned char *addr,
3843                  unsigned char addr_type)
3844 {
3845         int err;
3846         struct netdev_hw_addr *ha;
3847
3848         ASSERT_RTNL();
3849
3850         /*
3851          * We can not remove the first address from the list because
3852          * dev->dev_addr points to that.
3853          */
3854         ha = list_first_entry(&dev->dev_addrs.list,
3855                               struct netdev_hw_addr, list);
3856         if (ha->addr == dev->dev_addr && ha->refcount == 1)
3857                 return -ENOENT;
3858
3859         err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len,
3860                             addr_type);
3861         if (!err)
3862                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3863         return err;
3864 }
3865 EXPORT_SYMBOL(dev_addr_del);
3866
3867 /**
3868  *      dev_addr_add_multiple   - Add device addresses from another device
3869  *      @to_dev: device to which addresses will be added
3870  *      @from_dev: device from which addresses will be added
3871  *      @addr_type: address type - 0 means type will be used from from_dev
3872  *
3873  *      Add device addresses of the one device to another.
3874  **
3875  *      The caller must hold the rtnl_mutex.
3876  */
3877 int dev_addr_add_multiple(struct net_device *to_dev,
3878                           struct net_device *from_dev,
3879                           unsigned char addr_type)
3880 {
3881         int err;
3882
3883         ASSERT_RTNL();
3884
3885         if (from_dev->addr_len != to_dev->addr_len)
3886                 return -EINVAL;
3887         err = __hw_addr_add_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs,
3888                                      to_dev->addr_len, addr_type);
3889         if (!err)
3890                 call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
3891         return err;
3892 }
3893 EXPORT_SYMBOL(dev_addr_add_multiple);
3894
3895 /**
3896  *      dev_addr_del_multiple   - Delete device addresses by another device
3897  *      @to_dev: device where the addresses will be deleted
3898  *      @from_dev: device by which addresses the addresses will be deleted
3899  *      @addr_type: address type - 0 means type will used from from_dev
3900  *
3901  *      Deletes addresses in to device by the list of addresses in from device.
3902  *
3903  *      The caller must hold the rtnl_mutex.
3904  */
3905 int dev_addr_del_multiple(struct net_device *to_dev,
3906                           struct net_device *from_dev,
3907                           unsigned char addr_type)
3908 {
3909         ASSERT_RTNL();
3910
3911         if (from_dev->addr_len != to_dev->addr_len)
3912                 return -EINVAL;
3913         __hw_addr_del_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs,
3914                                to_dev->addr_len, addr_type);
3915         call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
3916         return 0;
3917 }
3918 EXPORT_SYMBOL(dev_addr_del_multiple);
3919
3920 /* multicast addresses handling functions */
3921
3922 int __dev_addr_delete(struct dev_addr_list **list, int *count,
3923                       void *addr, int alen, int glbl)
3924 {
3925         struct dev_addr_list *da;
3926
3927         for (; (da = *list) != NULL; list = &da->next) {
3928                 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3929                     alen == da->da_addrlen) {
3930                         if (glbl) {
3931                                 int old_glbl = da->da_gusers;
3932                                 da->da_gusers = 0;
3933                                 if (old_glbl == 0)
3934                                         break;
3935                         }
3936                         if (--da->da_users)
3937                                 return 0;
3938
3939                         *list = da->next;
3940                         kfree(da);
3941                         (*count)--;
3942                         return 0;
3943                 }
3944         }
3945         return -ENOENT;
3946 }
3947
3948 int __dev_addr_add(struct dev_addr_list **list, int *count,
3949                    void *addr, int alen, int glbl)
3950 {
3951         struct dev_addr_list *da;
3952
3953         for (da = *list; da != NULL; da = da->next) {
3954                 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3955                     da->da_addrlen == alen) {
3956                         if (glbl) {
3957                                 int old_glbl = da->da_gusers;
3958                                 da->da_gusers = 1;
3959                                 if (old_glbl)
3960                                         return 0;
3961                         }
3962                         da->da_users++;
3963                         return 0;
3964                 }
3965         }
3966
3967         da = kzalloc(sizeof(*da), GFP_ATOMIC);
3968         if (da == NULL)
3969                 return -ENOMEM;
3970         memcpy(da->da_addr, addr, alen);
3971         da->da_addrlen = alen;
3972         da->da_users = 1;
3973         da->da_gusers = glbl ? 1 : 0;
3974         da->next = *list;
3975         *list = da;
3976         (*count)++;
3977         return 0;
3978 }
3979
3980 /**
3981  *      dev_unicast_delete      - Release secondary unicast address.
3982  *      @dev: device
3983  *      @addr: address to delete
3984  *
3985  *      Release reference to a secondary unicast address and remove it
3986  *      from the device if the reference count drops to zero.
3987  *
3988  *      The caller must hold the rtnl_mutex.
3989  */
3990 int dev_unicast_delete(struct net_device *dev, void *addr)
3991 {
3992         int err;
3993
3994         ASSERT_RTNL();
3995
3996         netif_addr_lock_bh(dev);
3997         err = __hw_addr_del(&dev->uc, addr, dev->addr_len,
3998                             NETDEV_HW_ADDR_T_UNICAST);
3999         if (!err)
4000                 __dev_set_rx_mode(dev);
4001         netif_addr_unlock_bh(dev);
4002         return err;
4003 }
4004 EXPORT_SYMBOL(dev_unicast_delete);
4005
4006 /**
4007  *      dev_unicast_add         - add a secondary unicast address
4008  *      @dev: device
4009  *      @addr: address to add
4010  *
4011  *      Add a secondary unicast address to the device or increase
4012  *      the reference count if it already exists.
4013  *
4014  *      The caller must hold the rtnl_mutex.
4015  */
4016 int dev_unicast_add(struct net_device *dev, void *addr)
4017 {
4018         int err;
4019
4020         ASSERT_RTNL();
4021
4022         netif_addr_lock_bh(dev);
4023         err = __hw_addr_add(&dev->uc, addr, dev->addr_len,
4024                             NETDEV_HW_ADDR_T_UNICAST);
4025         if (!err)
4026                 __dev_set_rx_mode(dev);
4027         netif_addr_unlock_bh(dev);
4028         return err;
4029 }
4030 EXPORT_SYMBOL(dev_unicast_add);
4031
4032 int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
4033                     struct dev_addr_list **from, int *from_count)
4034 {
4035         struct dev_addr_list *da, *next;
4036         int err = 0;
4037
4038         da = *from;
4039         while (da != NULL) {
4040                 next = da->next;
4041                 if (!da->da_synced) {
4042                         err = __dev_addr_add(to, to_count,
4043                                              da->da_addr, da->da_addrlen, 0);
4044                         if (err < 0)
4045                                 break;
4046                         da->da_synced = 1;
4047                         da->da_users++;
4048                 } else if (da->da_users == 1) {
4049                         __dev_addr_delete(to, to_count,
4050                                           da->da_addr, da->da_addrlen, 0);
4051                         __dev_addr_delete(from, from_count,
4052                                           da->da_addr, da->da_addrlen, 0);
4053                 }
4054                 da = next;
4055         }
4056         return err;
4057 }
4058 EXPORT_SYMBOL_GPL(__dev_addr_sync);
4059
4060 void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
4061                        struct dev_addr_list **from, int *from_count)
4062 {
4063         struct dev_addr_list *da, *next;
4064
4065         da = *from;
4066         while (da != NULL) {
4067                 next = da->next;
4068                 if (da->da_synced) {
4069                         __dev_addr_delete(to, to_count,
4070                                           da->da_addr, da->da_addrlen, 0);
4071                         da->da_synced = 0;
4072                         __dev_addr_delete(from, from_count,
4073                                           da->da_addr, da->da_addrlen, 0);
4074                 }
4075                 da = next;
4076         }
4077 }
4078 EXPORT_SYMBOL_GPL(__dev_addr_unsync);
4079
4080 /**
4081  *      dev_unicast_sync - Synchronize device's unicast list to another device
4082  *      @to: destination device
4083  *      @from: source device
4084  *
4085  *      Add newly added addresses to the destination device and release
4086  *      addresses that have no users left. The source device must be
4087  *      locked by netif_tx_lock_bh.
4088  *
4089  *      This function is intended to be called from the dev->set_rx_mode
4090  *      function of layered software devices.
4091  */
4092 int dev_unicast_sync(struct net_device *to, struct net_device *from)
4093 {
4094         int err = 0;
4095
4096         if (to->addr_len != from->addr_len)
4097                 return -EINVAL;
4098
4099         netif_addr_lock_bh(to);
4100         err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len);
4101         if (!err)
4102                 __dev_set_rx_mode(to);
4103         netif_addr_unlock_bh(to);
4104         return err;
4105 }
4106 EXPORT_SYMBOL(dev_unicast_sync);
4107
4108 /**
4109  *      dev_unicast_unsync - Remove synchronized addresses from the destination device
4110  *      @to: destination device
4111  *      @from: source device
4112  *
4113  *      Remove all addresses that were added to the destination device by
4114  *      dev_unicast_sync(). This function is intended to be called from the
4115  *      dev->stop function of layered software devices.
4116  */
4117 void dev_unicast_unsync(struct net_device *to, struct net_device *from)
4118 {
4119         if (to->addr_len != from->addr_len)
4120                 return;
4121
4122         netif_addr_lock_bh(from);
4123         netif_addr_lock(to);
4124         __hw_addr_unsync(&to->uc, &from->uc, to->addr_len);
4125         __dev_set_rx_mode(to);
4126         netif_addr_unlock(to);
4127         netif_addr_unlock_bh(from);
4128 }
4129 EXPORT_SYMBOL(dev_unicast_unsync);
4130
4131 static void dev_unicast_flush(struct net_device *dev)
4132 {
4133         netif_addr_lock_bh(dev);
4134         __hw_addr_flush(&dev->uc);
4135         netif_addr_unlock_bh(dev);
4136 }
4137
4138 static void dev_unicast_init(struct net_device *dev)
4139 {
4140         __hw_addr_init(&dev->uc);
4141 }
4142
4143
4144 static void __dev_addr_discard(struct dev_addr_list **list)
4145 {
4146         struct dev_addr_list *tmp;
4147
4148         while (*list != NULL) {
4149                 tmp = *list;
4150                 *list = tmp->next;
4151                 if (tmp->da_users > tmp->da_gusers)
4152                         printk("__dev_addr_discard: address leakage! "
4153                                "da_users=%d\n", tmp->da_users);
4154                 kfree(tmp);
4155         }
4156 }
4157
4158 static void dev_addr_discard(struct net_device *dev)
4159 {
4160         netif_addr_lock_bh(dev);
4161
4162         __dev_addr_discard(&dev->mc_list);
4163         dev->mc_count = 0;
4164
4165         netif_addr_unlock_bh(dev);
4166 }
4167
4168 /**
4169  *      dev_get_flags - get flags reported to userspace
4170  *      @dev: device
4171  *
4172  *      Get the combination of flag bits exported through APIs to userspace.
4173  */
4174 unsigned dev_get_flags(const struct net_device *dev)
4175 {
4176         unsigned flags;
4177
4178         flags = (dev->flags & ~(IFF_PROMISC |
4179                                 IFF_ALLMULTI |
4180                                 IFF_RUNNING |
4181                                 IFF_LOWER_UP |
4182                                 IFF_DORMANT)) |
4183                 (dev->gflags & (IFF_PROMISC |
4184                                 IFF_ALLMULTI));
4185
4186         if (netif_running(dev)) {
4187                 if (netif_oper_up(dev))
4188                         flags |= IFF_RUNNING;
4189                 if (netif_carrier_ok(dev))
4190                         flags |= IFF_LOWER_UP;
4191                 if (netif_dormant(dev))
4192                         flags |= IFF_DORMANT;
4193         }
4194
4195         return flags;
4196 }
4197 EXPORT_SYMBOL(dev_get_flags);
4198
4199 /**
4200  *      dev_change_flags - change device settings
4201  *      @dev: device
4202  *      @flags: device state flags
4203  *
4204  *      Change settings on device based state flags. The flags are
4205  *      in the userspace exported format.
4206  */
4207 int dev_change_flags(struct net_device *dev, unsigned flags)
4208 {
4209         int ret, changes;
4210         int old_flags = dev->flags;
4211
4212         ASSERT_RTNL();
4213
4214         /*
4215          *      Set the flags on our device.
4216          */
4217
4218         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4219                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4220                                IFF_AUTOMEDIA)) |
4221                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4222                                     IFF_ALLMULTI));
4223
4224         /*
4225          *      Load in the correct multicast list now the flags have changed.
4226          */
4227
4228         if ((old_flags ^ flags) & IFF_MULTICAST)
4229                 dev_change_rx_flags(dev, IFF_MULTICAST);
4230
4231         dev_set_rx_mode(dev);
4232
4233         /*
4234          *      Have we downed the interface. We handle IFF_UP ourselves
4235          *      according to user attempts to set it, rather than blindly
4236          *      setting it.
4237          */
4238
4239         ret = 0;
4240         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4241                 ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
4242
4243                 if (!ret)
4244                         dev_set_rx_mode(dev);
4245         }
4246
4247         if (dev->flags & IFF_UP &&
4248             ((old_flags ^ dev->flags) & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
4249                                           IFF_VOLATILE)))
4250                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4251
4252         if ((flags ^ dev->gflags) & IFF_PROMISC) {
4253                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4254
4255                 dev->gflags ^= IFF_PROMISC;
4256                 dev_set_promiscuity(dev, inc);
4257         }
4258
4259         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4260            is important. Some (broken) drivers set IFF_PROMISC, when
4261            IFF_ALLMULTI is requested not asking us and not reporting.
4262          */
4263         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4264                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4265
4266                 dev->gflags ^= IFF_ALLMULTI;
4267                 dev_set_allmulti(dev, inc);
4268         }
4269
4270         /* Exclude state transition flags, already notified */
4271         changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING);
4272         if (changes)
4273                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4274
4275         return ret;
4276 }
4277 EXPORT_SYMBOL(dev_change_flags);
4278
4279 /**
4280  *      dev_set_mtu - Change maximum transfer unit
4281  *      @dev: device
4282  *      @new_mtu: new transfer unit
4283  *
4284  *      Change the maximum transfer size of the network device.
4285  */
4286 int dev_set_mtu(struct net_device *dev, int new_mtu)
4287 {
4288         const struct net_device_ops *ops = dev->netdev_ops;
4289         int err;
4290
4291         if (new_mtu == dev->mtu)
4292                 return 0;
4293
4294         /*      MTU must be positive.    */
4295         if (new_mtu < 0)
4296                 return -EINVAL;
4297
4298         if (!netif_device_present(dev))
4299                 return -ENODEV;
4300
4301         err = 0;
4302         if (ops->ndo_change_mtu)
4303                 err = ops->ndo_change_mtu(dev, new_mtu);
4304         else
4305                 dev->mtu = new_mtu;
4306
4307         if (!err && dev->flags & IFF_UP)
4308                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4309         return err;
4310 }
4311 EXPORT_SYMBOL(dev_set_mtu);
4312
4313 /**
4314  *      dev_set_mac_address - Change Media Access Control Address
4315  *      @dev: device
4316  *      @sa: new address
4317  *
4318  *      Change the hardware (MAC) address of the device
4319  */
4320 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4321 {
4322         const struct net_device_ops *ops = dev->netdev_ops;
4323         int err;
4324
4325         if (!ops->ndo_set_mac_address)
4326                 return -EOPNOTSUPP;
4327         if (sa->sa_family != dev->type)
4328                 return -EINVAL;
4329         if (!netif_device_present(dev))
4330                 return -ENODEV;
4331         err = ops->ndo_set_mac_address(dev, sa);
4332         if (!err)
4333                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4334         return err;
4335 }
4336 EXPORT_SYMBOL(dev_set_mac_address);
4337
4338 /*
4339  *      Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4340  */
4341 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4342 {
4343         int err;
4344         struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4345
4346         if (!dev)
4347                 return -ENODEV;
4348
4349         switch (cmd) {
4350         case SIOCGIFFLAGS:      /* Get interface flags */
4351                 ifr->ifr_flags = (short) dev_get_flags(dev);
4352                 return 0;
4353
4354         case SIOCGIFMETRIC:     /* Get the metric on the interface
4355                                    (currently unused) */
4356                 ifr->ifr_metric = 0;
4357                 return 0;
4358
4359         case SIOCGIFMTU:        /* Get the MTU of a device */
4360                 ifr->ifr_mtu = dev->mtu;
4361                 return 0;
4362
4363         case SIOCGIFHWADDR:
4364                 if (!dev->addr_len)
4365                         memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4366                 else
4367                         memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4368                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4369                 ifr->ifr_hwaddr.sa_family = dev->type;
4370                 return 0;
4371
4372         case SIOCGIFSLAVE:
4373                 err = -EINVAL;
4374                 break;
4375
4376         case SIOCGIFMAP:
4377                 ifr->ifr_map.mem_start = dev->mem_start;
4378                 ifr->ifr_map.mem_end   = dev->mem_end;
4379                 ifr->ifr_map.base_addr = dev->base_addr;
4380                 ifr->ifr_map.irq       = dev->irq;
4381                 ifr->ifr_map.dma       = dev->dma;
4382                 ifr->ifr_map.port      = dev->if_port;
4383                 return 0;
4384
4385         case SIOCGIFINDEX:
4386                 ifr->ifr_ifindex = dev->ifindex;
4387                 return 0;
4388
4389         case SIOCGIFTXQLEN:
4390                 ifr->ifr_qlen = dev->tx_queue_len;
4391                 return 0;
4392
4393         default:
4394                 /* dev_ioctl() should ensure this case
4395                  * is never reached
4396                  */
4397                 WARN_ON(1);
4398                 err = -EINVAL;
4399                 break;
4400
4401         }
4402         return err;
4403 }
4404
4405 /*
4406  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
4407  */
4408 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4409 {
4410         int err;
4411         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4412         const struct net_device_ops *ops;
4413
4414         if (!dev)
4415                 return -ENODEV;
4416
4417         ops = dev->netdev_ops;
4418
4419         switch (cmd) {
4420         case SIOCSIFFLAGS:      /* Set interface flags */
4421                 return dev_change_flags(dev, ifr->ifr_flags);
4422
4423         case SIOCSIFMETRIC:     /* Set the metric on the interface
4424                                    (currently unused) */
4425                 return -EOPNOTSUPP;
4426
4427         case SIOCSIFMTU:        /* Set the MTU of a device */
4428                 return dev_set_mtu(dev, ifr->ifr_mtu);
4429
4430         case SIOCSIFHWADDR:
4431                 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4432
4433         case SIOCSIFHWBROADCAST:
4434                 if (ifr->ifr_hwaddr.sa_family != dev->type)
4435                         return -EINVAL;
4436                 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4437                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4438                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4439                 return 0;
4440
4441         case SIOCSIFMAP:
4442                 if (ops->ndo_set_config) {
4443                         if (!netif_device_present(dev))
4444                                 return -ENODEV;
4445                         return ops->ndo_set_config(dev, &ifr->ifr_map);
4446                 }
4447                 return -EOPNOTSUPP;
4448
4449         case SIOCADDMULTI:
4450                 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4451                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4452                         return -EINVAL;
4453                 if (!netif_device_present(dev))
4454                         return -ENODEV;
4455                 return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
4456                                   dev->addr_len, 1);
4457
4458         case SIOCDELMULTI:
4459                 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4460                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4461                         return -EINVAL;
4462                 if (!netif_device_present(dev))
4463                         return -ENODEV;
4464                 return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
4465                                      dev->addr_len, 1);
4466
4467         case SIOCSIFTXQLEN:
4468                 if (ifr->ifr_qlen < 0)
4469                         return -EINVAL;
4470                 dev->tx_queue_len = ifr->ifr_qlen;
4471                 return 0;
4472
4473         case SIOCSIFNAME:
4474                 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4475                 return dev_change_name(dev, ifr->ifr_newname);
4476
4477         /*
4478          *      Unknown or private ioctl
4479          */
4480         default:
4481                 if ((cmd >= SIOCDEVPRIVATE &&
4482                     cmd <= SIOCDEVPRIVATE + 15) ||
4483                     cmd == SIOCBONDENSLAVE ||
4484                     cmd == SIOCBONDRELEASE ||
4485                     cmd == SIOCBONDSETHWADDR ||
4486                     cmd == SIOCBONDSLAVEINFOQUERY ||
4487                     cmd == SIOCBONDINFOQUERY ||
4488                     cmd == SIOCBONDCHANGEACTIVE ||
4489                     cmd == SIOCGMIIPHY ||
4490                     cmd == SIOCGMIIREG ||
4491                     cmd == SIOCSMIIREG ||
4492                     cmd == SIOCBRADDIF ||
4493                     cmd == SIOCBRDELIF ||
4494                     cmd == SIOCSHWTSTAMP ||
4495                     cmd == SIOCWANDEV) {
4496                         err = -EOPNOTSUPP;
4497                         if (ops->ndo_do_ioctl) {
4498                                 if (netif_device_present(dev))
4499                                         err = ops->ndo_do_ioctl(dev, ifr, cmd);
4500                                 else
4501                                         err = -ENODEV;
4502                         }
4503                 } else
4504                         err = -EINVAL;
4505
4506         }
4507         return err;
4508 }
4509
4510 /*
4511  *      This function handles all "interface"-type I/O control requests. The actual
4512  *      'doing' part of this is dev_ifsioc above.
4513  */
4514
4515 /**
4516  *      dev_ioctl       -       network device ioctl
4517  *      @net: the applicable net namespace
4518  *      @cmd: command to issue
4519  *      @arg: pointer to a struct ifreq in user space
4520  *
4521  *      Issue ioctl functions to devices. This is normally called by the
4522  *      user space syscall interfaces but can sometimes be useful for
4523  *      other purposes. The return value is the return from the syscall if
4524  *      positive or a negative errno code on error.
4525  */
4526
4527 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4528 {
4529         struct ifreq ifr;
4530         int ret;
4531         char *colon;
4532
4533         /* One special case: SIOCGIFCONF takes ifconf argument
4534            and requires shared lock, because it sleeps writing
4535            to user space.
4536          */
4537
4538         if (cmd == SIOCGIFCONF) {
4539                 rtnl_lock();
4540                 ret = dev_ifconf(net, (char __user *) arg);
4541                 rtnl_unlock();
4542                 return ret;
4543         }
4544         if (cmd == SIOCGIFNAME)
4545                 return dev_ifname(net, (struct ifreq __user *)arg);
4546
4547         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4548                 return -EFAULT;
4549
4550         ifr.ifr_name[IFNAMSIZ-1] = 0;
4551
4552         colon = strchr(ifr.ifr_name, ':');
4553         if (colon)
4554                 *colon = 0;
4555
4556         /*
4557          *      See which interface the caller is talking about.
4558          */
4559
4560         switch (cmd) {
4561         /*
4562          *      These ioctl calls:
4563          *      - can be done by all.
4564          *      - atomic and do not require locking.
4565          *      - return a value
4566          */
4567         case SIOCGIFFLAGS:
4568         case SIOCGIFMETRIC:
4569         case SIOCGIFMTU:
4570         case SIOCGIFHWADDR:
4571         case SIOCGIFSLAVE:
4572         case SIOCGIFMAP:
4573         case SIOCGIFINDEX:
4574         case SIOCGIFTXQLEN:
4575                 dev_load(net, ifr.ifr_name);
4576                 rcu_read_lock();
4577                 ret = dev_ifsioc_locked(net, &ifr, cmd);
4578                 rcu_read_unlock();
4579                 if (!ret) {
4580                         if (colon)
4581                                 *colon = ':';
4582                         if (copy_to_user(arg, &ifr,
4583                                          sizeof(struct ifreq)))
4584                                 ret = -EFAULT;
4585                 }
4586                 return ret;
4587
4588         case SIOCETHTOOL:
4589                 dev_load(net, ifr.ifr_name);
4590                 rtnl_lock();
4591                 ret = dev_ethtool(net, &ifr);
4592                 rtnl_unlock();
4593                 if (!ret) {
4594                         if (colon)
4595                                 *colon = ':';
4596                         if (copy_to_user(arg, &ifr,
4597                                          sizeof(struct ifreq)))
4598                                 ret = -EFAULT;
4599                 }
4600                 return ret;
4601
4602         /*
4603          *      These ioctl calls:
4604          *      - require superuser power.
4605          *      - require strict serialization.
4606          *      - return a value
4607          */
4608         case SIOCGMIIPHY:
4609         case SIOCGMIIREG:
4610         case SIOCSIFNAME:
4611                 if (!capable(CAP_NET_ADMIN))
4612                         return -EPERM;
4613                 dev_load(net, ifr.ifr_name);
4614                 rtnl_lock();
4615                 ret = dev_ifsioc(net, &ifr, cmd);
4616                 rtnl_unlock();
4617                 if (!ret) {
4618                         if (colon)
4619                                 *colon = ':';
4620                         if (copy_to_user(arg, &ifr,
4621                                          sizeof(struct ifreq)))
4622                                 ret = -EFAULT;
4623                 }
4624                 return ret;
4625
4626         /*
4627          *      These ioctl calls:
4628          *      - require superuser power.
4629          *      - require strict serialization.
4630          *      - do not return a value
4631          */
4632         case SIOCSIFFLAGS:
4633         case SIOCSIFMETRIC:
4634         case SIOCSIFMTU:
4635         case SIOCSIFMAP:
4636         case SIOCSIFHWADDR:
4637         case SIOCSIFSLAVE:
4638         case SIOCADDMULTI:
4639         case SIOCDELMULTI:
4640         case SIOCSIFHWBROADCAST:
4641         case SIOCSIFTXQLEN:
4642         case SIOCSMIIREG:
4643         case SIOCBONDENSLAVE:
4644         case SIOCBONDRELEASE:
4645         case SIOCBONDSETHWADDR:
4646         case SIOCBONDCHANGEACTIVE:
4647         case SIOCBRADDIF:
4648         case SIOCBRDELIF:
4649         case SIOCSHWTSTAMP:
4650                 if (!capable(CAP_NET_ADMIN))
4651                         return -EPERM;
4652                 /* fall through */
4653         case SIOCBONDSLAVEINFOQUERY:
4654         case SIOCBONDINFOQUERY:
4655                 dev_load(net, ifr.ifr_name);
4656                 rtnl_lock();
4657                 ret = dev_ifsioc(net, &ifr, cmd);
4658                 rtnl_unlock();
4659                 return ret;
4660
4661         case SIOCGIFMEM:
4662                 /* Get the per device memory space. We can add this but
4663                  * currently do not support it */
4664         case SIOCSIFMEM:
4665                 /* Set the per device memory buffer space.
4666                  * Not applicable in our case */
4667         case SIOCSIFLINK:
4668                 return -EINVAL;
4669
4670         /*
4671          *      Unknown or private ioctl.
4672          */
4673         default:
4674                 if (cmd == SIOCWANDEV ||
4675                     (cmd >= SIOCDEVPRIVATE &&
4676                      cmd <= SIOCDEVPRIVATE + 15)) {
4677                         dev_load(net, ifr.ifr_name);
4678                         rtnl_lock();
4679                         ret = dev_ifsioc(net, &ifr, cmd);
4680                         rtnl_unlock();
4681                         if (!ret && copy_to_user(arg, &ifr,
4682                                                  sizeof(struct ifreq)))
4683                                 ret = -EFAULT;
4684                         return ret;
4685                 }
4686                 /* Take care of Wireless Extensions */
4687                 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4688                         return wext_handle_ioctl(net, &ifr, cmd, arg);
4689                 return -EINVAL;
4690         }
4691 }
4692
4693
/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Advances a function-static counter, restarting at 1 whenever it is
 *	no longer positive, until __dev_get_by_index() finds no device in
 *	@net using that number, and returns it.  The caller must hold the
 *	rtnl semaphore or the dev_base_lock to be sure the value remains
 *	unique.
 */
static int dev_new_index(struct net *net)
{
	static int last_ifindex;

	do {
		last_ifindex++;
		if (last_ifindex <= 0)
			last_ifindex = 1;
	} while (__dev_get_by_index(net, last_ifindex));

	return last_ifindex;
}
4712
4713 /* Delayed registration/unregisteration */
4714 static LIST_HEAD(net_todo_list);
4715
4716 static void net_set_todo(struct net_device *dev)
4717 {
4718         list_add_tail(&dev->todo_list, &net_todo_list);
4719 }
4720
4721 static void rollback_registered_many(struct list_head *head)
4722 {
4723         struct net_device *dev;
4724
4725         BUG_ON(dev_boot_phase);
4726         ASSERT_RTNL();
4727
4728         list_for_each_entry(dev, head, unreg_list) {
4729                 /* Some devices call without registering
4730                  * for initialization unwind.
4731                  */
4732                 if (dev->reg_state == NETREG_UNINITIALIZED) {
4733                         pr_debug("unregister_netdevice: device %s/%p never "
4734                                  "was registered\n", dev->name, dev);
4735
4736                         WARN_ON(1);
4737                         return;
4738                 }
4739
4740                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
4741
4742                 /* If device is running, close it first. */
4743                 dev_close(dev);
4744
4745                 /* And unlink it from device chain. */
4746                 unlist_netdevice(dev);
4747
4748                 dev->reg_state = NETREG_UNREGISTERING;
4749         }
4750
4751         synchronize_net();
4752
4753         list_for_each_entry(dev, head, unreg_list) {
4754                 /* Shutdown queueing discipline. */
4755                 dev_shutdown(dev);
4756
4757
4758                 /* Notify protocols, that we are about to destroy
4759                    this device. They should clean all the things.
4760                 */
4761                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4762
4763                 /*
4764                  *      Flush the unicast and multicast chains
4765                  */
4766                 dev_unicast_flush(dev);
4767                 dev_addr_discard(dev);
4768
4769                 if (dev->netdev_ops->ndo_uninit)
4770                         dev->netdev_ops->ndo_uninit(dev);
4771
4772                 /* Notifier chain MUST detach us from master device. */
4773                 WARN_ON(dev->master);
4774
4775                 /* Remove entries from kobject tree */
4776                 netdev_unregister_kobject(dev);
4777         }
4778
4779         synchronize_net();
4780
4781         list_for_each_entry(dev, head, unreg_list)
4782                 dev_put(dev);
4783 }
4784
4785 static void rollback_registered(struct net_device *dev)
4786 {
4787         LIST_HEAD(single);
4788
4789         list_add(&dev->unreg_list, &single);
4790         rollback_registered_many(&single);
4791 }
4792
4793 static void __netdev_init_queue_locks_one(struct net_device *dev,
4794                                           struct netdev_queue *dev_queue,
4795                                           void *_unused)
4796 {
4797         spin_lock_init(&dev_queue->_xmit_lock);
4798         netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
4799         dev_queue->xmit_lock_owner = -1;
4800 }
4801
4802 static void netdev_init_queue_locks(struct net_device *dev)
4803 {
4804         netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
4805         __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
4806 }
4807
4808 unsigned long netdev_fix_features(unsigned long features, const char *name)
4809 {
4810         /* Fix illegal SG+CSUM combinations. */
4811         if ((features & NETIF_F_SG) &&
4812             !(features & NETIF_F_ALL_CSUM)) {
4813                 if (name)
4814                         printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4815                                "checksum feature.\n", name);
4816                 features &= ~NETIF_F_SG;
4817         }
4818
4819         /* TSO requires that SG is present as well. */
4820         if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4821                 if (name)
4822                         printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4823                                "SG feature.\n", name);
4824                 features &= ~NETIF_F_TSO;
4825         }
4826
4827         if (features & NETIF_F_UFO) {
4828                 if (!(features & NETIF_F_GEN_CSUM)) {
4829                         if (name)
4830                                 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4831                                        "since no NETIF_F_HW_CSUM feature.\n",
4832                                        name);
4833                         features &= ~NETIF_F_UFO;
4834                 }
4835
4836                 if (!(features & NETIF_F_SG)) {
4837                         if (name)
4838                                 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4839                                        "since no NETIF_F_SG feature.\n", name);
4840                         features &= ~NETIF_F_UFO;
4841                 }
4842         }
4843
4844         return features;
4845 }
4846 EXPORT_SYMBOL(netdev_fix_features);
4847
4848 /**
4849  *      register_netdevice      - register a network device
4850  *      @dev: device to register
4851  *
4852  *      Take a completed network device structure and add it to the kernel
4853  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4854  *      chain. 0 is returned on success. A negative errno code is returned
4855  *      on a failure to set up the device, or if the name is a duplicate.
4856  *
4857  *      Callers must hold the rtnl semaphore. You may want
4858  *      register_netdev() instead of this.
4859  *
4860  *      BUGS:
4861  *      The locking appears insufficient to guarantee two parallel registers
4862  *      will not get the same name.
4863  */
4864
4865 int register_netdevice(struct net_device *dev)
4866 {
4867         struct hlist_head *head;
4868         struct hlist_node *p;
4869         int ret;
4870         struct net *net = dev_net(dev);
4871
4872         BUG_ON(dev_boot_phase);
4873         ASSERT_RTNL();
4874
4875         might_sleep();
4876
4877         /* When net_device's are persistent, this will be fatal. */
4878         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
4879         BUG_ON(!net);
4880
4881         spin_lock_init(&dev->addr_list_lock);
4882         netdev_set_addr_lockdep_class(dev);
4883         netdev_init_queue_locks(dev);
4884
4885         dev->iflink = -1;
4886
4887         /* Init, if this function is available */
4888         if (dev->netdev_ops->ndo_init) {
4889                 ret = dev->netdev_ops->ndo_init(dev);
4890                 if (ret) {
4891                         if (ret > 0)
4892                                 ret = -EIO;
4893                         goto out;
4894                 }
4895         }
4896
4897         if (!dev_valid_name(dev->name)) {
4898                 ret = -EINVAL;
4899                 goto err_uninit;
4900         }
4901
4902         dev->ifindex = dev_new_index(net);
4903         if (dev->iflink == -1)
4904                 dev->iflink = dev->ifindex;
4905
4906         /* Check for existence of name */
4907         head = dev_name_hash(net, dev->name);
4908         hlist_for_each(p, head) {
4909                 struct net_device *d
4910                         = hlist_entry(p, struct net_device, name_hlist);
4911                 if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
4912                         ret = -EEXIST;
4913                         goto err_uninit;
4914                 }
4915         }
4916
4917         /* Fix illegal checksum combinations */
4918         if ((dev->features & NETIF_F_HW_CSUM) &&
4919             (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4920                 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
4921                        dev->name);
4922                 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
4923         }
4924
4925         if ((dev->features & NETIF_F_NO_CSUM) &&
4926             (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4927                 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
4928                        dev->name);
4929                 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
4930         }
4931
4932         dev->features = netdev_fix_features(dev->features, dev->name);
4933
4934         /* Enable software GSO if SG is supported. */
4935         if (dev->features & NETIF_F_SG)
4936                 dev->features |= NETIF_F_GSO;
4937
4938         netdev_initialize_kobject(dev);
4939
4940         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
4941         ret = notifier_to_errno(ret);
4942         if (ret)
4943                 goto err_uninit;
4944
4945         ret = netdev_register_kobject(dev);
4946         if (ret)
4947                 goto err_uninit;
4948         dev->reg_state = NETREG_REGISTERED;
4949
4950         /*
4951          *      Default initial state at registry is that the
4952          *      device is present.
4953          */
4954
4955         set_bit(__LINK_STATE_PRESENT, &dev->state);
4956
4957         dev_init_scheduler(dev);
4958         dev_hold(dev);
4959         list_netdevice(dev);
4960
4961         /* Notify protocols, that a new device appeared. */
4962         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
4963         ret = notifier_to_errno(ret);
4964         if (ret) {
4965                 rollback_registered(dev);
4966                 dev->reg_state = NETREG_UNREGISTERED;
4967         }
4968
4969 out:
4970         return ret;
4971
4972 err_uninit:
4973         if (dev->netdev_ops->ndo_uninit)
4974                 dev->netdev_ops->ndo_uninit(dev);
4975         goto out;
4976 }
4977 EXPORT_SYMBOL(register_netdevice);
4978
4979 /**
4980  *      init_dummy_netdev       - init a dummy network device for NAPI
4981  *      @dev: device to init
4982  *
4983  *      This takes a network device structure and initialize the minimum
4984  *      amount of fields so it can be used to schedule NAPI polls without
4985  *      registering a full blown interface. This is to be used by drivers
4986  *      that need to tie several hardware interfaces to a single NAPI
4987  *      poll scheduler due to HW limitations.
4988  */
4989 int init_dummy_netdev(struct net_device *dev)
4990 {
4991         /* Clear everything. Note we don't initialize spinlocks
4992          * are they aren't supposed to be taken by any of the
4993          * NAPI code and this dummy netdev is supposed to be
4994          * only ever used for NAPI polls
4995          */
4996         memset(dev, 0, sizeof(struct net_device));
4997
4998         /* make sure we BUG if trying to hit standard
4999          * register/unregister code path
5000          */
5001         dev->reg_state = NETREG_DUMMY;
5002
5003         /* initialize the ref count */
5004         atomic_set(&dev->refcnt, 1);
5005
5006         /* NAPI wants this */
5007         INIT_LIST_HEAD(&dev->napi_list);
5008
5009         /* a dummy interface is started by default */
5010         set_bit(__LINK_STATE_PRESENT, &dev->state);
5011         set_bit(__LINK_STATE_START, &dev->state);
5012
5013         return 0;
5014 }
5015 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5016
5017
5018 /**
5019  *      register_netdev - register a network device
5020  *      @dev: device to register
5021  *
5022  *      Take a completed network device structure and add it to the kernel
5023  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5024  *      chain. 0 is returned on success. A negative errno code is returned
5025  *      on a failure to set up the device, or if the name is a duplicate.
5026  *
5027  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
5028  *      and expands the device name if you passed a format string to
5029  *      alloc_netdev.
5030  */
5031 int register_netdev(struct net_device *dev)
5032 {
5033         int err;
5034
5035         rtnl_lock();
5036
5037         /*
5038          * If the name is a format string the caller wants us to do a
5039          * name allocation.
5040          */
5041         if (strchr(dev->name, '%')) {
5042                 err = dev_alloc_name(dev, dev->name);
5043                 if (err < 0)
5044                         goto out;
5045         }
5046
5047         err = register_netdevice(dev);
5048 out:
5049         rtnl_unlock();
5050         return err;
5051 }
5052 EXPORT_SYMBOL(register_netdev);
5053
5054 /*
5055  * netdev_wait_allrefs - wait until all references are gone.
5056  *
5057  * This is called when unregistering network devices.
5058  *
5059  * Any protocol or device that holds a reference should register
5060  * for netdevice notification, and cleanup and put back the
5061  * reference if they receive an UNREGISTER event.
5062  * We can get stuck here if buggy protocols don't correctly
5063  * call dev_put.
5064  */
5065 static void netdev_wait_allrefs(struct net_device *dev)
5066 {
5067         unsigned long rebroadcast_time, warning_time;
5068
5069         rebroadcast_time = warning_time = jiffies;
5070         while (atomic_read(&dev->refcnt) != 0) {
5071                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5072                         rtnl_lock();
5073
5074                         /* Rebroadcast unregister notification */
5075                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5076
5077                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5078                                      &dev->state)) {
5079                                 /* We must not have linkwatch events
5080                                  * pending on unregister. If this
5081                                  * happens, we simply run the queue
5082                                  * unscheduled, resulting in a noop
5083                                  * for this device.
5084                                  */
5085                                 linkwatch_run_queue();
5086                         }
5087
5088                         __rtnl_unlock();
5089
5090                         rebroadcast_time = jiffies;
5091                 }
5092
5093                 msleep(250);
5094
5095                 if (time_after(jiffies, warning_time + 10 * HZ)) {
5096                         printk(KERN_EMERG "unregister_netdevice: "
5097                                "waiting for %s to become free. Usage "
5098                                "count = %d\n",
5099                                dev->name, atomic_read(&dev->refcnt));
5100                         warning_time = jiffies;
5101                 }
5102         }
5103 }
5104
5105 /* The sequence is:
5106  *
5107  *      rtnl_lock();
5108  *      ...
5109  *      register_netdevice(x1);
5110  *      register_netdevice(x2);
5111  *      ...
5112  *      unregister_netdevice(y1);
5113  *      unregister_netdevice(y2);
5114  *      ...
5115  *      rtnl_unlock();
5116  *      free_netdev(y1);
5117  *      free_netdev(y2);
5118  *
5119  * We are invoked by rtnl_unlock().
5120  * This allows us to deal with problems:
5121  * 1) We can delete sysfs objects which invoke hotplug
5122  *    without deadlocking with linkwatch via keventd.
5123  * 2) Since we run with the RTNL semaphore not held, we can sleep
5124  *    safely in order to wait for the netdev refcnt to drop to zero.
5125  *
5126  * We must not return until all unregister events added during
5127  * the interval the lock was held have been completed.
5128  */
5129 void netdev_run_todo(void)
5130 {
5131         struct list_head list;
5132
5133         /* Snapshot list, allow later requests */
5134         list_replace_init(&net_todo_list, &list);
5135
5136         __rtnl_unlock();
5137
5138         while (!list_empty(&list)) {
5139                 struct net_device *dev
5140                         = list_entry(list.next, struct net_device, todo_list);
5141                 list_del(&dev->todo_list);
5142
5143                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5144                         printk(KERN_ERR "network todo '%s' but state %d\n",
5145                                dev->name, dev->reg_state);
5146                         dump_stack();
5147                         continue;
5148                 }
5149
5150                 dev->reg_state = NETREG_UNREGISTERED;
5151
5152                 on_each_cpu(flush_backlog, dev, 1);
5153
5154                 netdev_wait_allrefs(dev);
5155
5156                 /* paranoia */
5157                 BUG_ON(atomic_read(&dev->refcnt));
5158                 WARN_ON(dev->ip_ptr);
5159                 WARN_ON(dev->ip6_ptr);
5160                 WARN_ON(dev->dn_ptr);
5161
5162                 if (dev->destructor)
5163                         dev->destructor(dev);
5164
5165                 /* Free network device */
5166                 kobject_put(&dev->dev.kobj);
5167         }
5168 }
5169
5170 /**
5171  *      dev_get_stats   - get network device statistics
5172  *      @dev: device to get statistics from
5173  *
5174  *      Get network statistics from device. The device driver may provide
5175  *      its own method by setting dev->netdev_ops->get_stats; otherwise
5176  *      the internal statistics structure is used.
5177  */
5178 const struct net_device_stats *dev_get_stats(struct net_device *dev)
5179 {
5180         const struct net_device_ops *ops = dev->netdev_ops;
5181
5182         if (ops->ndo_get_stats)
5183                 return ops->ndo_get_stats(dev);
5184         else {
5185                 unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
5186                 struct net_device_stats *stats = &dev->stats;
5187                 unsigned int i;
5188                 struct netdev_queue *txq;
5189
5190                 for (i = 0; i < dev->num_tx_queues; i++) {
5191                         txq = netdev_get_tx_queue(dev, i);
5192                         tx_bytes   += txq->tx_bytes;
5193                         tx_packets += txq->tx_packets;
5194                         tx_dropped += txq->tx_dropped;
5195                 }
5196                 if (tx_bytes || tx_packets || tx_dropped) {
5197                         stats->tx_bytes   = tx_bytes;
5198                         stats->tx_packets = tx_packets;
5199                         stats->tx_dropped = tx_dropped;
5200                 }
5201                 return stats;
5202         }
5203 }
5204 EXPORT_SYMBOL(dev_get_stats);
5205
5206 static void netdev_init_one_queue(struct net_device *dev,
5207                                   struct netdev_queue *queue,
5208                                   void *_unused)
5209 {
5210         queue->dev = dev;
5211 }
5212
5213 static void netdev_init_queues(struct net_device *dev)
5214 {
5215         netdev_init_one_queue(dev, &dev->rx_queue, NULL);
5216         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5217         spin_lock_init(&dev->tx_global_lock);
5218 }
5219
5220 /**
5221  *      alloc_netdev_mq - allocate network device
5222  *      @sizeof_priv:   size of private data to allocate space for
5223  *      @name:          device name format string
5224  *      @setup:         callback to initialize device
5225  *      @queue_count:   the number of subqueues to allocate
5226  *
5227  *      Allocates a struct net_device with private data area for driver use
5228  *      and performs basic initialization.  Also allocates subquue structs
5229  *      for each queue on the device at the end of the netdevice.
5230  */
5231 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5232                 void (*setup)(struct net_device *), unsigned int queue_count)
5233 {
5234         struct netdev_queue *tx;
5235         struct net_device *dev;
5236         size_t alloc_size;
5237         struct net_device *p;
5238
5239         BUG_ON(strlen(name) >= sizeof(dev->name));
5240
5241         alloc_size = sizeof(struct net_device);
5242         if (sizeof_priv) {
5243                 /* ensure 32-byte alignment of private area */
5244                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5245                 alloc_size += sizeof_priv;
5246         }
5247         /* ensure 32-byte alignment of whole construct */
5248         alloc_size += NETDEV_ALIGN - 1;
5249
5250         p = kzalloc(alloc_size, GFP_KERNEL);
5251         if (!p) {
5252                 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5253                 return NULL;
5254         }
5255
5256         tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
5257         if (!tx) {
5258                 printk(KERN_ERR "alloc_netdev: Unable to allocate "
5259                        "tx qdiscs.\n");
5260                 goto free_p;
5261         }
5262
5263         dev = PTR_ALIGN(p, NETDEV_ALIGN);
5264         dev->padded = (char *)dev - (char *)p;
5265
5266         if (dev_addr_init(dev))
5267                 goto free_tx;
5268
5269         dev_unicast_init(dev);
5270
5271         dev_net_set(dev, &init_net);
5272
5273         dev->_tx = tx;
5274         dev->num_tx_queues = queue_count;
5275         dev->real_num_tx_queues = queue_count;
5276
5277         dev->gso_max_size = GSO_MAX_SIZE;
5278
5279         netdev_init_queues(dev);
5280
5281         INIT_LIST_HEAD(&dev->napi_list);
5282         INIT_LIST_HEAD(&dev->unreg_list);
5283         dev->priv_flags = IFF_XMIT_DST_RELEASE;
5284         setup(dev);
5285         strcpy(dev->name, name);
5286         return dev;
5287
5288 free_tx:
5289         kfree(tx);
5290
5291 free_p:
5292         kfree(p);
5293         return NULL;
5294 }
5295 EXPORT_SYMBOL(alloc_netdev_mq);
5296
5297 /**
5298  *      free_netdev - free network device
5299  *      @dev: device
5300  *
5301  *      This function does the last stage of destroying an allocated device
5302  *      interface. The reference to the device object is released.
5303  *      If this is the last reference then it will be freed.
5304  */
5305 void free_netdev(struct net_device *dev)
5306 {
5307         struct napi_struct *p, *n;
5308
5309         release_net(dev_net(dev));
5310
5311         kfree(dev->_tx);
5312
5313         /* Flush device addresses */
5314         dev_addr_flush(dev);
5315
5316         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5317                 netif_napi_del(p);
5318
5319         /*  Compatibility with error handling in drivers */
5320         if (dev->reg_state == NETREG_UNINITIALIZED) {
5321                 kfree((char *)dev - dev->padded);
5322                 return;
5323         }
5324
5325         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5326         dev->reg_state = NETREG_RELEASED;
5327
5328         /* will free via device release */
5329         put_device(&dev->dev);
5330 }
5331 EXPORT_SYMBOL(free_netdev);
5332
/**
 *	synchronize_net - synchronize with packet receive processing
 *
 *	Waits, via an RCU grace period, for packets that are currently
 *	being received to be done.  Packets that start being received
 *	later are not blocked.  May sleep.
 */
void synchronize_net(void)
{
	might_sleep();
	synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);
5345
5346 /**
5347  *      unregister_netdevice_queue - remove device from the kernel
5348  *      @dev: device
5349  *      @head: list
5350
5351  *      This function shuts down a device interface and removes it
5352  *      from the kernel tables.
5353  *      If head not NULL, device is queued to be unregistered later.
5354  *
5355  *      Callers must hold the rtnl semaphore.  You may want
5356  *      unregister_netdev() instead of this.
5357  */
5358
5359 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5360 {
5361         ASSERT_RTNL();
5362
5363         if (head) {
5364                 list_move_tail(&dev->unreg_list, head);
5365         } else {
5366                 rollback_registered(dev);
5367                 /* Finish processing unregister after unlock */
5368                 net_set_todo(dev);
5369         }
5370 }
5371 EXPORT_SYMBOL(unregister_netdevice_queue);
5372
5373 /**
5374  *      unregister_netdevice_many - unregister many devices
5375  *      @head: list of devices
5376  *
5377  */
5378 void unregister_netdevice_many(struct list_head *head)
5379 {
5380         struct net_device *dev;
5381
5382         if (!list_empty(head)) {
5383                 rollback_registered_many(head);
5384                 list_for_each_entry(dev, head, unreg_list)
5385                         net_set_todo(dev);
5386         }
5387 }
5388 EXPORT_SYMBOL(unregister_netdevice_many);
5389
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shuts down a device interface and removes it from the kernel
 *	tables.
 *
 *	This is a thin wrapper that takes the rtnl semaphore around
 *	unregister_netdevice().  In general this is the function to use,
 *	rather than unregister_netdevice() itself.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
5408
5409 /**
5410  *      dev_change_net_namespace - move device to different nethost namespace
5411  *      @dev: device
5412  *      @net: network namespace
5413  *      @pat: If not NULL name pattern to try if the current device name
5414  *            is already taken in the destination network namespace.
5415  *
5416  *      This function shuts down a device interface and moves it
5417  *      to a new network namespace. On success 0 is returned, on
5418  *      a failure a netagive errno code is returned.
5419  *
5420  *      Callers must hold the rtnl semaphore.
5421  */
5422
5423 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5424 {
5425         char buf[IFNAMSIZ];
5426         const char *destname;
5427         int err;
5428
5429         ASSERT_RTNL();
5430
5431         /* Don't allow namespace local devices to be moved. */
5432         err = -EINVAL;
5433         if (dev->features & NETIF_F_NETNS_LOCAL)
5434                 goto out;
5435
5436 #ifdef CONFIG_SYSFS
5437         /* Don't allow real devices to be moved when sysfs
5438          * is enabled.
5439          */
5440         err = -EINVAL;
5441         if (dev->dev.parent)
5442                 goto out;
5443 #endif
5444
5445         /* Ensure the device has been registrered */
5446         err = -EINVAL;
5447         if (dev->reg_state != NETREG_REGISTERED)
5448                 goto out;
5449
5450         /* Get out if there is nothing todo */
5451         err = 0;
5452         if (net_eq(dev_net(dev), net))
5453                 goto out;
5454
5455         /* Pick the destination device name, and ensure
5456          * we can use it in the destination network namespace.
5457          */
5458         err = -EEXIST;
5459         destname = dev->name;
5460         if (__dev_get_by_name(net, destname)) {
5461                 /* We get here if we can't use the current device name */
5462                 if (!pat)
5463                         goto out;
5464                 if (!dev_valid_name(pat))
5465                         goto out;
5466                 if (strchr(pat, '%')) {
5467                         if (__dev_alloc_name(net, pat, buf) < 0)
5468                                 goto out;
5469                         destname = buf;
5470                 } else
5471                         destname = pat;
5472                 if (__dev_get_by_name(net, destname))
5473                         goto out;
5474         }
5475
5476         /*
5477          * And now a mini version of register_netdevice unregister_netdevice.
5478          */
5479
5480         /* If device is running close it first. */
5481         dev_close(dev);
5482
5483         /* And unlink it from device chain */
5484         err = -ENODEV;
5485         unlist_netdevice(dev);
5486
5487         synchronize_net();
5488
5489         /* Shutdown queueing discipline. */
5490         dev_shutdown(dev);
5491
5492         /* Notify protocols, that we are about to destroy
5493            this device. They should clean all the things.
5494         */
5495         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5496
5497         /*
5498          *      Flush the unicast and multicast chains
5499          */
5500         dev_unicast_flush(dev);
5501         dev_addr_discard(dev);
5502
5503         netdev_unregister_kobject(dev);
5504
5505         /* Actually switch the network namespace */
5506         dev_net_set(dev, net);
5507
5508         /* Assign the new device name */
5509         if (destname != dev->name)
5510                 strcpy(dev->name, destname);
5511
5512         /* If there is an ifindex conflict assign a new one */
5513         if (__dev_get_by_index(net, dev->ifindex)) {
5514                 int iflink = (dev->iflink == dev->ifindex);
5515                 dev->ifindex = dev_new_index(net);
5516                 if (iflink)
5517                         dev->iflink = dev->ifindex;
5518         }
5519
5520         /* Fixup kobjects */
5521         err = netdev_register_kobject(dev);
5522         WARN_ON(err);
5523
5524         /* Add the device back in the hashes */
5525         list_netdevice(dev);
5526
5527         /* Notify protocols, that a new device appeared. */
5528         call_netdevice_notifiers(NETDEV_REGISTER, dev);
5529
5530         synchronize_net();
5531         err = 0;
5532 out:
5533         return err;
5534 }
5535 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
5536
5537 static int dev_cpu_callback(struct notifier_block *nfb,
5538                             unsigned long action,
5539                             void *ocpu)
5540 {
5541         struct sk_buff **list_skb;
5542         struct Qdisc **list_net;
5543         struct sk_buff *skb;
5544         unsigned int cpu, oldcpu = (unsigned long)ocpu;
5545         struct softnet_data *sd, *oldsd;
5546
5547         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
5548                 return NOTIFY_OK;
5549
5550         local_irq_disable();
5551         cpu = smp_processor_id();
5552         sd = &per_cpu(softnet_data, cpu);
5553         oldsd = &per_cpu(softnet_data, oldcpu);
5554
5555         /* Find end of our completion_queue. */
5556         list_skb = &sd->completion_queue;
5557         while (*list_skb)
5558                 list_skb = &(*list_skb)->next;
5559         /* Append completion queue from offline CPU. */
5560         *list_skb = oldsd->completion_queue;
5561         oldsd->completion_queue = NULL;
5562
5563         /* Find end of our output_queue. */
5564         list_net = &sd->output_queue;
5565         while (*list_net)
5566                 list_net = &(*list_net)->next_sched;
5567         /* Append output queue from offline CPU. */
5568         *list_net = oldsd->output_queue;
5569         oldsd->output_queue = NULL;
5570
5571         raise_softirq_irqoff(NET_TX_SOFTIRQ);
5572         local_irq_enable();
5573
5574         /* Process offline CPU's input_pkt_queue */
5575         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
5576                 netif_rx(skb);
5577
5578         return NOTIFY_OK;
5579 }
5580
5581
5582 /**
5583  *      netdev_increment_features - increment feature set by one
5584  *      @all: current feature set
5585  *      @one: new feature set
5586  *      @mask: mask feature set
5587  *
5588  *      Computes a new feature set after adding a device with feature set
5589  *      @one to the master device with current feature set @all.  Will not
5590  *      enable anything that is off in @mask. Returns the new feature set.
5591  */
5592 unsigned long netdev_increment_features(unsigned long all, unsigned long one,
5593                                         unsigned long mask)
5594 {
5595         /* If device needs checksumming, downgrade to it. */
5596         if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
5597                 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
5598         else if (mask & NETIF_F_ALL_CSUM) {
5599                 /* If one device supports v4/v6 checksumming, set for all. */
5600                 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
5601                     !(all & NETIF_F_GEN_CSUM)) {
5602                         all &= ~NETIF_F_ALL_CSUM;
5603                         all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
5604                 }
5605
5606                 /* If one device supports hw checksumming, set for all. */
5607                 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
5608                         all &= ~NETIF_F_ALL_CSUM;
5609                         all |= NETIF_F_HW_CSUM;
5610                 }
5611         }
5612
5613         one |= NETIF_F_ALL_CSUM;
5614
5615         one |= all & NETIF_F_ONE_FOR_ALL;
5616         all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
5617         all |= one & mask & NETIF_F_ONE_FOR_ALL;
5618
5619         return all;
5620 }
5621 EXPORT_SYMBOL(netdev_increment_features);
5622
5623 static struct hlist_head *netdev_create_hash(void)
5624 {
5625         int i;
5626         struct hlist_head *hash;
5627
5628         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
5629         if (hash != NULL)
5630                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
5631                         INIT_HLIST_HEAD(&hash[i]);
5632
5633         return hash;
5634 }
5635
5636 /* Initialize per network namespace state */
5637 static int __net_init netdev_init(struct net *net)
5638 {
5639         INIT_LIST_HEAD(&net->dev_base_head);
5640
5641         net->dev_name_head = netdev_create_hash();
5642         if (net->dev_name_head == NULL)
5643                 goto err_name;
5644
5645         net->dev_index_head = netdev_create_hash();
5646         if (net->dev_index_head == NULL)
5647                 goto err_idx;
5648
5649         return 0;
5650
5651 err_idx:
5652         kfree(net->dev_name_head);
5653 err_name:
5654         return -ENOMEM;
5655 }
5656
5657 /**
5658  *      netdev_drivername - network driver for the device
5659  *      @dev: network device
5660  *      @buffer: buffer for resulting name
5661  *      @len: size of buffer
5662  *
5663  *      Determine network driver for device.
5664  */
5665 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
5666 {
5667         const struct device_driver *driver;
5668         const struct device *parent;
5669
5670         if (len <= 0 || !buffer)
5671                 return buffer;
5672         buffer[0] = 0;
5673
5674         parent = dev->dev.parent;
5675
5676         if (!parent)
5677                 return buffer;
5678
5679         driver = parent->driver;
5680         if (driver && driver->name)
5681                 strlcpy(buffer, driver->name, len);
5682         return buffer;
5683 }
5684
5685 static void __net_exit netdev_exit(struct net *net)
5686 {
5687         kfree(net->dev_name_head);
5688         kfree(net->dev_index_head);
5689 }
5690
5691 static struct pernet_operations __net_initdata netdev_net_ops = {
5692         .init = netdev_init,
5693         .exit = netdev_exit,
5694 };
5695
5696 static void __net_exit default_device_exit(struct net *net)
5697 {
5698         struct net_device *dev;
5699         /*
5700          * Push all migratable of the network devices back to the
5701          * initial network namespace
5702          */
5703         rtnl_lock();
5704 restart:
5705         for_each_netdev(net, dev) {
5706                 int err;
5707                 char fb_name[IFNAMSIZ];
5708
5709                 /* Ignore unmoveable devices (i.e. loopback) */
5710                 if (dev->features & NETIF_F_NETNS_LOCAL)
5711                         continue;
5712
5713                 /* Delete virtual devices */
5714                 if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) {
5715                         dev->rtnl_link_ops->dellink(dev, NULL);
5716                         goto restart;
5717                 }
5718
5719                 /* Push remaing network devices to init_net */
5720                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
5721                 err = dev_change_net_namespace(dev, &init_net, fb_name);
5722                 if (err) {
5723                         printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
5724                                 __func__, dev->name, err);
5725                         BUG();
5726                 }
5727                 goto restart;
5728         }
5729         rtnl_unlock();
5730 }
5731
5732 static struct pernet_operations __net_initdata default_device_ops = {
5733         .exit = default_device_exit,
5734 };
5735
5736 /*
5737  *      Initialize the DEV module. At boot time this walks the device list and
5738  *      unhooks any devices that fail to initialise (normally hardware not
5739  *      present) and leaves us with a valid list of present and active devices.
5740  *
5741  */
5742
5743 /*
5744  *       This is called single threaded during boot, so no need
5745  *       to take the rtnl semaphore.
5746  */
5747 static int __init net_dev_init(void)
5748 {
5749         int i, rc = -ENOMEM;
5750
5751         BUG_ON(!dev_boot_phase);
5752
5753         if (dev_proc_init())
5754                 goto out;
5755
5756         if (netdev_kobject_init())
5757                 goto out;
5758
5759         INIT_LIST_HEAD(&ptype_all);
5760         for (i = 0; i < PTYPE_HASH_SIZE; i++)
5761                 INIT_LIST_HEAD(&ptype_base[i]);
5762
5763         if (register_pernet_subsys(&netdev_net_ops))
5764                 goto out;
5765
5766         /*
5767          *      Initialise the packet receive queues.
5768          */
5769
5770         for_each_possible_cpu(i) {
5771                 struct softnet_data *queue;
5772
5773                 queue = &per_cpu(softnet_data, i);
5774                 skb_queue_head_init(&queue->input_pkt_queue);
5775                 queue->completion_queue = NULL;
5776                 INIT_LIST_HEAD(&queue->poll_list);
5777
5778                 queue->backlog.poll = process_backlog;
5779                 queue->backlog.weight = weight_p;
5780                 queue->backlog.gro_list = NULL;
5781                 queue->backlog.gro_count = 0;
5782         }
5783
5784         dev_boot_phase = 0;
5785
5786         /* The loopback device is special if any other network devices
5787          * is present in a network namespace the loopback device must
5788          * be present. Since we now dynamically allocate and free the
5789          * loopback device ensure this invariant is maintained by
5790          * keeping the loopback device as the first device on the
5791          * list of network devices.  Ensuring the loopback devices
5792          * is the first device that appears and the last network device
5793          * that disappears.
5794          */
5795         if (register_pernet_device(&loopback_net_ops))
5796                 goto out;
5797
5798         if (register_pernet_device(&default_device_ops))
5799                 goto out;
5800
5801         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
5802         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
5803
5804         hotcpu_notifier(dev_cpu_callback, 0);
5805         dst_init();
5806         dev_mcast_init();
5807         rc = 0;
5808 out:
5809         return rc;
5810 }
5811
5812 subsys_initcall(net_dev_init);
5813
5814 static int __init initialize_hashrnd(void)
5815 {
5816         get_random_bytes(&skb_tx_hashrnd, sizeof(skb_tx_hashrnd));
5817         return 0;
5818 }
5819
5820 late_initcall_sync(initialize_hashrnd);
5821