1 /*
2  *      NET3    Protocol independent device support routines.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  *      Derived from the non IP parts of dev.c 1.0.19
10  *              Authors:        Ross Biro
11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *      Additional Authors:
15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
17  *              David Hinds <dahinds@users.sourceforge.net>
18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *              Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *      Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *                                      to 2 if register_netdev gets called
25  *                                      before net_dev_init & also removed a
26  *                                      few lines of code in the process.
27  *              Alan Cox        :       device private ioctl copies fields back.
28  *              Alan Cox        :       Transmit queue code does relevant
29  *                                      stunts to keep the queue safe.
30  *              Alan Cox        :       Fixed double lock.
31  *              Alan Cox        :       Fixed promisc NULL pointer trap
32  *              ????????        :       Support the full private ioctl range
33  *              Alan Cox        :       Moved ioctl permission check into
34  *                                      drivers
35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
36  *              Alan Cox        :       100 backlog just doesn't cut it when
37  *                                      you start doing multicast video 8)
38  *              Alan Cox        :       Rewrote net_bh and list manager.
39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
40  *              Alan Cox        :       Took out transmit every packet pass
41  *                                      Saved a few bytes in the ioctl handler
42  *              Alan Cox        :       Network driver sets packet type before
43  *                                      calling netif_rx. Saves a function
44  *                                      call a packet.
45  *              Alan Cox        :       Hashed net_bh()
46  *              Richard Kooijman:       Timestamp fixes.
47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
48  *              Alan Cox        :       Device lock protection.
49  *              Alan Cox        :       Fixed nasty side effect of device close
50  *                                      changes.
51  *              Rudi Cilibrasi  :       Pass the right thing to
52  *                                      set_mac_address()
53  *              Dave Miller     :       32bit quantity for the device lock to
54  *                                      make it work out on a Sparc.
55  *              Bjorn Ekwall    :       Added KERNELD hack.
56  *              Alan Cox        :       Cleaned up the backlog initialise.
57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
58  *                                      1 device.
59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
60  *                                      is no device open function.
61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
63  *              Cyrus Durgin    :       Cleaned for KMOD
64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
65  *                                      A network device unload needs to purge
66  *                                      the backlog queue.
67  *      Paul Rusty Russell      :       SIOCSIFNAME
68  *              Pekka Riikonen  :       Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *                                      indefinitely on dev->refcnt
71  *              J Hadi Salim    :       - Backlog queue sampling
72  *                                      - netif_rx() feedback
73  */
74
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
98 #include <net/sock.h>
99 #include <net/busy_poll.h>
100 #include <linux/rtnetlink.h>
101 #include <linux/stat.h>
102 #include <net/dst.h>
103 #include <net/dst_metadata.h>
104 #include <net/pkt_sched.h>
105 #include <net/checksum.h>
106 #include <net/xfrm.h>
107 #include <linux/highmem.h>
108 #include <linux/init.h>
109 #include <linux/module.h>
110 #include <linux/netpoll.h>
111 #include <linux/rcupdate.h>
112 #include <linux/delay.h>
113 #include <net/iw_handler.h>
114 #include <asm/current.h>
115 #include <linux/audit.h>
116 #include <linux/dmaengine.h>
117 #include <linux/err.h>
118 #include <linux/ctype.h>
119 #include <linux/if_arp.h>
120 #include <linux/if_vlan.h>
121 #include <linux/ip.h>
122 #include <net/ip.h>
123 #include <net/mpls.h>
124 #include <linux/ipv6.h>
125 #include <linux/in.h>
126 #include <linux/jhash.h>
127 #include <linux/random.h>
128 #include <trace/events/napi.h>
129 #include <trace/events/net.h>
130 #include <trace/events/skb.h>
131 #include <linux/pci.h>
132 #include <linux/inetdevice.h>
133 #include <linux/cpu_rmap.h>
134 #include <linux/static_key.h>
135 #include <linux/hashtable.h>
136 #include <linux/vmalloc.h>
137 #include <linux/if_macvlan.h>
138 #include <linux/errqueue.h>
139 #include <linux/hrtimer.h>
140 #include <linux/netfilter_ingress.h>
141 #include <linux/sctp.h>
142 #include <linux/crash_dump.h>
143
144 #include "net-sysfs.h"
145
146 /* Instead of increasing this, you should create a hash table. */
147 #define MAX_GRO_SKBS 8
148
149 /* This should be increased if a protocol with a bigger head is added. */
150 #define GRO_MAX_HEAD (MAX_HEADER + 128)
151
152 static DEFINE_SPINLOCK(ptype_lock);
153 static DEFINE_SPINLOCK(offload_lock);
154 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
155 struct list_head ptype_all __read_mostly;       /* Taps */
156 static struct list_head offload_base __read_mostly;
157
158 static int netif_rx_internal(struct sk_buff *skb);
159 static int call_netdevice_notifiers_info(unsigned long val,
160                                          struct net_device *dev,
161                                          struct netdev_notifier_info *info);
162
163 /*
164  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
165  * semaphore.
166  *
167  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
168  *
169  * Writers must hold the rtnl semaphore while they loop through the
170  * dev_base_head list, and hold dev_base_lock for writing when they do the
171  * actual updates.  This allows pure readers to access the list even
172  * while a writer is preparing to update it.
173  *
174  * To put it another way, dev_base_lock is held for writing only to
175  * protect against pure readers; the rtnl semaphore provides the
176  * protection against other writers.
177  *
178  * See, for example usages, register_netdevice() and
179  * unregister_netdevice(), which must be called with the rtnl
180  * semaphore held.
181  */
182 DEFINE_RWLOCK(dev_base_lock);
183 EXPORT_SYMBOL(dev_base_lock);
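
/*
 * Illustrative sketch (not part of the original file): a pure reader walking
 * the per-namespace device list under rcu_read_lock(), per the locking rules
 * described above. The function name example_count_running_devices is
 * hypothetical.
 */
static unsigned int example_count_running_devices(struct net *net)
{
        struct net_device *dev;
        unsigned int n = 0;

        rcu_read_lock();
        for_each_netdev_rcu(net, dev)
                if (dev->flags & IFF_UP)
                        n++;
        rcu_read_unlock();

        return n;
}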
184
185 /* protects napi_hash addition/deletion and napi_gen_id */
186 static DEFINE_SPINLOCK(napi_hash_lock);
187
188 static unsigned int napi_gen_id = NR_CPUS;
189 static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
190
191 static seqcount_t devnet_rename_seq;
192
193 static inline void dev_base_seq_inc(struct net *net)
194 {
195         while (++net->dev_base_seq == 0);
196 }
197
198 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
199 {
200         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
201
202         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
203 }
204
205 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
206 {
207         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
208 }
209
210 static inline void rps_lock(struct softnet_data *sd)
211 {
212 #ifdef CONFIG_RPS
213         spin_lock(&sd->input_pkt_queue.lock);
214 #endif
215 }
216
217 static inline void rps_unlock(struct softnet_data *sd)
218 {
219 #ifdef CONFIG_RPS
220         spin_unlock(&sd->input_pkt_queue.lock);
221 #endif
222 }
223
224 /* Device list insertion */
225 static void list_netdevice(struct net_device *dev)
226 {
227         struct net *net = dev_net(dev);
228
229         ASSERT_RTNL();
230
231         write_lock_bh(&dev_base_lock);
232         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
233         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
234         hlist_add_head_rcu(&dev->index_hlist,
235                            dev_index_hash(net, dev->ifindex));
236         write_unlock_bh(&dev_base_lock);
237
238         dev_base_seq_inc(net);
239 }
240
241 /* Device list removal
242  * caller must respect a RCU grace period before freeing/reusing dev
243  */
244 static void unlist_netdevice(struct net_device *dev)
245 {
246         ASSERT_RTNL();
247
248         /* Unlink dev from the device chain */
249         write_lock_bh(&dev_base_lock);
250         list_del_rcu(&dev->dev_list);
251         hlist_del_rcu(&dev->name_hlist);
252         hlist_del_rcu(&dev->index_hlist);
253         write_unlock_bh(&dev_base_lock);
254
255         dev_base_seq_inc(dev_net(dev));
256 }
257
258 /*
259  *      Our notifier list
260  */
261
262 static RAW_NOTIFIER_HEAD(netdev_chain);
263
264 /*
265  *      Device drivers call our routines to queue packets here. We empty the
266  *      queue in the local softnet handler.
267  */
268
269 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
270 EXPORT_PER_CPU_SYMBOL(softnet_data);
271
272 #ifdef CONFIG_LOCKDEP
273 /*
274  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
275  * according to dev->type
276  */
277 static const unsigned short netdev_lock_type[] =
278         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
279          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
280          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
281          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
282          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
283          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
284          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
285          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
286          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
287          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
288          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
289          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
290          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
291          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
292          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
293
294 static const char *const netdev_lock_name[] =
295         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
296          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
297          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
298          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
299          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
300          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
301          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
302          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
303          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
304          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
305          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
306          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
307          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
308          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
309          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
310
311 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
312 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
313
314 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
315 {
316         int i;
317
318         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
319                 if (netdev_lock_type[i] == dev_type)
320                         return i;
321         /* the last key is used by default */
322         return ARRAY_SIZE(netdev_lock_type) - 1;
323 }
324
325 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
326                                                  unsigned short dev_type)
327 {
328         int i;
329
330         i = netdev_lock_pos(dev_type);
331         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
332                                    netdev_lock_name[i]);
333 }
334
335 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
336 {
337         int i;
338
339         i = netdev_lock_pos(dev->type);
340         lockdep_set_class_and_name(&dev->addr_list_lock,
341                                    &netdev_addr_lock_key[i],
342                                    netdev_lock_name[i]);
343 }
344 #else
345 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
346                                                  unsigned short dev_type)
347 {
348 }
349 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
350 {
351 }
352 #endif
353
354 /*******************************************************************************
355
356                 Protocol management and registration routines
357
358 *******************************************************************************/
359
360 /*
361  *      Add a protocol ID to the list. Now that the input handler is
362  *      smarter we can dispense with all the messy stuff that used to be
363  *      here.
364  *
365  *      BEWARE!!! Protocol handlers, mangling input packets,
366  *      MUST BE last in hash buckets and checking protocol handlers
367  *      MUST start from promiscuous ptype_all chain in net_bh.
368  *      It is true now, do not change it.
369  *      Explanation follows: if protocol handler, mangling packet, will
370  *      be the first on list, it is not able to sense, that packet
371  *      is cloned and should be copied-on-write, so that it will
372  *      change it and subsequent readers will get broken packet.
373  *                                                      --ANK (980803)
374  */
375
376 static inline struct list_head *ptype_head(const struct packet_type *pt)
377 {
378         if (pt->type == htons(ETH_P_ALL))
379                 return pt->dev ? &pt->dev->ptype_all : &ptype_all;
380         else
381                 return pt->dev ? &pt->dev->ptype_specific :
382                                  &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
383 }
384
385 /**
386  *      dev_add_pack - add packet handler
387  *      @pt: packet type declaration
388  *
389  *      Add a protocol handler to the networking stack. The passed &packet_type
390  *      is linked into kernel lists and may not be freed until it has been
391  *      removed from the kernel lists.
392  *
393  *      This call does not sleep, therefore it cannot
394  *      guarantee that all CPUs that are in the middle of receiving packets
395  *      will see the new packet type (until the next received packet).
396  */
397
398 void dev_add_pack(struct packet_type *pt)
399 {
400         struct list_head *head = ptype_head(pt);
401
402         spin_lock(&ptype_lock);
403         list_add_rcu(&pt->list, head);
404         spin_unlock(&ptype_lock);
405 }
406 EXPORT_SYMBOL(dev_add_pack);
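
/*
 * Illustrative sketch (not part of the original file): a minimal ETH_P_ALL
 * tap registered with dev_add_pack(). The names example_tap_rcv and
 * example_tap are hypothetical; a real user would later unregister with
 * dev_remove_pack(&example_tap).
 */
static int example_tap_rcv(struct sk_buff *skb, struct net_device *dev,
                           struct packet_type *pt, struct net_device *orig_dev)
{
        /* The tap owns this reference to the skb; consume it when done. */
        kfree_skb(skb);
        return NET_RX_SUCCESS;
}

static struct packet_type example_tap __read_mostly = {
        .type = htons(ETH_P_ALL),       /* see every protocol */
        .func = example_tap_rcv,
};

static void example_tap_register(void)
{
        dev_add_pack(&example_tap);
}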
407
408 /**
409  *      __dev_remove_pack        - remove packet handler
410  *      @pt: packet type declaration
411  *
412  *      Remove a protocol handler that was previously added to the kernel
413  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
414  *      from the kernel lists and can be freed or reused once this function
415  *      returns.
416  *
417  *      The packet type might still be in use by receivers
418  *      and must not be freed until after all the CPUs have gone
419  *      through a quiescent state.
420  */
421 void __dev_remove_pack(struct packet_type *pt)
422 {
423         struct list_head *head = ptype_head(pt);
424         struct packet_type *pt1;
425
426         spin_lock(&ptype_lock);
427
428         list_for_each_entry(pt1, head, list) {
429                 if (pt == pt1) {
430                         list_del_rcu(&pt->list);
431                         goto out;
432                 }
433         }
434
435         pr_warn("dev_remove_pack: %p not found\n", pt);
436 out:
437         spin_unlock(&ptype_lock);
438 }
439 EXPORT_SYMBOL(__dev_remove_pack);
440
441 /**
442  *      dev_remove_pack  - remove packet handler
443  *      @pt: packet type declaration
444  *
445  *      Remove a protocol handler that was previously added to the kernel
446  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
447  *      from the kernel lists and can be freed or reused once this function
448  *      returns.
449  *
450  *      This call sleeps to guarantee that no CPU is looking at the packet
451  *      type after return.
452  */
453 void dev_remove_pack(struct packet_type *pt)
454 {
455         __dev_remove_pack(pt);
456
457         synchronize_net();
458 }
459 EXPORT_SYMBOL(dev_remove_pack);
460
461
462 /**
463  *      dev_add_offload - register offload handlers
464  *      @po: protocol offload declaration
465  *
466  *      Add protocol offload handlers to the networking stack. The passed
467  *      &proto_offload is linked into kernel lists and may not be freed until
468  *      it has been removed from the kernel lists.
469  *
470  *      This call does not sleep, therefore it cannot
471  *      guarantee that all CPUs that are in the middle of receiving packets
472  *      will see the new offload handlers (until the next received packet).
473  */
474 void dev_add_offload(struct packet_offload *po)
475 {
476         struct packet_offload *elem;
477
478         spin_lock(&offload_lock);
479         list_for_each_entry(elem, &offload_base, list) {
480                 if (po->priority < elem->priority)
481                         break;
482         }
483         list_add_rcu(&po->list, elem->list.prev);
484         spin_unlock(&offload_lock);
485 }
486 EXPORT_SYMBOL(dev_add_offload);
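
/*
 * Illustrative sketch (not part of the original file): registering GRO/GSO
 * callbacks with dev_add_offload(). All example_* names are hypothetical and
 * the stubs below simply decline to segment or aggregate anything.
 */
static struct sk_buff *example_gso_segment(struct sk_buff *skb,
                                           netdev_features_t features)
{
        return ERR_PTR(-EOPNOTSUPP);            /* no software segmentation */
}

static struct sk_buff **example_gro_receive(struct sk_buff **head,
                                            struct sk_buff *skb)
{
        NAPI_GRO_CB(skb)->flush = 1;            /* never aggregate */
        return NULL;
}

static int example_gro_complete(struct sk_buff *skb, int nhoff)
{
        return 0;
}

static struct packet_offload example_offload __read_mostly = {
        .type = htons(ETH_P_IP),                /* hypothetical IPv4 hooks */
        .priority = 10,
        .callbacks = {
                .gso_segment  = example_gso_segment,
                .gro_receive  = example_gro_receive,
                .gro_complete = example_gro_complete,
        },
};

static void example_offload_register(void)
{
        dev_add_offload(&example_offload);
}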
487
488 /**
489  *      __dev_remove_offload     - remove offload handler
490  *      @po: packet offload declaration
491  *
492  *      Remove a protocol offload handler that was previously added to the
493  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
494  *      is removed from the kernel lists and can be freed or reused once this
495  *      function returns.
496  *
497  *      The packet type might still be in use by receivers
498  *      and must not be freed until after all the CPUs have gone
499  *      through a quiescent state.
500  */
501 static void __dev_remove_offload(struct packet_offload *po)
502 {
503         struct list_head *head = &offload_base;
504         struct packet_offload *po1;
505
506         spin_lock(&offload_lock);
507
508         list_for_each_entry(po1, head, list) {
509                 if (po == po1) {
510                         list_del_rcu(&po->list);
511                         goto out;
512                 }
513         }
514
515         pr_warn("dev_remove_offload: %p not found\n", po);
516 out:
517         spin_unlock(&offload_lock);
518 }
519
520 /**
521  *      dev_remove_offload       - remove packet offload handler
522  *      @po: packet offload declaration
523  *
524  *      Remove a packet offload handler that was previously added to the kernel
525  *      offload handlers by dev_add_offload(). The passed &offload_type is
526  *      removed from the kernel lists and can be freed or reused once this
527  *      function returns.
528  *
529  *      This call sleeps to guarantee that no CPU is looking at the packet
530  *      type after return.
531  */
532 void dev_remove_offload(struct packet_offload *po)
533 {
534         __dev_remove_offload(po);
535
536         synchronize_net();
537 }
538 EXPORT_SYMBOL(dev_remove_offload);
539
540 /******************************************************************************
541
542                       Device Boot-time Settings Routines
543
544 *******************************************************************************/
545
546 /* Boot time configuration table */
547 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
548
549 /**
550  *      netdev_boot_setup_add   - add new setup entry
551  *      @name: name of the device
552  *      @map: configured settings for the device
553  *
554  *      Adds a new setup entry to the dev_boot_setup list.  The function
555  *      returns 0 on error and 1 on success.  This is a generic routine for
556  *      all netdevices.
557  */
558 static int netdev_boot_setup_add(char *name, struct ifmap *map)
559 {
560         struct netdev_boot_setup *s;
561         int i;
562
563         s = dev_boot_setup;
564         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
565                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
566                         memset(s[i].name, 0, sizeof(s[i].name));
567                         strlcpy(s[i].name, name, IFNAMSIZ);
568                         memcpy(&s[i].map, map, sizeof(s[i].map));
569                         break;
570                 }
571         }
572
573         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
574 }
575
576 /**
577  *      netdev_boot_setup_check - check boot time settings
578  *      @dev: the netdevice
579  *
580  *      Check boot time settings for the device.
581  *      Any settings found are applied to the device for use
582  *      later during device probing.
583  *      Returns 0 if no settings are found, 1 if they are.
584  */
585 int netdev_boot_setup_check(struct net_device *dev)
586 {
587         struct netdev_boot_setup *s = dev_boot_setup;
588         int i;
589
590         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
591                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
592                     !strcmp(dev->name, s[i].name)) {
593                         dev->irq        = s[i].map.irq;
594                         dev->base_addr  = s[i].map.base_addr;
595                         dev->mem_start  = s[i].map.mem_start;
596                         dev->mem_end    = s[i].map.mem_end;
597                         return 1;
598                 }
599         }
600         return 0;
601 }
602 EXPORT_SYMBOL(netdev_boot_setup_check);
603
604
605 /**
606  *      netdev_boot_base        - get address from boot time settings
607  *      @prefix: prefix for network device
608  *      @unit: id for network device
609  *
610  *      Check boot time settings for the base address of the device.
611  *      Returns 1 if the device is already registered (to suppress probing),
612  *      the configured base address if a matching entry is found, and
613  *      0 if no settings are found.
614  */
615 unsigned long netdev_boot_base(const char *prefix, int unit)
616 {
617         const struct netdev_boot_setup *s = dev_boot_setup;
618         char name[IFNAMSIZ];
619         int i;
620
621         sprintf(name, "%s%d", prefix, unit);
622
623         /*
624          * If device already registered then return base of 1
625          * to indicate not to probe for this interface
626          */
627         if (__dev_get_by_name(&init_net, name))
628                 return 1;
629
630         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
631                 if (!strcmp(name, s[i].name))
632                         return s[i].map.base_addr;
633         return 0;
634 }
635
636 /*
637  * Saves at boot time configured settings for any netdevice.
638  */
639 int __init netdev_boot_setup(char *str)
640 {
641         int ints[5];
642         struct ifmap map;
643
644         str = get_options(str, ARRAY_SIZE(ints), ints);
645         if (!str || !*str)
646                 return 0;
647
648         /* Save settings */
649         memset(&map, 0, sizeof(map));
650         if (ints[0] > 0)
651                 map.irq = ints[1];
652         if (ints[0] > 1)
653                 map.base_addr = ints[2];
654         if (ints[0] > 2)
655                 map.mem_start = ints[3];
656         if (ints[0] > 3)
657                 map.mem_end = ints[4];
658
659         /* Add new entry to the list */
660         return netdev_boot_setup_add(str, &map);
661 }
662
663 __setup("netdev=", netdev_boot_setup);
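
/*
 * Worked example (derived from the parsing above, not part of the original
 * file): booting with "netdev=5,0x300,0,0,eth1" yields map.irq = 5,
 * map.base_addr = 0x300, map.mem_start = map.mem_end = 0, and the remaining
 * string "eth1" becomes the entry name passed to netdev_boot_setup_add().
 */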
664
665 /*******************************************************************************
666
667                             Device Interface Subroutines
668
669 *******************************************************************************/
670
671 /**
672  *      dev_get_iflink  - get 'iflink' value of an interface
673  *      @dev: targeted interface
674  *
675  *      Indicates the ifindex the interface is linked to.
676  *      Physical interfaces have the same 'ifindex' and 'iflink' values.
677  */
678
679 int dev_get_iflink(const struct net_device *dev)
680 {
681         if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
682                 return dev->netdev_ops->ndo_get_iflink(dev);
683
684         return dev->ifindex;
685 }
686 EXPORT_SYMBOL(dev_get_iflink);
687
688 /**
689  *      dev_fill_metadata_dst - Retrieve tunnel egress information.
690  *      @dev: targeted interface
691  *      @skb: The packet.
692  *
693  *      For better visibility of tunnel traffic, OVS needs to retrieve
694  *      egress tunnel information for a packet. The following API allows
695  *      the user to get this info.
696  */
697 int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
698 {
699         struct ip_tunnel_info *info;
700
701         if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
702                 return -EINVAL;
703
704         info = skb_tunnel_info_unclone(skb);
705         if (!info)
706                 return -ENOMEM;
707         if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
708                 return -EINVAL;
709
710         return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
711 }
712 EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
713
714 /**
715  *      __dev_get_by_name       - find a device by its name
716  *      @net: the applicable net namespace
717  *      @name: name to find
718  *
719  *      Find an interface by name. Must be called under RTNL semaphore
720  *      or @dev_base_lock. If the name is found a pointer to the device
721  *      is returned. If the name is not found then %NULL is returned. The
722  *      reference counters are not incremented so the caller must be
723  *      careful with locks.
724  */
725
726 struct net_device *__dev_get_by_name(struct net *net, const char *name)
727 {
728         struct net_device *dev;
729         struct hlist_head *head = dev_name_hash(net, name);
730
731         hlist_for_each_entry(dev, head, name_hlist)
732                 if (!strncmp(dev->name, name, IFNAMSIZ))
733                         return dev;
734
735         return NULL;
736 }
737 EXPORT_SYMBOL(__dev_get_by_name);
738
739 /**
740  *      dev_get_by_name_rcu     - find a device by its name
741  *      @net: the applicable net namespace
742  *      @name: name to find
743  *
744  *      Find an interface by name.
745  *      If the name is found a pointer to the device is returned.
746  *      If the name is not found then %NULL is returned.
747  *      The reference counters are not incremented so the caller must be
748  *      careful with locks. The caller must hold RCU lock.
749  */
750
751 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
752 {
753         struct net_device *dev;
754         struct hlist_head *head = dev_name_hash(net, name);
755
756         hlist_for_each_entry_rcu(dev, head, name_hlist)
757                 if (!strncmp(dev->name, name, IFNAMSIZ))
758                         return dev;
759
760         return NULL;
761 }
762 EXPORT_SYMBOL(dev_get_by_name_rcu);
763
764 /**
765  *      dev_get_by_name         - find a device by its name
766  *      @net: the applicable net namespace
767  *      @name: name to find
768  *
769  *      Find an interface by name. This can be called from any
770  *      context and does its own locking. The returned handle has
771  *      the usage count incremented and the caller must use dev_put() to
772  *      release it when it is no longer needed. %NULL is returned if no
773  *      matching device is found.
774  */
775
776 struct net_device *dev_get_by_name(struct net *net, const char *name)
777 {
778         struct net_device *dev;
779
780         rcu_read_lock();
781         dev = dev_get_by_name_rcu(net, name);
782         if (dev)
783                 dev_hold(dev);
784         rcu_read_unlock();
785         return dev;
786 }
787 EXPORT_SYMBOL(dev_get_by_name);
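
/*
 * Illustrative sketch (not part of the original file): the hold/put pattern
 * described above. The function name example_report_mtu is hypothetical.
 */
static int example_report_mtu(struct net *net, const char *name)
{
        struct net_device *dev;

        dev = dev_get_by_name(net, name);
        if (!dev)
                return -ENODEV;

        netdev_info(dev, "mtu is %u\n", dev->mtu);

        dev_put(dev);           /* release the reference taken by the lookup */
        return 0;
}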
788
789 /**
790  *      __dev_get_by_index - find a device by its ifindex
791  *      @net: the applicable net namespace
792  *      @ifindex: index of device
793  *
794  *      Search for an interface by index. Returns a pointer to the device,
795  *      or %NULL if the device is not found. The device has not
796  *      had its reference counter increased so the caller must be careful
797  *      about locking. The caller must hold either the RTNL semaphore
798  *      or @dev_base_lock.
799  */
800
801 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
802 {
803         struct net_device *dev;
804         struct hlist_head *head = dev_index_hash(net, ifindex);
805
806         hlist_for_each_entry(dev, head, index_hlist)
807                 if (dev->ifindex == ifindex)
808                         return dev;
809
810         return NULL;
811 }
812 EXPORT_SYMBOL(__dev_get_by_index);
813
814 /**
815  *      dev_get_by_index_rcu - find a device by its ifindex
816  *      @net: the applicable net namespace
817  *      @ifindex: index of device
818  *
819  *      Search for an interface by index. Returns a pointer to the device,
820  *      or %NULL if the device is not found. The device has not
821  *      had its reference counter increased so the caller must be careful
822  *      about locking. The caller must hold RCU lock.
823  */
824
825 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
826 {
827         struct net_device *dev;
828         struct hlist_head *head = dev_index_hash(net, ifindex);
829
830         hlist_for_each_entry_rcu(dev, head, index_hlist)
831                 if (dev->ifindex == ifindex)
832                         return dev;
833
834         return NULL;
835 }
836 EXPORT_SYMBOL(dev_get_by_index_rcu);
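
/*
 * Illustrative sketch (not part of the original file): a lookup that only
 * needs the device inside an RCU read-side critical section, so no reference
 * is taken. The function name example_ifindex_to_mtu is hypothetical.
 */
static unsigned int example_ifindex_to_mtu(struct net *net, int ifindex)
{
        struct net_device *dev;
        unsigned int mtu = 0;

        rcu_read_lock();
        dev = dev_get_by_index_rcu(net, ifindex);
        if (dev)
                mtu = dev->mtu;
        rcu_read_unlock();

        return mtu;
}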
837
838
839 /**
840  *      dev_get_by_index - find a device by its ifindex
841  *      @net: the applicable net namespace
842  *      @ifindex: index of device
843  *
844  *      Search for an interface by index. Returns a pointer to the device,
845  *      or NULL if the device is not found. The device returned has
846  *      had a reference added and the pointer is safe until the user calls
847  *      dev_put to indicate they have finished with it.
848  */
849
850 struct net_device *dev_get_by_index(struct net *net, int ifindex)
851 {
852         struct net_device *dev;
853
854         rcu_read_lock();
855         dev = dev_get_by_index_rcu(net, ifindex);
856         if (dev)
857                 dev_hold(dev);
858         rcu_read_unlock();
859         return dev;
860 }
861 EXPORT_SYMBOL(dev_get_by_index);
862
863 /**
864  *      netdev_get_name - get a netdevice name, knowing its ifindex.
865  *      @net: network namespace
866  *      @name: a pointer to the buffer where the name will be stored.
867  *      @ifindex: the ifindex of the interface to get the name from.
868  *
869  *      The use of raw_seqcount_begin() and cond_resched() before
870  *      retrying is required as we want to give the writers a chance
871  *      to complete when CONFIG_PREEMPT is not set.
872  */
873 int netdev_get_name(struct net *net, char *name, int ifindex)
874 {
875         struct net_device *dev;
876         unsigned int seq;
877
878 retry:
879         seq = raw_seqcount_begin(&devnet_rename_seq);
880         rcu_read_lock();
881         dev = dev_get_by_index_rcu(net, ifindex);
882         if (!dev) {
883                 rcu_read_unlock();
884                 return -ENODEV;
885         }
886
887         strcpy(name, dev->name);
888         rcu_read_unlock();
889         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
890                 cond_resched();
891                 goto retry;
892         }
893
894         return 0;
895 }
896
897 /**
898  *      dev_getbyhwaddr_rcu - find a device by its hardware address
899  *      @net: the applicable net namespace
900  *      @type: media type of device
901  *      @ha: hardware address
902  *
903  *      Search for an interface by MAC address. Returns a pointer to the
904  *      device, or NULL if the device is not found.
905  *      The caller must hold RCU or RTNL.
906  *      The returned device has not had its ref count increased
907  *      and the caller must therefore be careful about locking
908  *
909  */
910
911 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
912                                        const char *ha)
913 {
914         struct net_device *dev;
915
916         for_each_netdev_rcu(net, dev)
917                 if (dev->type == type &&
918                     !memcmp(dev->dev_addr, ha, dev->addr_len))
919                         return dev;
920
921         return NULL;
922 }
923 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
924
925 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
926 {
927         struct net_device *dev;
928
929         ASSERT_RTNL();
930         for_each_netdev(net, dev)
931                 if (dev->type == type)
932                         return dev;
933
934         return NULL;
935 }
936 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
937
938 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
939 {
940         struct net_device *dev, *ret = NULL;
941
942         rcu_read_lock();
943         for_each_netdev_rcu(net, dev)
944                 if (dev->type == type) {
945                         dev_hold(dev);
946                         ret = dev;
947                         break;
948                 }
949         rcu_read_unlock();
950         return ret;
951 }
952 EXPORT_SYMBOL(dev_getfirstbyhwtype);
953
954 /**
955  *      __dev_get_by_flags - find any device with given flags
956  *      @net: the applicable net namespace
957  *      @if_flags: IFF_* values
958  *      @mask: bitmask of bits in if_flags to check
959  *
960  *      Search for any interface with the given flags. Returns a pointer to
961  *      the device, or NULL if no device is found. Must be called inside
962  *      rtnl_lock(), and result refcount is unchanged.
963  */
964
965 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
966                                       unsigned short mask)
967 {
968         struct net_device *dev, *ret;
969
970         ASSERT_RTNL();
971
972         ret = NULL;
973         for_each_netdev(net, dev) {
974                 if (((dev->flags ^ if_flags) & mask) == 0) {
975                         ret = dev;
976                         break;
977                 }
978         }
979         return ret;
980 }
981 EXPORT_SYMBOL(__dev_get_by_flags);
982
983 /**
984  *      dev_valid_name - check if name is okay for network device
985  *      @name: name string
986  *
987  *      Network device names need to be valid file names
988  *      to allow sysfs to work.  We also disallow any kind of
989  *      whitespace.
990  */
991 bool dev_valid_name(const char *name)
992 {
993         if (*name == '\0')
994                 return false;
995         if (strlen(name) >= IFNAMSIZ)
996                 return false;
997         if (!strcmp(name, ".") || !strcmp(name, ".."))
998                 return false;
999
1000         while (*name) {
1001                 if (*name == '/' || *name == ':' || isspace(*name))
1002                         return false;
1003                 name++;
1004         }
1005         return true;
1006 }
1007 EXPORT_SYMBOL(dev_valid_name);
1008
1009 /**
1010  *      __dev_alloc_name - allocate a name for a device
1011  *      @net: network namespace to allocate the device name in
1012  *      @name: name format string
1013  *      @buf:  scratch buffer and result name string
1014  *
1015  *      Passed a format string - eg "lt%d" - it will try to find a suitable
1016  *      id. It scans the list of devices to build up a free map, then chooses
1017  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1018  *      while allocating the name and adding the device in order to avoid
1019  *      duplicates.
1020  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1021  *      Returns the number of the unit assigned or a negative errno code.
1022  */
1023
1024 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1025 {
1026         int i = 0;
1027         const char *p;
1028         const int max_netdevices = 8*PAGE_SIZE;
1029         unsigned long *inuse;
1030         struct net_device *d;
1031
1032         p = strnchr(name, IFNAMSIZ-1, '%');
1033         if (p) {
1034                 /*
1035                  * Verify the string as this thing may have come from
1036                  * the user.  There must be exactly one "%d" and no other "%"
1037                  * characters.
1038                  */
1039                 if (p[1] != 'd' || strchr(p + 2, '%'))
1040                         return -EINVAL;
1041
1042                 /* Use one page as a bit array of possible slots */
1043                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1044                 if (!inuse)
1045                         return -ENOMEM;
1046
1047                 for_each_netdev(net, d) {
1048                         if (!sscanf(d->name, name, &i))
1049                                 continue;
1050                         if (i < 0 || i >= max_netdevices)
1051                                 continue;
1052
1053                         /*  avoid cases where sscanf is not exact inverse of printf */
1054                         snprintf(buf, IFNAMSIZ, name, i);
1055                         if (!strncmp(buf, d->name, IFNAMSIZ))
1056                                 set_bit(i, inuse);
1057                 }
1058
1059                 i = find_first_zero_bit(inuse, max_netdevices);
1060                 free_page((unsigned long) inuse);
1061         }
1062
1063         if (buf != name)
1064                 snprintf(buf, IFNAMSIZ, name, i);
1065         if (!__dev_get_by_name(net, buf))
1066                 return i;
1067
1068         /* It is possible to run out of possible slots
1069          * when the name is long and there isn't enough space left
1070          * for the digits, or if all bits are used.
1071          */
1072         return -ENFILE;
1073 }
1074
1075 /**
1076  *      dev_alloc_name - allocate a name for a device
1077  *      @dev: device
1078  *      @name: name format string
1079  *
1080  *      Passed a format string - eg "lt%d" - it will try to find a suitable
1081  *      id. It scans the list of devices to build up a free map, then chooses
1082  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1083  *      while allocating the name and adding the device in order to avoid
1084  *      duplicates.
1085  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1086  *      Returns the number of the unit assigned or a negative errno code.
1087  */
1088
1089 int dev_alloc_name(struct net_device *dev, const char *name)
1090 {
1091         char buf[IFNAMSIZ];
1092         struct net *net;
1093         int ret;
1094
1095         BUG_ON(!dev_net(dev));
1096         net = dev_net(dev);
1097         ret = __dev_alloc_name(net, name, buf);
1098         if (ret >= 0)
1099                 strlcpy(dev->name, buf, IFNAMSIZ);
1100         return ret;
1101 }
1102 EXPORT_SYMBOL(dev_alloc_name);
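
/*
 * Illustrative sketch (not part of the original file): picking the next free
 * "example%d" style name before registration, under RTNL as required above.
 * The format string and function name are hypothetical.
 */
static int example_pick_name(struct net_device *dev)
{
        int unit;

        ASSERT_RTNL();

        unit = dev_alloc_name(dev, "example%d");
        if (unit < 0)
                return unit;    /* -EINVAL or -ENFILE from __dev_alloc_name() */

        /* dev->name now holds e.g. "example0"; unit is the number chosen */
        return 0;
}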
1103
1104 static int dev_alloc_name_ns(struct net *net,
1105                              struct net_device *dev,
1106                              const char *name)
1107 {
1108         char buf[IFNAMSIZ];
1109         int ret;
1110
1111         ret = __dev_alloc_name(net, name, buf);
1112         if (ret >= 0)
1113                 strlcpy(dev->name, buf, IFNAMSIZ);
1114         return ret;
1115 }
1116
1117 static int dev_get_valid_name(struct net *net,
1118                               struct net_device *dev,
1119                               const char *name)
1120 {
1121         BUG_ON(!net);
1122
1123         if (!dev_valid_name(name))
1124                 return -EINVAL;
1125
1126         if (strchr(name, '%'))
1127                 return dev_alloc_name_ns(net, dev, name);
1128         else if (__dev_get_by_name(net, name))
1129                 return -EEXIST;
1130         else if (dev->name != name)
1131                 strlcpy(dev->name, name, IFNAMSIZ);
1132
1133         return 0;
1134 }
1135
1136 /**
1137  *      dev_change_name - change name of a device
1138  *      @dev: device
1139  *      @newname: name (or format string) must be at least IFNAMSIZ
1140  *
1141  *      Change the name of a device. Format strings such as "eth%d"
1142  *      can be passed for wildcarding.
1143  */
1144 int dev_change_name(struct net_device *dev, const char *newname)
1145 {
1146         unsigned char old_assign_type;
1147         char oldname[IFNAMSIZ];
1148         int err = 0;
1149         int ret;
1150         struct net *net;
1151
1152         ASSERT_RTNL();
1153         BUG_ON(!dev_net(dev));
1154
1155         net = dev_net(dev);
1156         if (dev->flags & IFF_UP)
1157                 return -EBUSY;
1158
1159         write_seqcount_begin(&devnet_rename_seq);
1160
1161         if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1162                 write_seqcount_end(&devnet_rename_seq);
1163                 return 0;
1164         }
1165
1166         memcpy(oldname, dev->name, IFNAMSIZ);
1167
1168         err = dev_get_valid_name(net, dev, newname);
1169         if (err < 0) {
1170                 write_seqcount_end(&devnet_rename_seq);
1171                 return err;
1172         }
1173
1174         if (oldname[0] && !strchr(oldname, '%'))
1175                 netdev_info(dev, "renamed from %s\n", oldname);
1176
1177         old_assign_type = dev->name_assign_type;
1178         dev->name_assign_type = NET_NAME_RENAMED;
1179
1180 rollback:
1181         ret = device_rename(&dev->dev, dev->name);
1182         if (ret) {
1183                 memcpy(dev->name, oldname, IFNAMSIZ);
1184                 dev->name_assign_type = old_assign_type;
1185                 write_seqcount_end(&devnet_rename_seq);
1186                 return ret;
1187         }
1188
1189         write_seqcount_end(&devnet_rename_seq);
1190
1191         netdev_adjacent_rename_links(dev, oldname);
1192
1193         write_lock_bh(&dev_base_lock);
1194         hlist_del_rcu(&dev->name_hlist);
1195         write_unlock_bh(&dev_base_lock);
1196
1197         synchronize_rcu();
1198
1199         write_lock_bh(&dev_base_lock);
1200         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1201         write_unlock_bh(&dev_base_lock);
1202
1203         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1204         ret = notifier_to_errno(ret);
1205
1206         if (ret) {
1207                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1208                 if (err >= 0) {
1209                         err = ret;
1210                         write_seqcount_begin(&devnet_rename_seq);
1211                         memcpy(dev->name, oldname, IFNAMSIZ);
1212                         memcpy(oldname, newname, IFNAMSIZ);
1213                         dev->name_assign_type = old_assign_type;
1214                         old_assign_type = NET_NAME_RENAMED;
1215                         goto rollback;
1216                 } else {
1217                         pr_err("%s: name change rollback failed: %d\n",
1218                                dev->name, ret);
1219                 }
1220         }
1221
1222         return err;
1223 }
1224
1225 /**
1226  *      dev_set_alias - change ifalias of a device
1227  *      @dev: device
1228  *      @alias: name up to IFALIASZ
1229  *      @len: limit of bytes to copy from info
1230  *
1231  *      Set ifalias for a device.
1232  */
1233 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1234 {
1235         char *new_ifalias;
1236
1237         ASSERT_RTNL();
1238
1239         if (len >= IFALIASZ)
1240                 return -EINVAL;
1241
1242         if (!len) {
1243                 kfree(dev->ifalias);
1244                 dev->ifalias = NULL;
1245                 return 0;
1246         }
1247
1248         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1249         if (!new_ifalias)
1250                 return -ENOMEM;
1251         dev->ifalias = new_ifalias;
1252
1253         strlcpy(dev->ifalias, alias, len+1);
1254         return len;
1255 }
1256
1257
1258 /**
1259  *      netdev_features_change - device changes features
1260  *      @dev: device to cause notification
1261  *
1262  *      Called to indicate a device has changed features.
1263  */
1264 void netdev_features_change(struct net_device *dev)
1265 {
1266         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1267 }
1268 EXPORT_SYMBOL(netdev_features_change);
1269
1270 /**
1271  *      netdev_state_change - device changes state
1272  *      @dev: device to cause notification
1273  *
1274  *      Called to indicate a device has changed state. This function calls
1275  *      the notifier chains for netdev_chain and sends a NEWLINK message
1276  *      to the routing socket.
1277  */
1278 void netdev_state_change(struct net_device *dev)
1279 {
1280         if (dev->flags & IFF_UP) {
1281                 struct netdev_notifier_change_info change_info;
1282
1283                 change_info.flags_changed = 0;
1284                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1285                                               &change_info.info);
1286                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1287         }
1288 }
1289 EXPORT_SYMBOL(netdev_state_change);
1290
1291 /**
1292  *      netdev_notify_peers - notify network peers about existence of @dev
1293  *      @dev: network device
1294  *
1295  * Generate traffic such that interested network peers are aware of
1296  * @dev, such as by generating a gratuitous ARP. This may be used when
1297  * a device wants to inform the rest of the network about some sort of
1298  * reconfiguration such as a failover event or virtual machine
1299  * migration.
1300  */
1301 void netdev_notify_peers(struct net_device *dev)
1302 {
1303         rtnl_lock();
1304         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1305         rtnl_unlock();
1306 }
1307 EXPORT_SYMBOL(netdev_notify_peers);
1308
1309 static int __dev_open(struct net_device *dev)
1310 {
1311         const struct net_device_ops *ops = dev->netdev_ops;
1312         int ret;
1313
1314         ASSERT_RTNL();
1315
1316         if (!netif_device_present(dev))
1317                 return -ENODEV;
1318
1319         /* Block netpoll from trying to do any rx path servicing.
1320          * If we don't do this there is a chance ndo_poll_controller
1321          * or ndo_poll may be running while we open the device
1322          */
1323         netpoll_poll_disable(dev);
1324
1325         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1326         ret = notifier_to_errno(ret);
1327         if (ret)
1328                 return ret;
1329
1330         set_bit(__LINK_STATE_START, &dev->state);
1331
1332         if (ops->ndo_validate_addr)
1333                 ret = ops->ndo_validate_addr(dev);
1334
1335         if (!ret && ops->ndo_open)
1336                 ret = ops->ndo_open(dev);
1337
1338         netpoll_poll_enable(dev);
1339
1340         if (ret)
1341                 clear_bit(__LINK_STATE_START, &dev->state);
1342         else {
1343                 dev->flags |= IFF_UP;
1344                 dev_set_rx_mode(dev);
1345                 dev_activate(dev);
1346                 add_device_randomness(dev->dev_addr, dev->addr_len);
1347         }
1348
1349         return ret;
1350 }
1351
1352 /**
1353  *      dev_open        - prepare an interface for use.
1354  *      @dev:   device to open
1355  *
1356  *      Takes a device from down to up state. The device's private open
1357  *      function is invoked and then the multicast lists are loaded. Finally
1358  *      the device is moved into the up state and a %NETDEV_UP message is
1359  *      sent to the netdev notifier chain.
1360  *
1361  *      Calling this function on an active interface is a nop. On a failure
1362  *      a negative errno code is returned.
1363  */
1364 int dev_open(struct net_device *dev)
1365 {
1366         int ret;
1367
1368         if (dev->flags & IFF_UP)
1369                 return 0;
1370
1371         ret = __dev_open(dev);
1372         if (ret < 0)
1373                 return ret;
1374
1375         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1376         call_netdevice_notifiers(NETDEV_UP, dev);
1377
1378         return ret;
1379 }
1380 EXPORT_SYMBOL(dev_open);
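
/*
 * Illustrative sketch (not part of the original file): bringing an interface
 * up from kernel code. dev_open() must run under RTNL, which the caller takes
 * here; example_bring_up is a hypothetical name.
 */
static int example_bring_up(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_open(dev);    /* no-op (returns 0) if already up */
        rtnl_unlock();

        return err;
}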
1381
1382 static int __dev_close_many(struct list_head *head)
1383 {
1384         struct net_device *dev;
1385
1386         ASSERT_RTNL();
1387         might_sleep();
1388
1389         list_for_each_entry(dev, head, close_list) {
1390                 /* Temporarily disable netpoll until the interface is down */
1391                 netpoll_poll_disable(dev);
1392
1393                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1394
1395                 clear_bit(__LINK_STATE_START, &dev->state);
1396
1397                 /* Synchronize to scheduled poll. We cannot touch the poll list, it
1398                  * can even be on a different cpu. So just clear netif_running().
1399                  *
1400                  * dev->stop() will invoke napi_disable() on all of its
1401                  * napi_struct instances on this device.
1402                  */
1403                 smp_mb__after_atomic(); /* Commit netif_running(). */
1404         }
1405
1406         dev_deactivate_many(head);
1407
1408         list_for_each_entry(dev, head, close_list) {
1409                 const struct net_device_ops *ops = dev->netdev_ops;
1410
1411                 /*
1412                  *      Call the device specific close. This cannot fail.
1413                  *      It is only called if the device is UP.
1414                  *
1415                  *      We allow it to be called even after a DETACH hot-plug
1416                  *      event.
1417                  */
1418                 if (ops->ndo_stop)
1419                         ops->ndo_stop(dev);
1420
1421                 dev->flags &= ~IFF_UP;
1422                 netpoll_poll_enable(dev);
1423         }
1424
1425         return 0;
1426 }
1427
1428 static int __dev_close(struct net_device *dev)
1429 {
1430         int retval;
1431         LIST_HEAD(single);
1432
1433         list_add(&dev->close_list, &single);
1434         retval = __dev_close_many(&single);
1435         list_del(&single);
1436
1437         return retval;
1438 }
1439
1440 int dev_close_many(struct list_head *head, bool unlink)
1441 {
1442         struct net_device *dev, *tmp;
1443
1444         /* Remove the devices that don't need to be closed */
1445         list_for_each_entry_safe(dev, tmp, head, close_list)
1446                 if (!(dev->flags & IFF_UP))
1447                         list_del_init(&dev->close_list);
1448
1449         __dev_close_many(head);
1450
1451         list_for_each_entry_safe(dev, tmp, head, close_list) {
1452                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1453                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1454                 if (unlink)
1455                         list_del_init(&dev->close_list);
1456         }
1457
1458         return 0;
1459 }
1460 EXPORT_SYMBOL(dev_close_many);
1461
1462 /**
1463  *      dev_close - shutdown an interface.
1464  *      @dev: device to shutdown
1465  *
1466  *      This function moves an active device into down state. A
1467  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1468  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1469  *      chain.
1470  */
1471 int dev_close(struct net_device *dev)
1472 {
1473         if (dev->flags & IFF_UP) {
1474                 LIST_HEAD(single);
1475
1476                 list_add(&dev->close_list, &single);
1477                 dev_close_many(&single, true);
1478                 list_del(&single);
1479         }
1480         return 0;
1481 }
1482 EXPORT_SYMBOL(dev_close);
1483
1484
1485 /**
1486  *      dev_disable_lro - disable Large Receive Offload on a device
1487  *      @dev: device
1488  *
1489  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1490  *      called under RTNL.  This is needed if received packets may be
1491  *      forwarded to another interface.
1492  */
1493 void dev_disable_lro(struct net_device *dev)
1494 {
1495         struct net_device *lower_dev;
1496         struct list_head *iter;
1497
1498         dev->wanted_features &= ~NETIF_F_LRO;
1499         netdev_update_features(dev);
1500
1501         if (unlikely(dev->features & NETIF_F_LRO))
1502                 netdev_WARN(dev, "failed to disable LRO!\n");
1503
1504         netdev_for_each_lower_dev(dev, lower_dev, iter)
1505                 dev_disable_lro(lower_dev);
1506 }
1507 EXPORT_SYMBOL(dev_disable_lro);
1508
1509 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1510                                    struct net_device *dev)
1511 {
1512         struct netdev_notifier_info info;
1513
1514         netdev_notifier_info_init(&info, dev);
1515         return nb->notifier_call(nb, val, &info);
1516 }
1517
1518 static int dev_boot_phase = 1;
1519
1520 /**
1521  *      register_netdevice_notifier - register a network notifier block
1522  *      @nb: notifier
1523  *
1524  *      Register a notifier to be called when network device events occur.
1525  *      The notifier passed is linked into the kernel structures and must
1526  *      not be reused until it has been unregistered. A negative errno code
1527  *      is returned on a failure.
1528  *
1529  *      When registered, all registration and up events are replayed
1530  *      to the new notifier to allow the device to have a race-free
1531  *      view of the network device list.
1532  */
1533
1534 int register_netdevice_notifier(struct notifier_block *nb)
1535 {
1536         struct net_device *dev;
1537         struct net_device *last;
1538         struct net *net;
1539         int err;
1540
1541         rtnl_lock();
1542         err = raw_notifier_chain_register(&netdev_chain, nb);
1543         if (err)
1544                 goto unlock;
1545         if (dev_boot_phase)
1546                 goto unlock;
1547         for_each_net(net) {
1548                 for_each_netdev(net, dev) {
1549                         err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1550                         err = notifier_to_errno(err);
1551                         if (err)
1552                                 goto rollback;
1553
1554                         if (!(dev->flags & IFF_UP))
1555                                 continue;
1556
1557                         call_netdevice_notifier(nb, NETDEV_UP, dev);
1558                 }
1559         }
1560
1561 unlock:
1562         rtnl_unlock();
1563         return err;
1564
1565 rollback:
1566         last = dev;
1567         for_each_net(net) {
1568                 for_each_netdev(net, dev) {
1569                         if (dev == last)
1570                                 goto outroll;
1571
1572                         if (dev->flags & IFF_UP) {
1573                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1574                                                         dev);
1575                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1576                         }
1577                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1578                 }
1579         }
1580
1581 outroll:
1582         raw_notifier_chain_unregister(&netdev_chain, nb);
1583         goto unlock;
1584 }
1585 EXPORT_SYMBOL(register_netdevice_notifier);
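/*
 * Illustrative sketch (not part of dev.c): a minimal notifier user.
 * The function and variable names here are hypothetical.
 *
 *      static int my_netdev_event(struct notifier_block *nb,
 *                                 unsigned long event, void *ptr)
 *      {
 *              struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *              if (event == NETDEV_UP)
 *                      pr_info("%s is up\n", dev->name);
 *              return NOTIFY_DONE;
 *      }
 *
 *      static struct notifier_block my_netdev_nb = {
 *              .notifier_call = my_netdev_event,
 *      };
 *
 *      register_netdevice_notifier(&my_netdev_nb);
 *
 * Because registration and up events are replayed, the notifier also sees
 * devices that already existed before it was registered.
 */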
1586
1587 /**
1588  *      unregister_netdevice_notifier - unregister a network notifier block
1589  *      @nb: notifier
1590  *
1591  *      Unregister a notifier previously registered by
1592  *      register_netdevice_notifier(). The notifier is unlinked from the
1593  *      kernel structures and may then be reused. A negative errno code
1594  *      is returned on a failure.
1595  *
1596  *      After unregistering, unregister and down device events are synthesized
1597  *      for all devices on the device list to the removed notifier to remove
1598  *      the need for special case cleanup code.
1599  */
1600
1601 int unregister_netdevice_notifier(struct notifier_block *nb)
1602 {
1603         struct net_device *dev;
1604         struct net *net;
1605         int err;
1606
1607         rtnl_lock();
1608         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1609         if (err)
1610                 goto unlock;
1611
1612         for_each_net(net) {
1613                 for_each_netdev(net, dev) {
1614                         if (dev->flags & IFF_UP) {
1615                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1616                                                         dev);
1617                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1618                         }
1619                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1620                 }
1621         }
1622 unlock:
1623         rtnl_unlock();
1624         return err;
1625 }
1626 EXPORT_SYMBOL(unregister_netdevice_notifier);
1627
1628 /**
1629  *      call_netdevice_notifiers_info - call all network notifier blocks
1630  *      @val: value passed unmodified to notifier function
1631  *      @dev: net_device pointer passed unmodified to notifier function
1632  *      @info: notifier information data
1633  *
1634  *      Call all network notifier blocks.  Parameters and return value
1635  *      are as for raw_notifier_call_chain().
1636  */
1637
1638 static int call_netdevice_notifiers_info(unsigned long val,
1639                                          struct net_device *dev,
1640                                          struct netdev_notifier_info *info)
1641 {
1642         ASSERT_RTNL();
1643         netdev_notifier_info_init(info, dev);
1644         return raw_notifier_call_chain(&netdev_chain, val, info);
1645 }
1646
1647 /**
1648  *      call_netdevice_notifiers - call all network notifier blocks
1649  *      @val: value passed unmodified to notifier function
1650  *      @dev: net_device pointer passed unmodified to notifier function
1651  *
1652  *      Call all network notifier blocks.  Parameters and return value
1653  *      are as for raw_notifier_call_chain().
1654  */
1655
1656 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1657 {
1658         struct netdev_notifier_info info;
1659
1660         return call_netdevice_notifiers_info(val, dev, &info);
1661 }
1662 EXPORT_SYMBOL(call_netdevice_notifiers);
1663
1664 #ifdef CONFIG_NET_INGRESS
1665 static struct static_key ingress_needed __read_mostly;
1666
1667 void net_inc_ingress_queue(void)
1668 {
1669         static_key_slow_inc(&ingress_needed);
1670 }
1671 EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
1672
1673 void net_dec_ingress_queue(void)
1674 {
1675         static_key_slow_dec(&ingress_needed);
1676 }
1677 EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
1678 #endif
1679
1680 #ifdef CONFIG_NET_EGRESS
1681 static struct static_key egress_needed __read_mostly;
1682
1683 void net_inc_egress_queue(void)
1684 {
1685         static_key_slow_inc(&egress_needed);
1686 }
1687 EXPORT_SYMBOL_GPL(net_inc_egress_queue);
1688
1689 void net_dec_egress_queue(void)
1690 {
1691         static_key_slow_dec(&egress_needed);
1692 }
1693 EXPORT_SYMBOL_GPL(net_dec_egress_queue);
1694 #endif
1695
1696 static struct static_key netstamp_needed __read_mostly;
1697 #ifdef HAVE_JUMP_LABEL
1698 /* We are not allowed to call static_key_slow_dec() from irq context.
1699  * If net_disable_timestamp() is called from irq context, defer the
1700  * static_key_slow_dec() calls.
1701  */
1702 static atomic_t netstamp_needed_deferred;
1703 #endif
1704
1705 void net_enable_timestamp(void)
1706 {
1707 #ifdef HAVE_JUMP_LABEL
1708         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1709
1710         if (deferred) {
1711                 while (--deferred)
1712                         static_key_slow_dec(&netstamp_needed);
1713                 return;
1714         }
1715 #endif
1716         static_key_slow_inc(&netstamp_needed);
1717 }
1718 EXPORT_SYMBOL(net_enable_timestamp);
1719
1720 void net_disable_timestamp(void)
1721 {
1722 #ifdef HAVE_JUMP_LABEL
1723         if (in_interrupt()) {
1724                 atomic_inc(&netstamp_needed_deferred);
1725                 return;
1726         }
1727 #endif
1728         static_key_slow_dec(&netstamp_needed);
1729 }
1730 EXPORT_SYMBOL(net_disable_timestamp);
1731
1732 static inline void net_timestamp_set(struct sk_buff *skb)
1733 {
1734         skb->tstamp.tv64 = 0;
1735         if (static_key_false(&netstamp_needed))
1736                 __net_timestamp(skb);
1737 }
1738
1739 #define net_timestamp_check(COND, SKB)                  \
1740         if (static_key_false(&netstamp_needed)) {               \
1741                 if ((COND) && !(SKB)->tstamp.tv64)      \
1742                         __net_timestamp(SKB);           \
1743         }                                               \
1744
1745 bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
1746 {
1747         unsigned int len;
1748
1749         if (!(dev->flags & IFF_UP))
1750                 return false;
1751
1752         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1753         if (skb->len <= len)
1754                 return true;
1755
1756         /* if TSO is enabled, we don't care about the length as the packet
1757          * could be forwarded without being segmented first
1758          */
1759         if (skb_is_gso(skb))
1760                 return true;
1761
1762         return false;
1763 }
1764 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1765
1766 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1767 {
1768         if (skb_orphan_frags(skb, GFP_ATOMIC) ||
1769             unlikely(!is_skb_forwardable(dev, skb))) {
1770                 atomic_long_inc(&dev->rx_dropped);
1771                 kfree_skb(skb);
1772                 return NET_RX_DROP;
1773         }
1774
1775         skb_scrub_packet(skb, true);
1776         skb->priority = 0;
1777         skb->protocol = eth_type_trans(skb, dev);
1778         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1779
1780         return 0;
1781 }
1782 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1783
1784 /**
1785  * dev_forward_skb - loopback an skb to another netif
1786  *
1787  * @dev: destination network device
1788  * @skb: buffer to forward
1789  *
1790  * return values:
1791  *      NET_RX_SUCCESS  (no congestion)
1792  *      NET_RX_DROP     (packet was dropped, but freed)
1793  *
1794  * dev_forward_skb can be used for injecting an skb from the
1795  * start_xmit function of one device into the receive queue
1796  * of another device.
1797  *
1798  * The receiving device may be in another namespace, so
1799  * we have to clear all information in the skb that could
1800  * impact namespace isolation.
1801  */
1802 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1803 {
1804         return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1805 }
1806 EXPORT_SYMBOL_GPL(dev_forward_skb);
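/*
 * Illustrative sketch (not part of dev.c): a pair-style driver's
 * ndo_start_xmit may hand the skb to its peer's receive path.
 * "peer" is a hypothetical destination device held under RCU.
 *
 *      rcu_read_lock();
 *      if (dev_forward_skb(peer, skb) != NET_RX_SUCCESS)
 *              dev->stats.tx_dropped++;
 *      rcu_read_unlock();
 *
 * On NET_RX_DROP the skb has already been freed by dev_forward_skb(),
 * so the caller must not touch it afterwards.
 */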
1807
1808 static inline int deliver_skb(struct sk_buff *skb,
1809                               struct packet_type *pt_prev,
1810                               struct net_device *orig_dev)
1811 {
1812         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1813                 return -ENOMEM;
1814         atomic_inc(&skb->users);
1815         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1816 }
1817
1818 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1819                                           struct packet_type **pt,
1820                                           struct net_device *orig_dev,
1821                                           __be16 type,
1822                                           struct list_head *ptype_list)
1823 {
1824         struct packet_type *ptype, *pt_prev = *pt;
1825
1826         list_for_each_entry_rcu(ptype, ptype_list, list) {
1827                 if (ptype->type != type)
1828                         continue;
1829                 if (pt_prev)
1830                         deliver_skb(skb, pt_prev, orig_dev);
1831                 pt_prev = ptype;
1832         }
1833         *pt = pt_prev;
1834 }
1835
1836 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1837 {
1838         if (!ptype->af_packet_priv || !skb->sk)
1839                 return false;
1840
1841         if (ptype->id_match)
1842                 return ptype->id_match(ptype, skb->sk);
1843         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1844                 return true;
1845
1846         return false;
1847 }
1848
1849 /*
1850  *      Support routine. Sends outgoing frames to any network
1851  *      taps currently in use.
1852  */
1853
1854 void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1855 {
1856         struct packet_type *ptype;
1857         struct sk_buff *skb2 = NULL;
1858         struct packet_type *pt_prev = NULL;
1859         struct list_head *ptype_list = &ptype_all;
1860
1861         rcu_read_lock();
1862 again:
1863         list_for_each_entry_rcu(ptype, ptype_list, list) {
1864                 /* Never send packets back to the socket
1865                  * they originated from - MvS (miquels@drinkel.ow.org)
1866                  */
1867                 if (skb_loop_sk(ptype, skb))
1868                         continue;
1869
1870                 if (pt_prev) {
1871                         deliver_skb(skb2, pt_prev, skb->dev);
1872                         pt_prev = ptype;
1873                         continue;
1874                 }
1875
1876                 /* need to clone skb, done only once */
1877                 skb2 = skb_clone(skb, GFP_ATOMIC);
1878                 if (!skb2)
1879                         goto out_unlock;
1880
1881                 net_timestamp_set(skb2);
1882
1883                 /* skb->nh should be correctly
1884                  * set by sender, so that the second statement is
1885                  * just protection against buggy protocols.
1886                  */
1887                 skb_reset_mac_header(skb2);
1888
1889                 if (skb_network_header(skb2) < skb2->data ||
1890                     skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1891                         net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1892                                              ntohs(skb2->protocol),
1893                                              dev->name);
1894                         skb_reset_network_header(skb2);
1895                 }
1896
1897                 skb2->transport_header = skb2->network_header;
1898                 skb2->pkt_type = PACKET_OUTGOING;
1899                 pt_prev = ptype;
1900         }
1901
1902         if (ptype_list == &ptype_all) {
1903                 ptype_list = &dev->ptype_all;
1904                 goto again;
1905         }
1906 out_unlock:
1907         if (pt_prev)
1908                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1909         rcu_read_unlock();
1910 }
1911 EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
1912
1913 /**
1914  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1915  * @dev: Network device
1916  * @txq: number of queues available
1917  *
1918  * If real_num_tx_queues is changed the tc mappings may no longer be
1919  * valid. To resolve this, verify the tc mapping remains valid and if
1920  * not, zero the mapping. With no priorities mapping to this
1921  * offset/count pair it will no longer be used. In the worst case, if
1922  * TC0 is invalid nothing can be done, so disable priority mappings.
1923  * It is expected that drivers will fix this mapping if they can before
1924  * calling netif_set_real_num_tx_queues.
1925  */
1926 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1927 {
1928         int i;
1929         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1930
1931         /* If TC0 is invalidated disable TC mapping */
1932         if (tc->offset + tc->count > txq) {
1933                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1934                 dev->num_tc = 0;
1935                 return;
1936         }
1937
1938         /* Invalidated prio to tc mappings set to TC0 */
1939         for (i = 1; i < TC_BITMASK + 1; i++) {
1940                 int q = netdev_get_prio_tc_map(dev, i);
1941
1942                 tc = &dev->tc_to_txq[q];
1943                 if (tc->offset + tc->count > txq) {
1944                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1945                                 i, q);
1946                         netdev_set_prio_tc_map(dev, i, 0);
1947                 }
1948         }
1949 }
1950
1951 #ifdef CONFIG_XPS
1952 static DEFINE_MUTEX(xps_map_mutex);
1953 #define xmap_dereference(P)             \
1954         rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1955
1956 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1957                                         int cpu, u16 index)
1958 {
1959         struct xps_map *map = NULL;
1960         int pos;
1961
1962         if (dev_maps)
1963                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1964
1965         for (pos = 0; map && pos < map->len; pos++) {
1966                 if (map->queues[pos] == index) {
1967                         if (map->len > 1) {
1968                                 map->queues[pos] = map->queues[--map->len];
1969                         } else {
1970                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1971                                 kfree_rcu(map, rcu);
1972                                 map = NULL;
1973                         }
1974                         break;
1975                 }
1976         }
1977
1978         return map;
1979 }
1980
1981 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1982 {
1983         struct xps_dev_maps *dev_maps;
1984         int cpu, i;
1985         bool active = false;
1986
1987         mutex_lock(&xps_map_mutex);
1988         dev_maps = xmap_dereference(dev->xps_maps);
1989
1990         if (!dev_maps)
1991                 goto out_no_maps;
1992
1993         for_each_possible_cpu(cpu) {
1994                 for (i = index; i < dev->num_tx_queues; i++) {
1995                         if (!remove_xps_queue(dev_maps, cpu, i))
1996                                 break;
1997                 }
1998                 if (i == dev->num_tx_queues)
1999                         active = true;
2000         }
2001
2002         if (!active) {
2003                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2004                 kfree_rcu(dev_maps, rcu);
2005         }
2006
2007         for (i = index; i < dev->num_tx_queues; i++)
2008                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2009                                              NUMA_NO_NODE);
2010
2011 out_no_maps:
2012         mutex_unlock(&xps_map_mutex);
2013 }
2014
2015 static struct xps_map *expand_xps_map(struct xps_map *map,
2016                                       int cpu, u16 index)
2017 {
2018         struct xps_map *new_map;
2019         int alloc_len = XPS_MIN_MAP_ALLOC;
2020         int i, pos;
2021
2022         for (pos = 0; map && pos < map->len; pos++) {
2023                 if (map->queues[pos] != index)
2024                         continue;
2025                 return map;
2026         }
2027
2028         /* Need to add queue to this CPU's existing map */
2029         if (map) {
2030                 if (pos < map->alloc_len)
2031                         return map;
2032
2033                 alloc_len = map->alloc_len * 2;
2034         }
2035
2036         /* Need to allocate new map to store queue on this CPU's map */
2037         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2038                                cpu_to_node(cpu));
2039         if (!new_map)
2040                 return NULL;
2041
2042         for (i = 0; i < pos; i++)
2043                 new_map->queues[i] = map->queues[i];
2044         new_map->alloc_len = alloc_len;
2045         new_map->len = pos;
2046
2047         return new_map;
2048 }
2049
2050 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2051                         u16 index)
2052 {
2053         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2054         struct xps_map *map, *new_map;
2055         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
2056         int cpu, numa_node_id = -2;
2057         bool active = false;
2058
2059         mutex_lock(&xps_map_mutex);
2060
2061         dev_maps = xmap_dereference(dev->xps_maps);
2062
2063         /* allocate memory for queue storage */
2064         for_each_online_cpu(cpu) {
2065                 if (!cpumask_test_cpu(cpu, mask))
2066                         continue;
2067
2068                 if (!new_dev_maps)
2069                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2070                 if (!new_dev_maps) {
2071                         mutex_unlock(&xps_map_mutex);
2072                         return -ENOMEM;
2073                 }
2074
2075                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2076                                  NULL;
2077
2078                 map = expand_xps_map(map, cpu, index);
2079                 if (!map)
2080                         goto error;
2081
2082                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2083         }
2084
2085         if (!new_dev_maps)
2086                 goto out_no_new_maps;
2087
2088         for_each_possible_cpu(cpu) {
2089                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2090                         /* add queue to CPU maps */
2091                         int pos = 0;
2092
2093                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2094                         while ((pos < map->len) && (map->queues[pos] != index))
2095                                 pos++;
2096
2097                         if (pos == map->len)
2098                                 map->queues[map->len++] = index;
2099 #ifdef CONFIG_NUMA
2100                         if (numa_node_id == -2)
2101                                 numa_node_id = cpu_to_node(cpu);
2102                         else if (numa_node_id != cpu_to_node(cpu))
2103                                 numa_node_id = -1;
2104 #endif
2105                 } else if (dev_maps) {
2106                         /* fill in the new device map from the old device map */
2107                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2108                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2109                 }
2110
2111         }
2112
2113         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2114
2115         /* Cleanup old maps */
2116         if (dev_maps) {
2117                 for_each_possible_cpu(cpu) {
2118                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2119                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2120                         if (map && map != new_map)
2121                                 kfree_rcu(map, rcu);
2122                 }
2123
2124                 kfree_rcu(dev_maps, rcu);
2125         }
2126
2127         dev_maps = new_dev_maps;
2128         active = true;
2129
2130 out_no_new_maps:
2131         /* update Tx queue numa node */
2132         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2133                                      (numa_node_id >= 0) ? numa_node_id :
2134                                      NUMA_NO_NODE);
2135
2136         if (!dev_maps)
2137                 goto out_no_maps;
2138
2139         /* removes queue from unused CPUs */
2140         for_each_possible_cpu(cpu) {
2141                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2142                         continue;
2143
2144                 if (remove_xps_queue(dev_maps, cpu, index))
2145                         active = true;
2146         }
2147
2148         /* free map if not active */
2149         if (!active) {
2150                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2151                 kfree_rcu(dev_maps, rcu);
2152         }
2153
2154 out_no_maps:
2155         mutex_unlock(&xps_map_mutex);
2156
2157         return 0;
2158 error:
2159         /* remove any maps that we added */
2160         for_each_possible_cpu(cpu) {
2161                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2162                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2163                                  NULL;
2164                 if (new_map && new_map != map)
2165                         kfree(new_map);
2166         }
2167
2168         mutex_unlock(&xps_map_mutex);
2169
2170         kfree(new_dev_maps);
2171         return -ENOMEM;
2172 }
2173 EXPORT_SYMBOL(netif_set_xps_queue);
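/*
 * Illustrative sketch (not part of dev.c): pinning Tx queue 0 of "dev"
 * to CPUs 0 and 1 with XPS. The cpumask handling is the caller's.
 *
 *      cpumask_var_t mask;
 *
 *      if (alloc_cpumask_var(&mask, GFP_KERNEL)) {
 *              cpumask_set_cpu(0, mask);
 *              cpumask_set_cpu(1, mask);
 *              netif_set_xps_queue(dev, mask, 0);
 *              free_cpumask_var(mask);
 *      }
 *
 * Passing an empty mask for a queue index removes that queue from the
 * existing per-CPU maps.
 */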
2174
2175 #endif
2176 /*
2177  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2178  * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2179  */
2180 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2181 {
2182         int rc;
2183
2184         if (txq < 1 || txq > dev->num_tx_queues)
2185                 return -EINVAL;
2186
2187         if (dev->reg_state == NETREG_REGISTERED ||
2188             dev->reg_state == NETREG_UNREGISTERING) {
2189                 ASSERT_RTNL();
2190
2191                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2192                                                   txq);
2193                 if (rc)
2194                         return rc;
2195
2196                 if (dev->num_tc)
2197                         netif_setup_tc(dev, txq);
2198
2199                 if (txq < dev->real_num_tx_queues) {
2200                         qdisc_reset_all_tx_gt(dev, txq);
2201 #ifdef CONFIG_XPS
2202                         netif_reset_xps_queues_gt(dev, txq);
2203 #endif
2204                 }
2205         }
2206
2207         dev->real_num_tx_queues = txq;
2208         return 0;
2209 }
2210 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
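/*
 * Illustrative sketch (not part of dev.c): a driver that allocated
 * dev->num_tx_queues queues up front can later shrink the in-use count,
 * e.g. after probing how many hardware rings are usable. "hw_rings"
 * is a hypothetical value.
 *
 *      int err;
 *
 *      rtnl_lock();
 *      err = netif_set_real_num_tx_queues(dev, hw_rings);
 *      rtnl_unlock();
 *
 * When shrinking, qdiscs and XPS maps referring to the removed queues
 * are cleaned up here, so the driver does not have to flush them itself.
 */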
2211
2212 #ifdef CONFIG_SYSFS
2213 /**
2214  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2215  *      @dev: Network device
2216  *      @rxq: Actual number of RX queues
2217  *
2218  *      This must be called either with the rtnl_lock held or before
2219  *      registration of the net device.  Returns 0 on success, or a
2220  *      negative error code.  If called before registration, it always
2221  *      succeeds.
2222  */
2223 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2224 {
2225         int rc;
2226
2227         if (rxq < 1 || rxq > dev->num_rx_queues)
2228                 return -EINVAL;
2229
2230         if (dev->reg_state == NETREG_REGISTERED) {
2231                 ASSERT_RTNL();
2232
2233                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2234                                                   rxq);
2235                 if (rc)
2236                         return rc;
2237         }
2238
2239         dev->real_num_rx_queues = rxq;
2240         return 0;
2241 }
2242 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2243 #endif
2244
2245 /**
2246  * netif_get_num_default_rss_queues - default number of RSS queues
2247  *
2248  * This routine should set an upper limit on the number of RSS queues
2249  * used by default by multiqueue devices.
2250  */
2251 int netif_get_num_default_rss_queues(void)
2252 {
2253         return is_kdump_kernel() ?
2254                 1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2255 }
2256 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
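/*
 * Illustrative sketch (not part of dev.c): a multiqueue driver sizing
 * its queues; "hw_max_queues" is a hypothetical hardware limit.
 *
 *      int num_rxq;
 *
 *      num_rxq = min_t(int, hw_max_queues,
 *                      netif_get_num_default_rss_queues());
 *
 * On a kdump kernel this evaluates to a single queue, keeping the crash
 * kernel's memory footprint small.
 */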
2257
2258 static void __netif_reschedule(struct Qdisc *q)
2259 {
2260         struct softnet_data *sd;
2261         unsigned long flags;
2262
2263         local_irq_save(flags);
2264         sd = this_cpu_ptr(&softnet_data);
2265         q->next_sched = NULL;
2266         *sd->output_queue_tailp = q;
2267         sd->output_queue_tailp = &q->next_sched;
2268         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2269         local_irq_restore(flags);
2270 }
2271
2272 void __netif_schedule(struct Qdisc *q)
2273 {
2274         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2275                 __netif_reschedule(q);
2276 }
2277 EXPORT_SYMBOL(__netif_schedule);
2278
2279 struct dev_kfree_skb_cb {
2280         enum skb_free_reason reason;
2281 };
2282
2283 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2284 {
2285         return (struct dev_kfree_skb_cb *)skb->cb;
2286 }
2287
2288 void netif_schedule_queue(struct netdev_queue *txq)
2289 {
2290         rcu_read_lock();
2291         if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2292                 struct Qdisc *q = rcu_dereference(txq->qdisc);
2293
2294                 __netif_schedule(q);
2295         }
2296         rcu_read_unlock();
2297 }
2298 EXPORT_SYMBOL(netif_schedule_queue);
2299
2300 /**
2301  *      netif_wake_subqueue - allow sending packets on subqueue
2302  *      @dev: network device
2303  *      @queue_index: sub queue index
2304  *
2305  * Resume individual transmit queue of a device with multiple transmit queues.
2306  */
2307 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2308 {
2309         struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2310
2311         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2312                 struct Qdisc *q;
2313
2314                 rcu_read_lock();
2315                 q = rcu_dereference(txq->qdisc);
2316                 __netif_schedule(q);
2317                 rcu_read_unlock();
2318         }
2319 }
2320 EXPORT_SYMBOL(netif_wake_subqueue);
2321
2322 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2323 {
2324         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2325                 struct Qdisc *q;
2326
2327                 rcu_read_lock();
2328                 q = rcu_dereference(dev_queue->qdisc);
2329                 __netif_schedule(q);
2330                 rcu_read_unlock();
2331         }
2332 }
2333 EXPORT_SYMBOL(netif_tx_wake_queue);
2334
2335 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2336 {
2337         unsigned long flags;
2338
2339         if (likely(atomic_read(&skb->users) == 1)) {
2340                 smp_rmb();
2341                 atomic_set(&skb->users, 0);
2342         } else if (likely(!atomic_dec_and_test(&skb->users))) {
2343                 return;
2344         }
2345         get_kfree_skb_cb(skb)->reason = reason;
2346         local_irq_save(flags);
2347         skb->next = __this_cpu_read(softnet_data.completion_queue);
2348         __this_cpu_write(softnet_data.completion_queue, skb);
2349         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2350         local_irq_restore(flags);
2351 }
2352 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2353
2354 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2355 {
2356         if (in_irq() || irqs_disabled())
2357                 __dev_kfree_skb_irq(skb, reason);
2358         else
2359                 dev_kfree_skb(skb);
2360 }
2361 EXPORT_SYMBOL(__dev_kfree_skb_any);
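/*
 * Illustrative sketch (not part of dev.c): a Tx completion handler that
 * may run in hard-irq context uses the _any variants, which defer the
 * actual free to the NET_TX softirq completion queue when needed.
 *
 *      if (unlikely(tx_error))
 *              dev_kfree_skb_any(skb);
 *      else
 *              dev_consume_skb_any(skb);
 *
 * "tx_error" is a hypothetical per-descriptor status flag. Both helpers
 * are safe from hard-irq, softirq and process context.
 */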
2362
2363
2364 /**
2365  * netif_device_detach - mark device as removed
2366  * @dev: network device
2367  *
2368  * Mark device as removed from system and therefore no longer available.
2369  */
2370 void netif_device_detach(struct net_device *dev)
2371 {
2372         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2373             netif_running(dev)) {
2374                 netif_tx_stop_all_queues(dev);
2375         }
2376 }
2377 EXPORT_SYMBOL(netif_device_detach);
2378
2379 /**
2380  * netif_device_attach - mark device as attached
2381  * @dev: network device
2382  *
2383  * Mark device as attached from system and restart if needed.
2384  */
2385 void netif_device_attach(struct net_device *dev)
2386 {
2387         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2388             netif_running(dev)) {
2389                 netif_tx_wake_all_queues(dev);
2390                 __netdev_watchdog_up(dev);
2391         }
2392 }
2393 EXPORT_SYMBOL(netif_device_attach);
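/*
 * Illustrative sketch (not part of dev.c): a typical pairing in a
 * driver's power-management callbacks. Names are hypothetical.
 *
 *      static int my_suspend(struct device *d)
 *      {
 *              struct net_device *dev = dev_get_drvdata(d);
 *
 *              netif_device_detach(dev);
 *              return 0;
 *      }
 *
 *      static int my_resume(struct device *d)
 *      {
 *              struct net_device *dev = dev_get_drvdata(d);
 *
 *              netif_device_attach(dev);
 *              return 0;
 *      }
 *
 * Detach stops all Tx queues while the device is running; attach wakes
 * them and restarts the watchdog.
 */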
2394
2395 /*
2396  * Returns a Tx hash based on the given packet descriptor and the number of
2397  * Tx queues to be used as a distribution range.
2398  */
2399 u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2400                   unsigned int num_tx_queues)
2401 {
2402         u32 hash;
2403         u16 qoffset = 0;
2404         u16 qcount = num_tx_queues;
2405
2406         if (skb_rx_queue_recorded(skb)) {
2407                 hash = skb_get_rx_queue(skb);
2408                 while (unlikely(hash >= num_tx_queues))
2409                         hash -= num_tx_queues;
2410                 return hash;
2411         }
2412
2413         if (dev->num_tc) {
2414                 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2415                 qoffset = dev->tc_to_txq[tc].offset;
2416                 qcount = dev->tc_to_txq[tc].count;
2417         }
2418
2419         return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2420 }
2421 EXPORT_SYMBOL(__skb_tx_hash);
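/*
 * Worked example (illustrative only): reciprocal_scale(h, qcount) maps a
 * 32-bit hash h into [0, qcount) as (u32)(((u64)h * qcount) >> 32).  With
 * qcount = 8 and h = 0x80000000 the result is 4; if the skb's priority
 * maps to a traffic class whose queue range starts at qoffset = 8, the
 * selected Tx queue is 8 + 4 = 12.
 */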
2422
2423 static void skb_warn_bad_offload(const struct sk_buff *skb)
2424 {
2425         static const netdev_features_t null_features;
2426         struct net_device *dev = skb->dev;
2427         const char *name = "";
2428
2429         if (!net_ratelimit())
2430                 return;
2431
2432         if (dev) {
2433                 if (dev->dev.parent)
2434                         name = dev_driver_string(dev->dev.parent);
2435                 else
2436                         name = netdev_name(dev);
2437         }
2438         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2439              "gso_type=%d ip_summed=%d\n",
2440              name, dev ? &dev->features : &null_features,
2441              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2442              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2443              skb_shinfo(skb)->gso_type, skb->ip_summed);
2444 }
2445
2446 /*
2447  * Invalidate hardware checksum when packet is to be mangled, and
2448  * complete checksum manually on outgoing path.
2449  */
2450 int skb_checksum_help(struct sk_buff *skb)
2451 {
2452         __wsum csum;
2453         int ret = 0, offset;
2454
2455         if (skb->ip_summed == CHECKSUM_COMPLETE)
2456                 goto out_set_summed;
2457
2458         if (unlikely(skb_shinfo(skb)->gso_size)) {
2459                 skb_warn_bad_offload(skb);
2460                 return -EINVAL;
2461         }
2462
2463         /* Before computing a checksum, we should make sure no frag could
2464          * be modified by an external entity: checksum could be wrong.
2465          */
2466         if (skb_has_shared_frag(skb)) {
2467                 ret = __skb_linearize(skb);
2468                 if (ret)
2469                         goto out;
2470         }
2471
2472         offset = skb_checksum_start_offset(skb);
2473         BUG_ON(offset >= skb_headlen(skb));
2474         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2475
2476         offset += skb->csum_offset;
2477         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2478
2479         if (skb_cloned(skb) &&
2480             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2481                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2482                 if (ret)
2483                         goto out;
2484         }
2485
2486         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2487 out_set_summed:
2488         skb->ip_summed = CHECKSUM_NONE;
2489 out:
2490         return ret;
2491 }
2492 EXPORT_SYMBOL(skb_checksum_help);
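/*
 * Illustrative sketch (not part of dev.c): a driver whose hardware cannot
 * checksum a given packet can fall back to software before queueing it.
 * The feature test shown is simplified.
 *
 *      if (skb->ip_summed == CHECKSUM_PARTIAL &&
 *          !(dev->features & NETIF_F_HW_CSUM) &&
 *          skb_checksum_help(skb))
 *              goto drop;
 *
 * validate_xmit_skb() below performs the same fallback for the common
 * transmit path when the resolved features lack NETIF_F_CSUM_MASK.
 */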
2493
2494 /* skb_csum_offload_check - Driver helper function to determine if a device
2495  * with limited checksum offload capabilities is able to offload the checksum
2496  * for a given packet.
2497  *
2498  * Arguments:
2499  *   skb - sk_buff for the packet in question
2500  *   spec - contains the description of what device can offload
2501  *   csum_encapped - returns true if the checksum being offloaded is
2502  *            encapsulated, that is, it is the checksum for the transport
2503  *            header in the inner headers.
2504  *   checksum_help - when set indicates that helper function should
2505  *            call skb_checksum_help if offload checks fail
2506  *
2507  * Returns:
2508  *   true: Packet has passed the checksum checks and should be offloadable to
2509  *         the device (a driver may still need to check for additional
2510  *         restrictions of its device)
2511  *   false: Checksum is not offloadable. If checksum_help was set, then
2512  *         skb_checksum_help was called to resolve the checksum for non-GSO
2513  *         packets when the IP protocol is not SCTP.
2514  */
2515 bool __skb_csum_offload_chk(struct sk_buff *skb,
2516                             const struct skb_csum_offl_spec *spec,
2517                             bool *csum_encapped,
2518                             bool csum_help)
2519 {
2520         struct iphdr *iph;
2521         struct ipv6hdr *ipv6;
2522         void *nhdr;
2523         int protocol;
2524         u8 ip_proto;
2525
2526         if (skb->protocol == htons(ETH_P_8021Q) ||
2527             skb->protocol == htons(ETH_P_8021AD)) {
2528                 if (!spec->vlan_okay)
2529                         goto need_help;
2530         }
2531
2532         /* We check whether the checksum refers to a transport layer checksum in
2533          * the outermost header or an encapsulated transport layer checksum that
2534          * corresponds to the inner headers of the skb. If the checksum is for
2535          * something else in the packet we need help.
2536          */
2537         if (skb_checksum_start_offset(skb) == skb_transport_offset(skb)) {
2538                 /* Non-encapsulated checksum */
2539                 protocol = eproto_to_ipproto(vlan_get_protocol(skb));
2540                 nhdr = skb_network_header(skb);
2541                 *csum_encapped = false;
2542                 if (spec->no_not_encapped)
2543                         goto need_help;
2544         } else if (skb->encapsulation && spec->encap_okay &&
2545                    skb_checksum_start_offset(skb) ==
2546                    skb_inner_transport_offset(skb)) {
2547                 /* Encapsulated checksum */
2548                 *csum_encapped = true;
2549                 switch (skb->inner_protocol_type) {
2550                 case ENCAP_TYPE_ETHER:
2551                         protocol = eproto_to_ipproto(skb->inner_protocol);
2552                         break;
2553                 case ENCAP_TYPE_IPPROTO:
2554                         protocol = skb->inner_protocol;
2555                         break;
2556                 }
2557                 nhdr = skb_inner_network_header(skb);
2558         } else {
2559                 goto need_help;
2560         }
2561
2562         switch (protocol) {
2563         case IPPROTO_IP:
2564                 if (!spec->ipv4_okay)
2565                         goto need_help;
2566                 iph = nhdr;
2567                 ip_proto = iph->protocol;
2568                 if (iph->ihl != 5 && !spec->ip_options_okay)
2569                         goto need_help;
2570                 break;
2571         case IPPROTO_IPV6:
2572                 if (!spec->ipv6_okay)
2573                         goto need_help;
2574                 if (spec->no_encapped_ipv6 && *csum_encapped)
2575                         goto need_help;
2576                 ipv6 = nhdr;
2577                 nhdr += sizeof(*ipv6);
2578                 ip_proto = ipv6->nexthdr;
2579                 break;
2580         default:
2581                 goto need_help;
2582         }
2583
2584 ip_proto_again:
2585         switch (ip_proto) {
2586         case IPPROTO_TCP:
2587                 if (!spec->tcp_okay ||
2588                     skb->csum_offset != offsetof(struct tcphdr, check))
2589                         goto need_help;
2590                 break;
2591         case IPPROTO_UDP:
2592                 if (!spec->udp_okay ||
2593                     skb->csum_offset != offsetof(struct udphdr, check))
2594                         goto need_help;
2595                 break;
2596         case IPPROTO_SCTP:
2597                 if (!spec->sctp_okay ||
2598                     skb->csum_offset != offsetof(struct sctphdr, checksum))
2599                         goto cant_help;
2600                 break;
2601         case NEXTHDR_HOP:
2602         case NEXTHDR_ROUTING:
2603         case NEXTHDR_DEST: {
2604                 u8 *opthdr = nhdr;
2605
2606                 if (protocol != IPPROTO_IPV6 || !spec->ext_hdrs_okay)
2607                         goto need_help;
2608
2609                 ip_proto = opthdr[0];
2610                 nhdr += (opthdr[1] + 1) << 3;
2611
2612                 goto ip_proto_again;
2613         }
2614         default:
2615                 goto need_help;
2616         }
2617
2618         /* Passed the tests for offloading checksum */
2619         return true;
2620
2621 need_help:
2622         if (csum_help && !skb_shinfo(skb)->gso_size)
2623                 skb_checksum_help(skb);
2624 cant_help:
2625         return false;
2626 }
2627 EXPORT_SYMBOL(__skb_csum_offload_chk);
2628
2629 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2630 {
2631         __be16 type = skb->protocol;
2632
2633         /* Tunnel gso handlers can set protocol to ethernet. */
2634         if (type == htons(ETH_P_TEB)) {
2635                 struct ethhdr *eth;
2636
2637                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2638                         return 0;
2639
2640                 eth = (struct ethhdr *)skb_mac_header(skb);
2641                 type = eth->h_proto;
2642         }
2643
2644         return __vlan_get_protocol(skb, type, depth);
2645 }
2646
2647 /**
2648  *      skb_mac_gso_segment - mac layer segmentation handler.
2649  *      @skb: buffer to segment
2650  *      @features: features for the output path (see dev->features)
2651  */
2652 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2653                                     netdev_features_t features)
2654 {
2655         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2656         struct packet_offload *ptype;
2657         int vlan_depth = skb->mac_len;
2658         __be16 type = skb_network_protocol(skb, &vlan_depth);
2659
2660         if (unlikely(!type))
2661                 return ERR_PTR(-EINVAL);
2662
2663         __skb_pull(skb, vlan_depth);
2664
2665         rcu_read_lock();
2666         list_for_each_entry_rcu(ptype, &offload_base, list) {
2667                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2668                         segs = ptype->callbacks.gso_segment(skb, features);
2669                         break;
2670                 }
2671         }
2672         rcu_read_unlock();
2673
2674         __skb_push(skb, skb->data - skb_mac_header(skb));
2675
2676         return segs;
2677 }
2678 EXPORT_SYMBOL(skb_mac_gso_segment);
2679
2680
2681 /* openvswitch calls this on rx path, so we need a different check.
2682  */
2683 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2684 {
2685         if (tx_path)
2686                 return skb->ip_summed != CHECKSUM_PARTIAL;
2687         else
2688                 return skb->ip_summed == CHECKSUM_NONE;
2689 }
2690
2691 /**
2692  *      __skb_gso_segment - Perform segmentation on skb.
2693  *      @skb: buffer to segment
2694  *      @features: features for the output path (see dev->features)
2695  *      @tx_path: whether it is called in TX path
2696  *
2697  *      This function segments the given skb and returns a list of segments.
2698  *
2699  *      It may return NULL if the skb requires no segmentation.  This is
2700  *      only possible when GSO is used for verifying header integrity.
2701  *
2702  *      Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2703  */
2704 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2705                                   netdev_features_t features, bool tx_path)
2706 {
2707         if (unlikely(skb_needs_check(skb, tx_path))) {
2708                 int err;
2709
2710                 skb_warn_bad_offload(skb);
2711
2712                 err = skb_cow_head(skb, 0);
2713                 if (err < 0)
2714                         return ERR_PTR(err);
2715         }
2716
2717         /* Only report GSO partial support if it will enable us to
2718          * support segmentation on this frame without needing additional
2719          * work.
2720          */
2721         if (features & NETIF_F_GSO_PARTIAL) {
2722                 netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
2723                 struct net_device *dev = skb->dev;
2724
2725                 partial_features |= dev->features & dev->gso_partial_features;
2726                 if (!skb_gso_ok(skb, features | partial_features))
2727                         features &= ~NETIF_F_GSO_PARTIAL;
2728         }
2729
2730         BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2731                      sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2732
2733         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2734         SKB_GSO_CB(skb)->encap_level = 0;
2735
2736         skb_reset_mac_header(skb);
2737         skb_reset_mac_len(skb);
2738
2739         return skb_mac_gso_segment(skb, features);
2740 }
2741 EXPORT_SYMBOL(__skb_gso_segment);
2742
2743 /* Take action when hardware reception checksum errors are detected. */
2744 #ifdef CONFIG_BUG
2745 void netdev_rx_csum_fault(struct net_device *dev)
2746 {
2747         if (net_ratelimit()) {
2748                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2749                 dump_stack();
2750         }
2751 }
2752 EXPORT_SYMBOL(netdev_rx_csum_fault);
2753 #endif
2754
2755 /* Actually, we should eliminate this check as soon as we know that:
2756  * 1. IOMMU is present and allows us to map all the memory.
2757  * 2. No high memory really exists on this machine.
2758  */
2759
2760 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2761 {
2762 #ifdef CONFIG_HIGHMEM
2763         int i;
2764         if (!(dev->features & NETIF_F_HIGHDMA)) {
2765                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2766                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2767                         if (PageHighMem(skb_frag_page(frag)))
2768                                 return 1;
2769                 }
2770         }
2771
2772         if (PCI_DMA_BUS_IS_PHYS) {
2773                 struct device *pdev = dev->dev.parent;
2774
2775                 if (!pdev)
2776                         return 0;
2777                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2778                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2779                         dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2780                         if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2781                                 return 1;
2782                 }
2783         }
2784 #endif
2785         return 0;
2786 }
2787
2788 /* If MPLS offload request, verify we are testing hardware MPLS features
2789  * instead of standard features for the netdev.
2790  */
2791 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2792 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2793                                            netdev_features_t features,
2794                                            __be16 type)
2795 {
2796         if (eth_p_mpls(type))
2797                 features &= skb->dev->mpls_features;
2798
2799         return features;
2800 }
2801 #else
2802 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2803                                            netdev_features_t features,
2804                                            __be16 type)
2805 {
2806         return features;
2807 }
2808 #endif
2809
2810 static netdev_features_t harmonize_features(struct sk_buff *skb,
2811         netdev_features_t features)
2812 {
2813         int tmp;
2814         __be16 type;
2815
2816         type = skb_network_protocol(skb, &tmp);
2817         features = net_mpls_features(skb, features, type);
2818
2819         if (skb->ip_summed != CHECKSUM_NONE &&
2820             !can_checksum_protocol(features, type)) {
2821                 features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
2822         } else if (illegal_highdma(skb->dev, skb)) {
2823                 features &= ~NETIF_F_SG;
2824         }
2825
2826         return features;
2827 }
2828
2829 netdev_features_t passthru_features_check(struct sk_buff *skb,
2830                                           struct net_device *dev,
2831                                           netdev_features_t features)
2832 {
2833         return features;
2834 }
2835 EXPORT_SYMBOL(passthru_features_check);
2836
2837 static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2838                                              struct net_device *dev,
2839                                              netdev_features_t features)
2840 {
2841         return vlan_features_check(skb, features);
2842 }
2843
2844 static netdev_features_t gso_features_check(const struct sk_buff *skb,
2845                                             struct net_device *dev,
2846                                             netdev_features_t features)
2847 {
2848         u16 gso_segs = skb_shinfo(skb)->gso_segs;
2849
2850         if (gso_segs > dev->gso_max_segs)
2851                 return features & ~NETIF_F_GSO_MASK;
2852
2853         /* Support for GSO partial features requires software
2854          * intervention before we can actually process the packets
2855          * so we need to strip support for any partial features now
2856          * and we can pull them back in after we have partially
2857          * segmented the frame.
2858          */
2859         if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
2860                 features &= ~dev->gso_partial_features;
2861
2862         /* Make sure to clear the IPv4 ID mangling feature if the
2863          * IPv4 header has the potential to be fragmented.
2864          */
2865         if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
2866                 struct iphdr *iph = skb->encapsulation ?
2867                                     inner_ip_hdr(skb) : ip_hdr(skb);
2868
2869                 if (!(iph->frag_off & htons(IP_DF)))
2870                         features &= ~NETIF_F_TSO_MANGLEID;
2871         }
2872
2873         return features;
2874 }
2875
2876 netdev_features_t netif_skb_features(struct sk_buff *skb)
2877 {
2878         struct net_device *dev = skb->dev;
2879         netdev_features_t features = dev->features;
2880
2881         if (skb_is_gso(skb))
2882                 features = gso_features_check(skb, dev, features);
2883
2884         /* If encapsulation offload request, verify we are testing
2885          * hardware encapsulation features instead of standard
2886          * features for the netdev
2887          */
2888         if (skb->encapsulation)
2889                 features &= dev->hw_enc_features;
2890
2891         if (skb_vlan_tagged(skb))
2892                 features = netdev_intersect_features(features,
2893                                                      dev->vlan_features |
2894                                                      NETIF_F_HW_VLAN_CTAG_TX |
2895                                                      NETIF_F_HW_VLAN_STAG_TX);
2896
2897         if (dev->netdev_ops->ndo_features_check)
2898                 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2899                                                                 features);
2900         else
2901                 features &= dflt_features_check(skb, dev, features);
2902
2903         return harmonize_features(skb, features);
2904 }
2905 EXPORT_SYMBOL(netif_skb_features);
2906
2907 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2908                     struct netdev_queue *txq, bool more)
2909 {
2910         unsigned int len;
2911         int rc;
2912
2913         if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2914                 dev_queue_xmit_nit(skb, dev);
2915
2916         len = skb->len;
2917         trace_net_dev_start_xmit(skb, dev);
2918         rc = netdev_start_xmit(skb, dev, txq, more);
2919         trace_net_dev_xmit(skb, rc, dev, len);
2920
2921         return rc;
2922 }
2923
2924 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2925                                     struct netdev_queue *txq, int *ret)
2926 {
2927         struct sk_buff *skb = first;
2928         int rc = NETDEV_TX_OK;
2929
2930         while (skb) {
2931                 struct sk_buff *next = skb->next;
2932
2933                 skb->next = NULL;
2934                 rc = xmit_one(skb, dev, txq, next != NULL);
2935                 if (unlikely(!dev_xmit_complete(rc))) {
2936                         skb->next = next;
2937                         goto out;
2938                 }
2939
2940                 skb = next;
2941                 if (netif_xmit_stopped(txq) && skb) {
2942                         rc = NETDEV_TX_BUSY;
2943                         break;
2944                 }
2945         }
2946
2947 out:
2948         *ret = rc;
2949         return skb;
2950 }
2951
2952 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2953                                           netdev_features_t features)
2954 {
2955         if (skb_vlan_tag_present(skb) &&
2956             !vlan_hw_offload_capable(features, skb->vlan_proto))
2957                 skb = __vlan_hwaccel_push_inside(skb);
2958         return skb;
2959 }
2960
2961 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2962 {
2963         netdev_features_t features;
2964
2965         features = netif_skb_features(skb);
2966         skb = validate_xmit_vlan(skb, features);
2967         if (unlikely(!skb))
2968                 goto out_null;
2969
2970         if (netif_needs_gso(skb, features)) {
2971                 struct sk_buff *segs;
2972
2973                 segs = skb_gso_segment(skb, features);
2974                 if (IS_ERR(segs)) {
2975                         goto out_kfree_skb;
2976                 } else if (segs) {
2977                         consume_skb(skb);
2978                         skb = segs;
2979                 }
2980         } else {
2981                 if (skb_needs_linearize(skb, features) &&
2982                     __skb_linearize(skb))
2983                         goto out_kfree_skb;
2984
2985                 /* If packet is not checksummed and device does not
2986                  * support checksumming for this protocol, complete
2987                  * checksumming here.
2988                  */
2989                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2990                         if (skb->encapsulation)
2991                                 skb_set_inner_transport_header(skb,
2992                                                                skb_checksum_start_offset(skb));
2993                         else
2994                                 skb_set_transport_header(skb,
2995                                                          skb_checksum_start_offset(skb));
2996                         if (!(features & NETIF_F_CSUM_MASK) &&
2997                             skb_checksum_help(skb))
2998                                 goto out_kfree_skb;
2999                 }
3000         }
3001
3002         return skb;
3003
3004 out_kfree_skb:
3005         kfree_skb(skb);
3006 out_null:
3007         atomic_long_inc(&dev->tx_dropped);
3008         return NULL;
3009 }
3010
3011 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
3012 {
3013         struct sk_buff *next, *head = NULL, *tail;
3014
3015         for (; skb != NULL; skb = next) {
3016                 next = skb->next;
3017                 skb->next = NULL;
3018
3019                 /* in case skb won't be segmented, point to itself */
3020                 skb->prev = skb;
3021
3022                 skb = validate_xmit_skb(skb, dev);
3023                 if (!skb)
3024                         continue;
3025
3026                 if (!head)
3027                         head = skb;
3028                 else
3029                         tail->next = skb;
3030                 /* If skb was segmented, skb->prev points to
3031                  * the last segment. If not, it still contains skb.
3032                  */
3033                 tail = skb->prev;
3034         }
3035         return head;
3036 }
3037
3038 static void qdisc_pkt_len_init(struct sk_buff *skb)
3039 {
3040         const struct skb_shared_info *shinfo = skb_shinfo(skb);
3041
3042         qdisc_skb_cb(skb)->pkt_len = skb->len;
3043
3044         /* To get more precise estimation of bytes sent on wire,
3045          * we add to pkt_len the headers size of all segments
3046          */
3047         if (shinfo->gso_size)  {
3048                 unsigned int hdr_len;
3049                 u16 gso_segs = shinfo->gso_segs;
3050
3051                 /* mac layer + network layer */
3052                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3053
3054                 /* + transport layer */
3055                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
3056                         hdr_len += tcp_hdrlen(skb);
3057                 else
3058                         hdr_len += sizeof(struct udphdr);
3059
3060                 if (shinfo->gso_type & SKB_GSO_DODGY)
3061                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3062                                                 shinfo->gso_size);
3063
3064                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3065         }
3066 }
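
/* Worked example (illustrative, assumed numbers): for a TCP GSO skb with
 * skb->len = 65226, hdr_len = 66 (14 ethernet + 20 IP + 32 TCP) and
 * gso_size = 1448, the device will emit gso_segs = 45 frames on the wire,
 * so the 44 extra copies of the headers are accounted for here:
 * pkt_len = 65226 + (45 - 1) * 66 = 68130 bytes.
 */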
3067
3068 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3069                                  struct net_device *dev,
3070                                  struct netdev_queue *txq)
3071 {
3072         spinlock_t *root_lock = qdisc_lock(q);
3073         bool contended;
3074         int rc;
3075
3076         qdisc_calculate_pkt_len(skb, q);
3077         /*
3078          * Heuristic: force contended enqueues to serialize on a
3079          * separate lock before trying to take the qdisc main lock.
3080          * This lets the qdisc->running owner take the main lock more
3081          * often and dequeue packets faster.
3082          */
3083         contended = qdisc_is_running(q);
3084         if (unlikely(contended))
3085                 spin_lock(&q->busylock);
3086
3087         spin_lock(root_lock);
3088         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3089                 kfree_skb(skb);
3090                 rc = NET_XMIT_DROP;
3091         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3092                    qdisc_run_begin(q)) {
3093                 /*
3094                  * This is a work-conserving queue; there are no old skbs
3095                  * waiting to be sent out; and the qdisc is not running -
3096                  * xmit the skb directly.
3097                  */
3098
3099                 qdisc_bstats_update(q, skb);
3100
3101                 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3102                         if (unlikely(contended)) {
3103                                 spin_unlock(&q->busylock);
3104                                 contended = false;
3105                         }
3106                         __qdisc_run(q);
3107                 } else
3108                         qdisc_run_end(q);
3109
3110                 rc = NET_XMIT_SUCCESS;
3111         } else {
3112                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
3113                 if (qdisc_run_begin(q)) {
3114                         if (unlikely(contended)) {
3115                                 spin_unlock(&q->busylock);
3116                                 contended = false;
3117                         }
3118                         __qdisc_run(q);
3119                 }
3120         }
3121         spin_unlock(root_lock);
3122         if (unlikely(contended))
3123                 spin_unlock(&q->busylock);
3124         return rc;
3125 }
3126
3127 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3128 static void skb_update_prio(struct sk_buff *skb)
3129 {
3130         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
3131
3132         if (!skb->priority && skb->sk && map) {
3133                 unsigned int prioidx =
3134                         sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
3135
3136                 if (prioidx < map->priomap_len)
3137                         skb->priority = map->priomap[prioidx];
3138         }
3139 }
3140 #else
3141 #define skb_update_prio(skb)
3142 #endif
3143
3144 DEFINE_PER_CPU(int, xmit_recursion);
3145 EXPORT_SYMBOL(xmit_recursion);
3146
3147 /**
3148  *      dev_loopback_xmit - loop back @skb
3149  *      @net: network namespace this loopback is happening in
3150  *      @sk:  socket; needed so this function can be used as a netfilter okfn
3151  *      @skb: buffer to transmit
3152  */
3153 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3154 {
3155         skb_reset_mac_header(skb);
3156         __skb_pull(skb, skb_network_offset(skb));
3157         skb->pkt_type = PACKET_LOOPBACK;
3158         skb->ip_summed = CHECKSUM_UNNECESSARY;
3159         WARN_ON(!skb_dst(skb));
3160         skb_dst_force(skb);
3161         netif_rx_ni(skb);
3162         return 0;
3163 }
3164 EXPORT_SYMBOL(dev_loopback_xmit);
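
/* Example (illustrative sketch, not part of this file): dev_loopback_xmit()
 * is meant to be used as the okfn of an NF_HOOK() invocation when a local
 * copy of an outgoing packet (e.g. looped-back multicast) should be fed back
 * into the receive path.  The helper below is hypothetical and assumes the
 * usual netfilter includes; the IPv4 multicast output path does something
 * very similar.
 */
static int example_loop_back_copy(struct net *net, struct sock *sk,
				  struct sk_buff *skb)
{
	/* Clone the outgoing skb and run the copy through POST_ROUTING;
	 * dev_loopback_xmit() then re-injects it via netif_rx_ni().
	 */
	struct sk_buff *copy = skb_clone(skb, GFP_ATOMIC);

	if (!copy)
		return -ENOMEM;
	return NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, copy,
		       NULL, copy->dev, dev_loopback_xmit);
}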
3165
3166 #ifdef CONFIG_NET_EGRESS
3167 static struct sk_buff *
3168 sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3169 {
3170         struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list);
3171         struct tcf_result cl_res;
3172
3173         if (!cl)
3174                 return skb;
3175
3176         /* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set
3177          * earlier by the caller.
3178          */
3179         qdisc_bstats_cpu_update(cl->q, skb);
3180
3181         switch (tc_classify(skb, cl, &cl_res, false)) {
3182         case TC_ACT_OK:
3183         case TC_ACT_RECLASSIFY:
3184                 skb->tc_index = TC_H_MIN(cl_res.classid);
3185                 break;
3186         case TC_ACT_SHOT:
3187                 qdisc_qstats_cpu_drop(cl->q);
3188                 *ret = NET_XMIT_DROP;
3189                 kfree_skb(skb);
3190                 return NULL;
3191         case TC_ACT_STOLEN:
3192         case TC_ACT_QUEUED:
3193                 *ret = NET_XMIT_SUCCESS;
3194                 consume_skb(skb);
3195                 return NULL;
3196         case TC_ACT_REDIRECT:
3197                 /* No need to push/pop skb's mac_header here on egress! */
3198                 skb_do_redirect(skb);
3199                 *ret = NET_XMIT_SUCCESS;
3200                 return NULL;
3201         default:
3202                 break;
3203         }
3204
3205         return skb;
3206 }
3207 #endif /* CONFIG_NET_EGRESS */
3208
3209 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
3210 {
3211 #ifdef CONFIG_XPS
3212         struct xps_dev_maps *dev_maps;
3213         struct xps_map *map;
3214         int queue_index = -1;
3215
3216         rcu_read_lock();
3217         dev_maps = rcu_dereference(dev->xps_maps);
3218         if (dev_maps) {
3219                 map = rcu_dereference(
3220                     dev_maps->cpu_map[skb->sender_cpu - 1]);
3221                 if (map) {
3222                         if (map->len == 1)
3223                                 queue_index = map->queues[0];
3224                         else
3225                                 queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
3226                                                                            map->len)];
3227                         if (unlikely(queue_index >= dev->real_num_tx_queues))
3228                                 queue_index = -1;
3229                 }
3230         }
3231         rcu_read_unlock();
3232
3233         return queue_index;
3234 #else
3235         return -1;
3236 #endif
3237 }
3238
3239 static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3240 {
3241         struct sock *sk = skb->sk;
3242         int queue_index = sk_tx_queue_get(sk);
3243
3244         if (queue_index < 0 || skb->ooo_okay ||
3245             queue_index >= dev->real_num_tx_queues) {
3246                 int new_index = get_xps_queue(dev, skb);
3247                 if (new_index < 0)
3248                         new_index = skb_tx_hash(dev, skb);
3249
3250                 if (queue_index != new_index && sk &&
3251                     sk_fullsock(sk) &&
3252                     rcu_access_pointer(sk->sk_dst_cache))
3253                         sk_tx_queue_set(sk, new_index);
3254
3255                 queue_index = new_index;
3256         }
3257
3258         return queue_index;
3259 }
3260
3261 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3262                                     struct sk_buff *skb,
3263                                     void *accel_priv)
3264 {
3265         int queue_index = 0;
3266
3267 #ifdef CONFIG_XPS
3268         u32 sender_cpu = skb->sender_cpu - 1;
3269
3270         if (sender_cpu >= (u32)NR_CPUS)
3271                 skb->sender_cpu = raw_smp_processor_id() + 1;
3272 #endif
3273
3274         if (dev->real_num_tx_queues != 1) {
3275                 const struct net_device_ops *ops = dev->netdev_ops;
3276                 if (ops->ndo_select_queue)
3277                         queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3278                                                             __netdev_pick_tx);
3279                 else
3280                         queue_index = __netdev_pick_tx(dev, skb);
3281
3282                 if (!accel_priv)
3283                         queue_index = netdev_cap_txqueue(dev, queue_index);
3284         }
3285
3286         skb_set_queue_mapping(skb, queue_index);
3287         return netdev_get_tx_queue(dev, queue_index);
3288 }
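
/* Example (illustrative sketch, not part of this file): a multiqueue driver
 * that wants its own queue selection implements .ndo_select_queue and can
 * defer to the stack's default pick through the fallback argument, which
 * netdev_pick_tx() passes as __netdev_pick_tx above.  The driver name and
 * priority policy below are hypothetical; the driver would set
 * .ndo_select_queue = example_select_queue in its net_device_ops.
 */
static u16 example_select_queue(struct net_device *dev, struct sk_buff *skb,
				void *accel_priv,
				select_queue_fallback_t fallback)
{
	/* Hypothetical policy: pin control traffic to queue 0 and let the
	 * stack (XPS map or flow hash) spread everything else.
	 */
	if (skb->priority == TC_PRIO_CONTROL)
		return 0;
	return fallback(dev, skb);
}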
3289
3290 /**
3291  *      __dev_queue_xmit - transmit a buffer
3292  *      @skb: buffer to transmit
3293  *      @accel_priv: private data used for L2 forwarding offload
3294  *
3295  *      Queue a buffer for transmission to a network device. The caller must
3296  *      have set the device and priority and built the buffer before calling
3297  *      this function. The function can be called from an interrupt.
3298  *
3299  *      A negative errno code is returned on a failure. A success does not
3300  *      guarantee the frame will be transmitted as it may be dropped due
3301  *      to congestion or traffic shaping.
3302  *
3303  * -----------------------------------------------------------------------------------
3304  *      Note that this function can also return errors from the queue
3305  *      disciplines, including NET_XMIT_DROP, which is a positive value;
3306  *      so errors can be positive as well.
3307  *
3308  *      Regardless of the return value, the skb is consumed, so it is currently
3309  *      difficult to retry a failed send.  (You can bump the ref count before
3310  *      sending to hold a reference for retry if you are careful.)
3311  *
3312  *      When calling this method, interrupts MUST be enabled.  This is because
3313  *      the BH enable code must have IRQs enabled so that it will not deadlock.
3314  *          --BLG
3315  */
3316 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3317 {
3318         struct net_device *dev = skb->dev;
3319         struct netdev_queue *txq;
3320         struct Qdisc *q;
3321         int rc = -ENOMEM;
3322
3323         skb_reset_mac_header(skb);
3324
3325         if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3326                 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3327
3328         /* Disable soft irqs for various locks below. Also
3329          * stops preemption for RCU.
3330          */
3331         rcu_read_lock_bh();
3332
3333         skb_update_prio(skb);
3334
3335         qdisc_pkt_len_init(skb);
3336 #ifdef CONFIG_NET_CLS_ACT
3337         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
3338 # ifdef CONFIG_NET_EGRESS
3339         if (static_key_false(&egress_needed)) {
3340                 skb = sch_handle_egress(skb, &rc, dev);
3341                 if (!skb)
3342                         goto out;
3343         }
3344 # endif
3345 #endif
3346         /* If the device/qdisc doesn't need skb->dst, release it right now
3347          * while it's still hot in this CPU's cache.
3348          */
3349         if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3350                 skb_dst_drop(skb);
3351         else
3352                 skb_dst_force(skb);
3353
3354 #ifdef CONFIG_NET_SWITCHDEV
3355         /* Don't forward if offload device already forwarded */
3356         if (skb->offload_fwd_mark &&
3357             skb->offload_fwd_mark == dev->offload_fwd_mark) {
3358                 consume_skb(skb);
3359                 rc = NET_XMIT_SUCCESS;
3360                 goto out;
3361         }
3362 #endif
3363
3364         txq = netdev_pick_tx(dev, skb, accel_priv);
3365         q = rcu_dereference_bh(txq->qdisc);
3366
3367         trace_net_dev_queue(skb);
3368         if (q->enqueue) {
3369                 rc = __dev_xmit_skb(skb, q, dev, txq);
3370                 goto out;
3371         }
3372
3373         /* The device has no queue. Common case for software devices:
3374            loopback, all sorts of tunnels...
3375
3376            It is unlikely that netif_tx_lock protection is really necessary
3377            here (e.g. loopback and IP tunnels are safe, statistics counters
3378            aside).
3379            However, it is possible that they rely on the protection
3380            we provide here.
3381
3382            So check for that and take the lock; it cannot deadlock.
3383            (Alternatively, shooting the noqueue qdisc entirely would be even simpler. 8))
3384          */
3385         if (dev->flags & IFF_UP) {
3386                 int cpu = smp_processor_id(); /* ok because BHs are off */
3387
3388                 if (txq->xmit_lock_owner != cpu) {
3389                         if (unlikely(__this_cpu_read(xmit_recursion) >
3390                                      XMIT_RECURSION_LIMIT))
3391                                 goto recursion_alert;
3392
3393                         skb = validate_xmit_skb(skb, dev);
3394                         if (!skb)
3395                                 goto out;
3396
3397                         HARD_TX_LOCK(dev, txq, cpu);
3398
3399                         if (!netif_xmit_stopped(txq)) {
3400                                 __this_cpu_inc(xmit_recursion);
3401                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3402                                 __this_cpu_dec(xmit_recursion);
3403                                 if (dev_xmit_complete(rc)) {
3404                                         HARD_TX_UNLOCK(dev, txq);
3405                                         goto out;
3406                                 }
3407                         }
3408                         HARD_TX_UNLOCK(dev, txq);
3409                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3410                                              dev->name);
3411                 } else {
3412                         /* Recursion has been detected. Unfortunately,
3413                          * this is possible.
3414                          */
3415 recursion_alert:
3416                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3417                                              dev->name);
3418                 }
3419         }
3420
3421         rc = -ENETDOWN;
3422         rcu_read_unlock_bh();
3423
3424         atomic_long_inc(&dev->tx_dropped);
3425         kfree_skb_list(skb);
3426         return rc;
3427 out:
3428         rcu_read_unlock_bh();
3429         return rc;
3430 }
3431
3432 int dev_queue_xmit(struct sk_buff *skb)
3433 {
3434         return __dev_queue_xmit(skb, NULL);
3435 }
3436 EXPORT_SYMBOL(dev_queue_xmit);
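
/* Example (illustrative sketch, not part of this file): a minimal caller of
 * dev_queue_xmit().  The skb is consumed whatever the outcome, so a caller
 * that wants to retry must take its own reference first, as noted in the
 * comment above __dev_queue_xmit().  The helper name is hypothetical and the
 * buffer is assumed to already hold a complete ethernet frame.
 */
static int example_xmit_frame(struct net_device *dev,
			      const void *frame, unsigned int len)
{
	struct sk_buff *skb;

	skb = alloc_skb(len, GFP_ATOMIC);
	if (!skb)
		return -ENOMEM;

	memcpy(skb_put(skb, len), frame, len);
	skb->dev = dev;
	skb_reset_mac_header(skb);
	skb->protocol = eth_hdr(skb)->h_proto;

	/* Consumes the skb; returns a negative errno or a positive
	 * NET_XMIT_* code (NET_XMIT_SUCCESS, NET_XMIT_DROP, NET_XMIT_CN).
	 */
	return dev_queue_xmit(skb);
}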
3437
3438 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3439 {
3440         return __dev_queue_xmit(skb, accel_priv);
3441 }
3442 EXPORT_SYMBOL(dev_queue_xmit_accel);
3443
3444
3445 /*=======================================================================
3446                         Receiver routines
3447   =======================================================================*/
3448
3449 int netdev_max_backlog __read_mostly = 1000;
3450 EXPORT_SYMBOL(netdev_max_backlog);
3451
3452 int netdev_tstamp_prequeue __read_mostly = 1;
3453 int netdev_budget __read_mostly = 300;
3454 int weight_p __read_mostly = 64;            /* old backlog weight */
3455
3456 /* Called with irq disabled */
3457 static inline void ____napi_schedule(struct softnet_data *sd,
3458                                      struct napi_struct *napi)
3459 {
3460         list_add_tail(&napi->poll_list, &sd->poll_list);
3461         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3462 }
3463
3464 #ifdef CONFIG_RPS
3465
3466 /* One global table that all flow-based protocols share. */
3467 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3468 EXPORT_SYMBOL(rps_sock_flow_table);
3469 u32 rps_cpu_mask __read_mostly;
3470 EXPORT_SYMBOL(rps_cpu_mask);
3471
3472 struct static_key rps_needed __read_mostly;
3473 EXPORT_SYMBOL(rps_needed);
3474
3475 static struct rps_dev_flow *
3476 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3477             struct rps_dev_flow *rflow, u16 next_cpu)
3478 {
3479         if (next_cpu < nr_cpu_ids) {
3480 #ifdef CONFIG_RFS_ACCEL
3481                 struct netdev_rx_queue *rxqueue;
3482                 struct rps_dev_flow_table *flow_table;
3483                 struct rps_dev_flow *old_rflow;
3484                 u32 flow_id;
3485                 u16 rxq_index;
3486                 int rc;
3487
3488                 /* Should we steer this flow to a different hardware queue? */
3489                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3490                     !(dev->features & NETIF_F_NTUPLE))
3491                         goto out;
3492                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3493                 if (rxq_index == skb_get_rx_queue(skb))
3494                         goto out;
3495
3496                 rxqueue = dev->_rx + rxq_index;
3497                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3498                 if (!flow_table)
3499                         goto out;
3500                 flow_id = skb_get_hash(skb) & flow_table->mask;
3501                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3502                                                         rxq_index, flow_id);
3503                 if (rc < 0)
3504                         goto out;
3505                 old_rflow = rflow;
3506                 rflow = &flow_table->flows[flow_id];
3507                 rflow->filter = rc;
3508                 if (old_rflow->filter == rflow->filter)
3509                         old_rflow->filter = RPS_NO_FILTER;
3510         out:
3511 #endif
3512                 rflow->last_qtail =
3513                         per_cpu(softnet_data, next_cpu).input_queue_head;
3514         }
3515
3516         rflow->cpu = next_cpu;
3517         return rflow;
3518 }
3519
3520 /*
3521  * get_rps_cpu is called from netif_receive_skb and returns the target
3522  * CPU from the RPS map of the receiving queue for a given skb.
3523  * rcu_read_lock must be held on entry.
3524  */
3525 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3526                        struct rps_dev_flow **rflowp)
3527 {
3528         const struct rps_sock_flow_table *sock_flow_table;
3529         struct netdev_rx_queue *rxqueue = dev->_rx;
3530         struct rps_dev_flow_table *flow_table;
3531         struct rps_map *map;
3532         int cpu = -1;
3533         u32 tcpu;
3534         u32 hash;
3535
3536         if (skb_rx_queue_recorded(skb)) {
3537                 u16 index = skb_get_rx_queue(skb);
3538
3539                 if (unlikely(index >= dev->real_num_rx_queues)) {
3540                         WARN_ONCE(dev->real_num_rx_queues > 1,
3541                                   "%s received packet on queue %u, but number "
3542                                   "of RX queues is %u\n",
3543                                   dev->name, index, dev->real_num_rx_queues);
3544                         goto done;
3545                 }
3546                 rxqueue += index;
3547         }
3548
3549         /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3550
3551         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3552         map = rcu_dereference(rxqueue->rps_map);
3553         if (!flow_table && !map)
3554                 goto done;
3555
3556         skb_reset_network_header(skb);
3557         hash = skb_get_hash(skb);
3558         if (!hash)
3559                 goto done;
3560
3561         sock_flow_table = rcu_dereference(rps_sock_flow_table);
3562         if (flow_table && sock_flow_table) {
3563                 struct rps_dev_flow *rflow;
3564                 u32 next_cpu;
3565                 u32 ident;
3566
3567                 /* First, check the global flow table for a match */
3568                 ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3569                 if ((ident ^ hash) & ~rps_cpu_mask)
3570                         goto try_rps;
3571
3572                 next_cpu = ident & rps_cpu_mask;
3573
3574                 /* OK, now we know there is a match,
3575                  * we can look at the local (per receive queue) flow table
3576                  */
3577                 rflow = &flow_table->flows[hash & flow_table->mask];
3578                 tcpu = rflow->cpu;
3579
3580                 /*
3581                  * If the desired CPU (where last recvmsg was done) is
3582                  * different from current CPU (one in the rx-queue flow
3583                  * table entry), switch if one of the following holds:
3584                  *   - Current CPU is unset (>= nr_cpu_ids).
3585                  *   - Current CPU is offline.
3586                  *   - The current CPU's queue tail has advanced beyond the
3587                  *     last packet that was enqueued using this table entry.
3588                  *     This guarantees that all previous packets for the flow
3589          *     have been dequeued, thus preserving in-order delivery.
3590                  */
3591                 if (unlikely(tcpu != next_cpu) &&
3592                     (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3593                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3594                       rflow->last_qtail)) >= 0)) {
3595                         tcpu = next_cpu;
3596                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3597                 }
3598
3599                 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3600                         *rflowp = rflow;
3601                         cpu = tcpu;
3602                         goto done;
3603                 }
3604         }
3605
3606 try_rps:
3607
3608         if (map) {
3609                 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3610                 if (cpu_online(tcpu)) {
3611                         cpu = tcpu;
3612                         goto done;
3613                 }
3614         }
3615
3616 done:
3617         return cpu;
3618 }
3619
3620 #ifdef CONFIG_RFS_ACCEL
3621
3622 /**
3623  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3624  * @dev: Device on which the filter was set
3625  * @rxq_index: RX queue index
3626  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3627  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3628  *
3629  * Drivers that implement ndo_rx_flow_steer() should periodically call
3630  * this function for each installed filter and remove the filters for
3631  * which it returns %true.
3632  */
3633 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3634                          u32 flow_id, u16 filter_id)
3635 {
3636         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3637         struct rps_dev_flow_table *flow_table;
3638         struct rps_dev_flow *rflow;
3639         bool expire = true;
3640         unsigned int cpu;
3641
3642         rcu_read_lock();
3643         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3644         if (flow_table && flow_id <= flow_table->mask) {
3645                 rflow = &flow_table->flows[flow_id];
3646                 cpu = ACCESS_ONCE(rflow->cpu);
3647                 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3648                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3649                            rflow->last_qtail) <
3650                      (int)(10 * flow_table->mask)))
3651                         expire = false;
3652         }
3653         rcu_read_unlock();
3654         return expire;
3655 }
3656 EXPORT_SYMBOL(rps_may_expire_flow);
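
/* Example (illustrative sketch, not part of this file): a driver that
 * implements ndo_rx_flow_steer() is expected to scan its installed filters
 * periodically and tear down the ones the stack no longer needs.  The
 * per-driver bookkeeping below (the filter table and the hardware removal
 * step) is hypothetical.
 */
struct example_rfs_filter {
	bool	in_use;
	u16	rxq_index;	/* queue the flow was steered to */
	u32	flow_id;	/* flow_id passed to ndo_rx_flow_steer() */
};

static void example_expire_rfs_filters(struct net_device *dev,
					struct example_rfs_filter *tbl,
					unsigned int n)
{
	unsigned int i;

	for (i = 0; i < n; i++) {
		if (!tbl[i].in_use)
			continue;
		/* This driver simply used the table index as the filter id
		 * it returned from ndo_rx_flow_steer().
		 */
		if (rps_may_expire_flow(dev, tbl[i].rxq_index,
					tbl[i].flow_id, i)) {
			/* example_remove_hw_filter(dev, i);  -- hypothetical */
			tbl[i].in_use = false;
		}
	}
}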
3657
3658 #endif /* CONFIG_RFS_ACCEL */
3659
3660 /* Called from hardirq (IPI) context */
3661 static void rps_trigger_softirq(void *data)
3662 {
3663         struct softnet_data *sd = data;
3664
3665         ____napi_schedule(sd, &sd->backlog);
3666         sd->received_rps++;
3667 }
3668
3669 #endif /* CONFIG_RPS */
3670
3671 /*
3672  * Check if this softnet_data structure belongs to another CPU.
3673  * If it does, queue it on our IPI list and return 1.
3674  * If not, return 0.
3675  */
3676 static int rps_ipi_queued(struct softnet_data *sd)
3677 {
3678 #ifdef CONFIG_RPS
3679         struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3680
3681         if (sd != mysd) {
3682                 sd->rps_ipi_next = mysd->rps_ipi_list;
3683                 mysd->rps_ipi_list = sd;
3684
3685                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3686                 return 1;
3687         }
3688 #endif /* CONFIG_RPS */
3689         return 0;
3690 }
3691
3692 #ifdef CONFIG_NET_FLOW_LIMIT
3693 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3694 #endif
3695
3696 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3697 {
3698 #ifdef CONFIG_NET_FLOW_LIMIT
3699         struct sd_flow_limit *fl;
3700         struct softnet_data *sd;
3701         unsigned int old_flow, new_flow;
3702
3703         if (qlen < (netdev_max_backlog >> 1))
3704                 return false;
3705
3706         sd = this_cpu_ptr(&softnet_data);
3707
3708         rcu_read_lock();
3709         fl = rcu_dereference(sd->flow_limit);
3710         if (fl) {
3711                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3712                 old_flow = fl->history[fl->history_head];
3713                 fl->history[fl->history_head] = new_flow;
3714
3715                 fl->history_head++;
3716                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3717
3718                 if (likely(fl->buckets[old_flow]))
3719                         fl->buckets[old_flow]--;
3720
3721                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3722                         fl->count++;
3723                         rcu_read_unlock();
3724                         return true;
3725                 }
3726         }
3727         rcu_read_unlock();
3728 #endif
3729         return false;
3730 }
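
/* Worked example (illustrative, with the default settings): with
 * netdev_max_backlog = 1000, skb_flow_limit() only kicks in once the per-CPU
 * backlog is at least half full (qlen >= 500).  From then on, a flow whose
 * packets make up more than half of the FLOW_LIMIT_HISTORY most recently
 * enqueued packets has its new packets dropped, while packets of other,
 * better-behaved flows are still accepted until the backlog itself is full.
 */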
3731
3732 /*
3733  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3734  * queue (may be a remote CPU queue).
3735  */
3736 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3737                               unsigned int *qtail)
3738 {
3739         struct softnet_data *sd;
3740         unsigned long flags;
3741         unsigned int qlen;
3742
3743         sd = &per_cpu(softnet_data, cpu);
3744
3745         local_irq_save(flags);
3746
3747         rps_lock(sd);
3748         if (!netif_running(skb->dev))
3749                 goto drop;
3750         qlen = skb_queue_len(&sd->input_pkt_queue);
3751         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3752                 if (qlen) {
3753 enqueue:
3754                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3755                         input_queue_tail_incr_save(sd, qtail);
3756                         rps_unlock(sd);
3757                         local_irq_restore(flags);
3758                         return NET_RX_SUCCESS;
3759                 }
3760
3761                 /* Schedule NAPI for the backlog device.
3762                  * We can use a non-atomic operation since we own the queue lock.
3763                  */
3764                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3765                         if (!rps_ipi_queued(sd))
3766                                 ____napi_schedule(sd, &sd->backlog);
3767                 }
3768                 goto enqueue;
3769         }
3770
3771 drop:
3772         sd->dropped++;
3773         rps_unlock(sd);
3774
3775         local_irq_restore(flags);
3776
3777         atomic_long_inc(&skb->dev->rx_dropped);
3778         kfree_skb(skb);
3779         return NET_RX_DROP;
3780 }
3781
3782 static int netif_rx_internal(struct sk_buff *skb)
3783 {
3784         int ret;
3785
3786         net_timestamp_check(netdev_tstamp_prequeue, skb);
3787
3788         trace_netif_rx(skb);
3789 #ifdef CONFIG_RPS
3790         if (static_key_false(&rps_needed)) {
3791                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3792                 int cpu;
3793
3794                 preempt_disable();
3795                 rcu_read_lock();
3796
3797                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3798                 if (cpu < 0)
3799                         cpu = smp_processor_id();
3800
3801                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3802
3803                 rcu_read_unlock();
3804                 preempt_enable();
3805         } else
3806 #endif
3807         {
3808                 unsigned int qtail;
3809                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3810                 put_cpu();
3811         }
3812         return ret;
3813 }
3814
3815 /**
3816  *      netif_rx        -       post buffer to the network code
3817  *      @skb: buffer to post
3818  *
3819  *      This function receives a packet from a device driver and queues it for
3820  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3821  *      may be dropped during processing for congestion control or by the
3822  *      protocol layers.
3823  *
3824  *      return values:
3825  *      NET_RX_SUCCESS  (no congestion)
3826  *      NET_RX_DROP     (packet was dropped)
3827  *
3828  */
3829
3830 int netif_rx(struct sk_buff *skb)
3831 {
3832         trace_netif_rx_entry(skb);
3833
3834         return netif_rx_internal(skb);
3835 }
3836 EXPORT_SYMBOL(netif_rx);
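
/* Example (illustrative sketch, not part of this file): the classical way an
 * interrupt-driven (non-NAPI) driver hands a received frame to the stack.
 * The helper name and the copy out of the RX ring are hypothetical; a caller
 * in process context would use netif_rx_ni() below instead.
 */
static void example_rx_one_frame(struct net_device *dev,
				 const void *data, unsigned int len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb_ip_align(dev, len);
	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}
	memcpy(skb_put(skb, len), data, len);	/* copy out of the RX ring */

	skb->protocol = eth_type_trans(skb, dev);	/* also sets skb->dev */
	dev->stats.rx_packets++;
	dev->stats.rx_bytes += len;

	netif_rx(skb);		/* queue for the upper layers; may be dropped */
}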
3837
3838 int netif_rx_ni(struct sk_buff *skb)
3839 {
3840         int err;
3841
3842         trace_netif_rx_ni_entry(skb);
3843
3844         preempt_disable();
3845         err = netif_rx_internal(skb);
3846         if (local_softirq_pending())
3847                 do_softirq();
3848         preempt_enable();
3849
3850         return err;
3851 }
3852 EXPORT_SYMBOL(netif_rx_ni);
3853
3854 static void net_tx_action(struct softirq_action *h)
3855 {
3856         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3857
3858         if (sd->completion_queue) {
3859                 struct sk_buff *clist;
3860
3861                 local_irq_disable();
3862                 clist = sd->completion_queue;
3863                 sd->completion_queue = NULL;
3864                 local_irq_enable();
3865
3866                 while (clist) {
3867                         struct sk_buff *skb = clist;
3868                         clist = clist->next;
3869
3870                         WARN_ON(atomic_read(&skb->users));
3871                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3872                                 trace_consume_skb(skb);
3873                         else
3874                                 trace_kfree_skb(skb, net_tx_action);
3875
3876                         if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
3877                                 __kfree_skb(skb);
3878                         else
3879                                 __kfree_skb_defer(skb);
3880                 }
3881
3882                 __kfree_skb_flush();
3883         }
3884
3885         if (sd->output_queue) {
3886                 struct Qdisc *head;
3887
3888                 local_irq_disable();
3889                 head = sd->output_queue;
3890                 sd->output_queue = NULL;
3891                 sd->output_queue_tailp = &sd->output_queue;
3892                 local_irq_enable();
3893
3894                 while (head) {
3895                         struct Qdisc *q = head;
3896                         spinlock_t *root_lock;
3897
3898                         head = head->next_sched;
3899
3900                         root_lock = qdisc_lock(q);
3901                         spin_lock(root_lock);
3902                         /* We need to make sure head->next_sched is read
3903                          * before clearing __QDISC_STATE_SCHED
3904                          */
3905                         smp_mb__before_atomic();
3906                         clear_bit(__QDISC_STATE_SCHED, &q->state);
3907                         qdisc_run(q);
3908                         spin_unlock(root_lock);
3909                 }
3910         }
3911 }
3912
3913 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3914     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3915 /* This hook is defined here for ATM LANE */
3916 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3917                              unsigned char *addr) __read_mostly;
3918 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3919 #endif
3920
3921 static inline struct sk_buff *
3922 sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
3923                    struct net_device *orig_dev)
3924 {
3925 #ifdef CONFIG_NET_CLS_ACT
3926         struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
3927         struct tcf_result cl_res;
3928
3929         /* If there's at least one ingress present somewhere (so
3930          * we get here via the enabled static key), remaining devices
3931          * that are not configured with an ingress qdisc will bail
3932          * out here.
3933          */
3934         if (!cl)
3935                 return skb;
3936         if (*pt_prev) {
3937                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3938                 *pt_prev = NULL;
3939         }
3940
3941         qdisc_skb_cb(skb)->pkt_len = skb->len;
3942         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3943         qdisc_bstats_cpu_update(cl->q, skb);
3944
3945         switch (tc_classify(skb, cl, &cl_res, false)) {
3946         case TC_ACT_OK:
3947         case TC_ACT_RECLASSIFY:
3948                 skb->tc_index = TC_H_MIN(cl_res.classid);
3949                 break;
3950         case TC_ACT_SHOT:
3951                 qdisc_qstats_cpu_drop(cl->q);
3952                 kfree_skb(skb);
3953                 return NULL;
3954         case TC_ACT_STOLEN:
3955         case TC_ACT_QUEUED:
3956                 consume_skb(skb);
3957                 return NULL;
3958         case TC_ACT_REDIRECT:
3959                 /* skb_mac_header check was done by cls/act_bpf, so
3960                  * we can safely push the L2 header back before
3961                  * redirecting to another netdev
3962                  */
3963                 __skb_push(skb, skb->mac_len);
3964                 skb_do_redirect(skb);
3965                 return NULL;
3966         default:
3967                 break;
3968         }
3969 #endif /* CONFIG_NET_CLS_ACT */
3970         return skb;
3971 }
3972
3973 /**
3974  *      netdev_rx_handler_register - register receive handler
3975  *      @dev: device to register a handler for
3976  *      @rx_handler: receive handler to register
3977  *      @rx_handler_data: data pointer that is used by rx handler
3978  *
3979  *      Register a receive handler for a device. This handler will then be
3980  *      called from __netif_receive_skb. A negative errno code is returned
3981  *      on a failure.
3982  *
3983  *      The caller must hold the rtnl_mutex.
3984  *
3985  *      For a general description of rx_handler, see enum rx_handler_result.
3986  */
3987 int netdev_rx_handler_register(struct net_device *dev,
3988                                rx_handler_func_t *rx_handler,
3989                                void *rx_handler_data)
3990 {
3991         ASSERT_RTNL();
3992
3993         if (dev->rx_handler)
3994                 return -EBUSY;
3995
3996         /* Note: rx_handler_data must be set before rx_handler */
3997         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3998         rcu_assign_pointer(dev->rx_handler, rx_handler);
3999
4000         return 0;
4001 }
4002 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
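
/* Example (illustrative sketch, not part of this file): how a bridge- or
 * bonding-style upper device attaches an rx_handler to a slave port.  The
 * handler, the private struct and the re-injection policy are hypothetical;
 * the real users are e.g. the bridge and bonding drivers.
 */
struct example_port {
	struct net_device *upper;	/* hypothetical aggregating device */
};

static rx_handler_result_t example_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	struct example_port *port = rcu_dereference(skb->dev->rx_handler_data);

	/* Hypothetical policy: steal the frame and re-inject it on the
	 * upper device; returning RX_HANDLER_ANOTHER makes
	 * __netif_receive_skb_core() start another round on skb->dev.
	 */
	skb->dev = port->upper;
	return RX_HANDLER_ANOTHER;
}

static int example_attach_port(struct net_device *slave,
			       struct example_port *port)
{
	int err;

	rtnl_lock();	/* netdev_rx_handler_register() asserts RTNL */
	err = netdev_rx_handler_register(slave, example_handle_frame, port);
	rtnl_unlock();
	return err;
}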
4003
4004 /**
4005  *      netdev_rx_handler_unregister - unregister receive handler
4006  *      @dev: device to unregister a handler from
4007  *
4008  *      Unregister a receive handler from a device.
4009  *
4010  *      The caller must hold the rtnl_mutex.
4011  */
4012 void netdev_rx_handler_unregister(struct net_device *dev)
4013 {
4014
4015         ASSERT_RTNL();
4016         RCU_INIT_POINTER(dev->rx_handler, NULL);
4017         /* A reader seeing a non-NULL rx_handler in an rcu_read_lock()
4018          * section is guaranteed to see a non-NULL rx_handler_data
4019          * as well.
4020          */
4021         synchronize_net();
4022         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
4023 }
4024 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
4025
4026 /*
4027  * Limit the use of PFMEMALLOC reserves to those protocols that implement
4028  * the special handling of PFMEMALLOC skbs.
4029  */
4030 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
4031 {
4032         switch (skb->protocol) {
4033         case htons(ETH_P_ARP):
4034         case htons(ETH_P_IP):
4035         case htons(ETH_P_IPV6):
4036         case htons(ETH_P_8021Q):
4037         case htons(ETH_P_8021AD):
4038                 return true;
4039         default:
4040                 return false;
4041         }
4042 }
4043
4044 static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
4045                              int *ret, struct net_device *orig_dev)
4046 {
4047 #ifdef CONFIG_NETFILTER_INGRESS
4048         if (nf_hook_ingress_active(skb)) {
4049                 if (*pt_prev) {
4050                         *ret = deliver_skb(skb, *pt_prev, orig_dev);
4051                         *pt_prev = NULL;
4052                 }
4053
4054                 return nf_hook_ingress(skb);
4055         }
4056 #endif /* CONFIG_NETFILTER_INGRESS */
4057         return 0;
4058 }
4059
4060 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
4061 {
4062         struct packet_type *ptype, *pt_prev;
4063         rx_handler_func_t *rx_handler;
4064         struct net_device *orig_dev;
4065         bool deliver_exact = false;
4066         int ret = NET_RX_DROP;
4067         __be16 type;
4068
4069         net_timestamp_check(!netdev_tstamp_prequeue, skb);
4070
4071         trace_netif_receive_skb(skb);
4072
4073         orig_dev = skb->dev;
4074
4075         skb_reset_network_header(skb);
4076         if (!skb_transport_header_was_set(skb))
4077                 skb_reset_transport_header(skb);
4078         skb_reset_mac_len(skb);
4079
4080         pt_prev = NULL;
4081
4082 another_round:
4083         skb->skb_iif = skb->dev->ifindex;
4084
4085         __this_cpu_inc(softnet_data.processed);
4086
4087         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
4088             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
4089                 skb = skb_vlan_untag(skb);
4090                 if (unlikely(!skb))
4091                         goto out;
4092         }
4093
4094 #ifdef CONFIG_NET_CLS_ACT
4095         if (skb->tc_verd & TC_NCLS) {
4096                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
4097                 goto ncls;
4098         }
4099 #endif
4100
4101         if (pfmemalloc)
4102                 goto skip_taps;
4103
4104         list_for_each_entry_rcu(ptype, &ptype_all, list) {
4105                 if (pt_prev)
4106                         ret = deliver_skb(skb, pt_prev, orig_dev);
4107                 pt_prev = ptype;
4108         }
4109
4110         list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
4111                 if (pt_prev)
4112                         ret = deliver_skb(skb, pt_prev, orig_dev);
4113                 pt_prev = ptype;
4114         }
4115
4116 skip_taps:
4117 #ifdef CONFIG_NET_INGRESS
4118         if (static_key_false(&ingress_needed)) {
4119                 skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
4120                 if (!skb)
4121                         goto out;
4122
4123                 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
4124                         goto out;
4125         }
4126 #endif
4127 #ifdef CONFIG_NET_CLS_ACT
4128         skb->tc_verd = 0;
4129 ncls:
4130 #endif
4131         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
4132                 goto drop;
4133
4134         if (skb_vlan_tag_present(skb)) {
4135                 if (pt_prev) {
4136                         ret = deliver_skb(skb, pt_prev, orig_dev);
4137                         pt_prev = NULL;
4138                 }
4139                 if (vlan_do_receive(&skb))
4140                         goto another_round;
4141                 else if (unlikely(!skb))
4142                         goto out;
4143         }
4144
4145         rx_handler = rcu_dereference(skb->dev->rx_handler);
4146         if (rx_handler) {
4147                 if (pt_prev) {
4148                         ret = deliver_skb(skb, pt_prev, orig_dev);
4149                         pt_prev = NULL;
4150                 }
4151                 switch (rx_handler(&skb)) {
4152                 case RX_HANDLER_CONSUMED:
4153                         ret = NET_RX_SUCCESS;
4154                         goto out;
4155                 case RX_HANDLER_ANOTHER:
4156                         goto another_round;
4157                 case RX_HANDLER_EXACT:
4158                         deliver_exact = true;
4159                 case RX_HANDLER_PASS:
4160                         break;
4161                 default:
4162                         BUG();
4163                 }
4164         }
4165
4166         if (unlikely(skb_vlan_tag_present(skb))) {
4167                 if (skb_vlan_tag_get_id(skb))
4168                         skb->pkt_type = PACKET_OTHERHOST;
4169                 /* Note: we might in the future use the prio bits
4170                  * and set skb->priority as in vlan_do_receive().
4171                  * For the time being, just ignore the Priority Code Point.
4172                  */
4173                 skb->vlan_tci = 0;
4174         }
4175
4176         type = skb->protocol;
4177
4178         /* deliver only exact match when indicated */
4179         if (likely(!deliver_exact)) {
4180                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4181                                        &ptype_base[ntohs(type) &
4182                                                    PTYPE_HASH_MASK]);
4183         }
4184
4185         deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4186                                &orig_dev->ptype_specific);
4187
4188         if (unlikely(skb->dev != orig_dev)) {
4189                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4190                                        &skb->dev->ptype_specific);
4191         }
4192
4193         if (pt_prev) {
4194                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
4195                         goto drop;
4196                 else
4197                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4198         } else {
4199 drop:
4200                 if (!deliver_exact)
4201                         atomic_long_inc(&skb->dev->rx_dropped);
4202                 else
4203                         atomic_long_inc(&skb->dev->rx_nohandler);
4204                 kfree_skb(skb);
4205                 /* Jamal, now you will not be able to escape explaining
4206                  * to me how you were going to use this. :-)
4207                  */
4208                 ret = NET_RX_DROP;
4209         }
4210
4211 out:
4212         return ret;
4213 }
4214
4215 static int __netif_receive_skb(struct sk_buff *skb)
4216 {
4217         int ret;
4218
4219         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4220                 unsigned long pflags = current->flags;
4221
4222                 /*
4223                  * PFMEMALLOC skbs are special, they should
4224                  * - be delivered to SOCK_MEMALLOC sockets only
4225                  * - stay away from userspace
4226                  * - have bounded memory usage
4227                  *
4228                  * Use PF_MEMALLOC as this saves us from propagating the allocation
4229                  * context down to all allocation sites.
4230                  */
4231                 current->flags |= PF_MEMALLOC;
4232                 ret = __netif_receive_skb_core(skb, true);
4233                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
4234         } else
4235                 ret = __netif_receive_skb_core(skb, false);
4236
4237         return ret;
4238 }
4239
4240 static int netif_receive_skb_internal(struct sk_buff *skb)
4241 {
4242         int ret;
4243
4244         net_timestamp_check(netdev_tstamp_prequeue, skb);
4245
4246         if (skb_defer_rx_timestamp(skb))
4247                 return NET_RX_SUCCESS;
4248
4249         rcu_read_lock();
4250
4251 #ifdef CONFIG_RPS
4252         if (static_key_false(&rps_needed)) {
4253                 struct rps_dev_flow voidflow, *rflow = &voidflow;
4254                 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4255
4256                 if (cpu >= 0) {
4257                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4258                         rcu_read_unlock();
4259                         return ret;
4260                 }
4261         }
4262 #endif
4263         ret = __netif_receive_skb(skb);
4264         rcu_read_unlock();
4265         return ret;
4266 }
4267
4268 /**
4269  *      netif_receive_skb - process receive buffer from network
4270  *      @skb: buffer to process
4271  *
4272  *      netif_receive_skb() is the main receive data processing function.
4273  *      It always succeeds. The buffer may be dropped during processing
4274  *      for congestion control or by the protocol layers.
4275  *
4276  *      This function may only be called from softirq context and interrupts
4277  *      should be enabled.
4278  *
4279  *      Return values (usually ignored):
4280  *      NET_RX_SUCCESS: no congestion
4281  *      NET_RX_DROP: packet was dropped
4282  */
4283 int netif_receive_skb(struct sk_buff *skb)
4284 {
4285         trace_netif_receive_skb_entry(skb);
4286
4287         return netif_receive_skb_internal(skb);
4288 }
4289 EXPORT_SYMBOL(netif_receive_skb);
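
/* Example (illustrative sketch, not part of this file): a NAPI driver that
 * does not use GRO delivers completed frames from its poll routine with
 * netif_receive_skb().  example_fetch_rx() is a hypothetical helper standing
 * in for the driver's RX-ring handling; it returns the next completed skb or
 * NULL when the ring is empty.
 */
static struct sk_buff *example_fetch_rx(struct net_device *dev);

static int example_poll(struct napi_struct *napi, int budget)
{
	struct sk_buff *skb;
	int work = 0;

	while (work < budget && (skb = example_fetch_rx(napi->dev)) != NULL) {
		skb->protocol = eth_type_trans(skb, napi->dev);
		netif_receive_skb(skb);	/* softirq context, IRQs enabled */
		work++;
	}
	if (work < budget)
		napi_complete(napi);	/* driver re-enables its RX irq after this */
	return work;
}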
4290
4291 /* Network device is going away, flush any packets still pending
4292  * Called with irqs disabled.
4293  */
4294 static void flush_backlog(void *arg)
4295 {
4296         struct net_device *dev = arg;
4297         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4298         struct sk_buff *skb, *tmp;
4299
4300         rps_lock(sd);
4301         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4302                 if (skb->dev == dev) {
4303                         __skb_unlink(skb, &sd->input_pkt_queue);
4304                         kfree_skb(skb);
4305                         input_queue_head_incr(sd);
4306                 }
4307         }
4308         rps_unlock(sd);
4309
4310         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4311                 if (skb->dev == dev) {
4312                         __skb_unlink(skb, &sd->process_queue);
4313                         kfree_skb(skb);
4314                         input_queue_head_incr(sd);
4315                 }
4316         }
4317 }
4318
4319 static int napi_gro_complete(struct sk_buff *skb)
4320 {
4321         struct packet_offload *ptype;
4322         __be16 type = skb->protocol;
4323         struct list_head *head = &offload_base;
4324         int err = -ENOENT;
4325
4326         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4327
4328         if (NAPI_GRO_CB(skb)->count == 1) {
4329                 skb_shinfo(skb)->gso_size = 0;
4330                 goto out;
4331         }
4332
4333         rcu_read_lock();
4334         list_for_each_entry_rcu(ptype, head, list) {
4335                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4336                         continue;
4337
4338                 err = ptype->callbacks.gro_complete(skb, 0);
4339                 break;
4340         }
4341         rcu_read_unlock();
4342
4343         if (err) {
4344                 WARN_ON(&ptype->list == head);
4345                 kfree_skb(skb);
4346                 return NET_RX_SUCCESS;
4347         }
4348
4349 out:
4350         return netif_receive_skb_internal(skb);
4351 }
4352
4353 /* napi->gro_list contains packets ordered by age, with the
4354  * youngest packets at the head of the list.
4355  * Complete skbs in reverse order to reduce latencies.
4356  */
4357 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4358 {
4359         struct sk_buff *skb, *prev = NULL;
4360
4361         /* scan list and build reverse chain */
4362         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4363                 skb->prev = prev;
4364                 prev = skb;
4365         }
4366
4367         for (skb = prev; skb; skb = prev) {
4368                 skb->next = NULL;
4369
4370                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4371                         return;
4372
4373                 prev = skb->prev;
4374                 napi_gro_complete(skb);
4375                 napi->gro_count--;
4376         }
4377
4378         napi->gro_list = NULL;
4379 }
4380 EXPORT_SYMBOL(napi_gro_flush);
4381
4382 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4383 {
4384         struct sk_buff *p;
4385         unsigned int maclen = skb->dev->hard_header_len;
4386         u32 hash = skb_get_hash_raw(skb);
4387
4388         for (p = napi->gro_list; p; p = p->next) {
4389                 unsigned long diffs;
4390
4391                 NAPI_GRO_CB(p)->flush = 0;
4392
4393                 if (hash != skb_get_hash_raw(p)) {
4394                         NAPI_GRO_CB(p)->same_flow = 0;
4395                         continue;
4396                 }
4397
4398                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4399                 diffs |= p->vlan_tci ^ skb->vlan_tci;
4400                 diffs |= skb_metadata_dst_cmp(p, skb);
4401                 if (maclen == ETH_HLEN)
4402                         diffs |= compare_ether_header(skb_mac_header(p),
4403                                                       skb_mac_header(skb));
4404                 else if (!diffs)
4405                         diffs = memcmp(skb_mac_header(p),
4406                                        skb_mac_header(skb),
4407                                        maclen);
4408                 NAPI_GRO_CB(p)->same_flow = !diffs;
4409         }
4410 }
4411
4412 static void skb_gro_reset_offset(struct sk_buff *skb)
4413 {
4414         const struct skb_shared_info *pinfo = skb_shinfo(skb);
4415         const skb_frag_t *frag0 = &pinfo->frags[0];
4416
4417         NAPI_GRO_CB(skb)->data_offset = 0;
4418         NAPI_GRO_CB(skb)->frag0 = NULL;
4419         NAPI_GRO_CB(skb)->frag0_len = 0;
4420
4421         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4422             pinfo->nr_frags &&
4423             !PageHighMem(skb_frag_page(frag0))) {
4424                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4425                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
4426         }
4427 }
4428
4429 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4430 {
4431         struct skb_shared_info *pinfo = skb_shinfo(skb);
4432
4433         BUG_ON(skb->end - skb->tail < grow);
4434
4435         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4436
4437         skb->data_len -= grow;
4438         skb->tail += grow;
4439
4440         pinfo->frags[0].page_offset += grow;
4441         skb_frag_size_sub(&pinfo->frags[0], grow);
4442
4443         if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4444                 skb_frag_unref(skb, 0);
4445                 memmove(pinfo->frags, pinfo->frags + 1,
4446                         --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4447         }
4448 }
4449
4450 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4451 {
4452         struct sk_buff **pp = NULL;
4453         struct packet_offload *ptype;
4454         __be16 type = skb->protocol;
4455         struct list_head *head = &offload_base;
4456         int same_flow;
4457         enum gro_result ret;
4458         int grow;
4459
4460         if (!(skb->dev->features & NETIF_F_GRO))
4461                 goto normal;
4462
4463         if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4464                 goto normal;
4465
4466         gro_list_prepare(napi, skb);
4467
4468         rcu_read_lock();
4469         list_for_each_entry_rcu(ptype, head, list) {
4470                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4471                         continue;
4472
4473                 skb_set_network_header(skb, skb_gro_offset(skb));
4474                 skb_reset_mac_len(skb);
4475                 NAPI_GRO_CB(skb)->same_flow = 0;
4476                 NAPI_GRO_CB(skb)->flush = 0;
4477                 NAPI_GRO_CB(skb)->free = 0;
4478                 NAPI_GRO_CB(skb)->encap_mark = 0;
4479                 NAPI_GRO_CB(skb)->is_fou = 0;
4480                 NAPI_GRO_CB(skb)->is_atomic = 1;
4481                 NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4482
4483                 /* Setup for GRO checksum validation */
4484                 switch (skb->ip_summed) {
4485                 case CHECKSUM_COMPLETE:
4486                         NAPI_GRO_CB(skb)->csum = skb->csum;
4487                         NAPI_GRO_CB(skb)->csum_valid = 1;
4488                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4489                         break;
4490                 case CHECKSUM_UNNECESSARY:
4491                         NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4492                         NAPI_GRO_CB(skb)->csum_valid = 0;
4493                         break;
4494                 default:
4495                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4496                         NAPI_GRO_CB(skb)->csum_valid = 0;
4497                 }
4498
4499                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4500                 break;
4501         }
4502         rcu_read_unlock();
4503
4504         if (&ptype->list == head)
4505                 goto normal;
4506
4507         same_flow = NAPI_GRO_CB(skb)->same_flow;
4508         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4509
4510         if (pp) {
4511                 struct sk_buff *nskb = *pp;
4512
4513                 *pp = nskb->next;
4514                 nskb->next = NULL;
4515                 napi_gro_complete(nskb);
4516                 napi->gro_count--;
4517         }
4518
4519         if (same_flow)
4520                 goto ok;
4521
4522         if (NAPI_GRO_CB(skb)->flush)
4523                 goto normal;
4524
4525         if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4526                 struct sk_buff *nskb = napi->gro_list;
4527
4528                 /* locate the end of the list to select the 'oldest' flow */
4529                 while (nskb->next) {
4530                         pp = &nskb->next;
4531                         nskb = *pp;
4532                 }
4533                 *pp = NULL;
4534                 nskb->next = NULL;
4535                 napi_gro_complete(nskb);
4536         } else {
4537                 napi->gro_count++;
4538         }
4539         NAPI_GRO_CB(skb)->count = 1;
4540         NAPI_GRO_CB(skb)->age = jiffies;
4541         NAPI_GRO_CB(skb)->last = skb;
4542         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4543         skb->next = napi->gro_list;
4544         napi->gro_list = skb;
4545         ret = GRO_HELD;
4546
4547 pull:
4548         grow = skb_gro_offset(skb) - skb_headlen(skb);
4549         if (grow > 0)
4550                 gro_pull_from_frag0(skb, grow);
4551 ok:
4552         return ret;
4553
4554 normal:
4555         ret = GRO_NORMAL;
4556         goto pull;
4557 }
4558
4559 struct packet_offload *gro_find_receive_by_type(__be16 type)
4560 {
4561         struct list_head *offload_head = &offload_base;
4562         struct packet_offload *ptype;
4563
4564         list_for_each_entry_rcu(ptype, offload_head, list) {
4565                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4566                         continue;
4567                 return ptype;
4568         }
4569         return NULL;
4570 }
4571 EXPORT_SYMBOL(gro_find_receive_by_type);
4572
4573 struct packet_offload *gro_find_complete_by_type(__be16 type)
4574 {
4575         struct list_head *offload_head = &offload_base;
4576         struct packet_offload *ptype;
4577
4578         list_for_each_entry_rcu(ptype, offload_head, list) {
4579                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4580                         continue;
4581                 return ptype;
4582         }
4583         return NULL;
4584 }
4585 EXPORT_SYMBOL(gro_find_complete_by_type);
4586
4587 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4588 {
4589         switch (ret) {
4590         case GRO_NORMAL:
4591                 if (netif_receive_skb_internal(skb))
4592                         ret = GRO_DROP;
4593                 break;
4594
4595         case GRO_DROP:
4596                 kfree_skb(skb);
4597                 break;
4598
4599         case GRO_MERGED_FREE:
4600                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
4601                         skb_dst_drop(skb);
4602                         kmem_cache_free(skbuff_head_cache, skb);
4603                 } else {
4604                         __kfree_skb(skb);
4605                 }
4606                 break;
4607
4608         case GRO_HELD:
4609         case GRO_MERGED:
4610                 break;
4611         }
4612
4613         return ret;
4614 }
4615
4616 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4617 {
4618         skb_mark_napi_id(skb, napi);
4619         trace_napi_gro_receive_entry(skb);
4620
4621         skb_gro_reset_offset(skb);
4622
4623         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4624 }
4625 EXPORT_SYMBOL(napi_gro_receive);
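/*
 * Editorial note (not in the original source): a driver's NAPI poll routine
 * normally feeds every completed receive buffer through napi_gro_receive()
 * instead of netif_receive_skb(), so consecutive segments of the same flow
 * can be merged. Sketch with hypothetical driver helpers:
 *
 *	static int mydrv_poll(struct napi_struct *napi, int budget)
 *	{
 *		int work = 0;
 *		struct sk_buff *skb;
 *
 *		while (work < budget && (skb = mydrv_next_rx_skb(napi))) {
 *			skb->protocol = eth_type_trans(skb, napi->dev);
 *			napi_gro_receive(napi, skb);
 *			work++;
 *		}
 *		return work;
 *	}
 */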
4626
4627 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4628 {
4629         if (unlikely(skb->pfmemalloc)) {
4630                 consume_skb(skb);
4631                 return;
4632         }
4633         __skb_pull(skb, skb_headlen(skb));
4634         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4635         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4636         skb->vlan_tci = 0;
4637         skb->dev = napi->dev;
4638         skb->skb_iif = 0;
4639         skb->encapsulation = 0;
4640         skb_shinfo(skb)->gso_type = 0;
4641         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4642
4643         napi->skb = skb;
4644 }
4645
4646 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4647 {
4648         struct sk_buff *skb = napi->skb;
4649
4650         if (!skb) {
4651                 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4652                 if (skb) {
4653                         napi->skb = skb;
4654                         skb_mark_napi_id(skb, napi);
4655                 }
4656         }
4657         return skb;
4658 }
4659 EXPORT_SYMBOL(napi_get_frags);
4660
4661 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4662                                       struct sk_buff *skb,
4663                                       gro_result_t ret)
4664 {
4665         switch (ret) {
4666         case GRO_NORMAL:
4667         case GRO_HELD:
4668                 __skb_push(skb, ETH_HLEN);
4669                 skb->protocol = eth_type_trans(skb, skb->dev);
4670                 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4671                         ret = GRO_DROP;
4672                 break;
4673
4674         case GRO_DROP:
4675         case GRO_MERGED_FREE:
4676                 napi_reuse_skb(napi, skb);
4677                 break;
4678
4679         case GRO_MERGED:
4680                 break;
4681         }
4682
4683         return ret;
4684 }
4685
4686 /* The upper GRO stack assumes the network header starts at gro_offset=0.
4687  * Drivers may call both napi_gro_frags() and napi_gro_receive(), so we
4688  * copy the Ethernet header into skb->data to present a common layout.
4689  */
4690 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4691 {
4692         struct sk_buff *skb = napi->skb;
4693         const struct ethhdr *eth;
4694         unsigned int hlen = sizeof(*eth);
4695
4696         napi->skb = NULL;
4697
4698         skb_reset_mac_header(skb);
4699         skb_gro_reset_offset(skb);
4700
4701         eth = skb_gro_header_fast(skb, 0);
4702         if (unlikely(skb_gro_header_hard(skb, hlen))) {
4703                 eth = skb_gro_header_slow(skb, hlen, 0);
4704                 if (unlikely(!eth)) {
4705                         net_warn_ratelimited("%s: dropping impossible skb from %s\n",
4706                                              __func__, napi->dev->name);
4707                         napi_reuse_skb(napi, skb);
4708                         return NULL;
4709                 }
4710         } else {
4711                 gro_pull_from_frag0(skb, hlen);
4712                 NAPI_GRO_CB(skb)->frag0 += hlen;
4713                 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4714         }
4715         __skb_pull(skb, hlen);
4716
4717         /*
4718          * This works because the only protocols we care about don't require
4719          * special handling.
4720          * We'll fix it up properly in napi_frags_finish()
4721          */
4722         skb->protocol = eth->h_proto;
4723
4724         return skb;
4725 }
4726
4727 gro_result_t napi_gro_frags(struct napi_struct *napi)
4728 {
4729         struct sk_buff *skb = napi_frags_skb(napi);
4730
4731         if (!skb)
4732                 return GRO_DROP;
4733
4734         trace_napi_gro_frags_entry(skb);
4735
4736         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4737 }
4738 EXPORT_SYMBOL(napi_gro_frags);
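/*
 * Editorial note (not in the original source): drivers that receive into
 * pages can avoid allocating a fresh skb per packet by reusing napi->skb.
 * Rough sketch (page/offset/len/truesize come from a hypothetical RX
 * descriptor); note that eth_type_trans() is not called here, because
 * napi_frags_skb() and napi_frags_finish() handle the Ethernet header:
 *
 *	struct sk_buff *skb = napi_get_frags(napi);
 *
 *	if (!skb)
 *		return;		// drop: out of memory
 *	skb_add_rx_frag(skb, 0, page, offset, len, truesize);
 *	napi_gro_frags(napi);
 */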
4739
4740 /* Compute the checksum from gro_offset and return the folded value
4741  * after adding in any pseudo checksum.
4742  */
4743 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4744 {
4745         __wsum wsum;
4746         __sum16 sum;
4747
4748         wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4749
4750         /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4751         sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4752         if (likely(!sum)) {
4753                 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4754                     !skb->csum_complete_sw)
4755                         netdev_rx_csum_fault(skb->dev);
4756         }
4757
4758         NAPI_GRO_CB(skb)->csum = wsum;
4759         NAPI_GRO_CB(skb)->csum_valid = 1;
4760
4761         return sum;
4762 }
4763 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4764
4765 /*
4766  * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
4767  * Note: called with local irq disabled, but exits with local irq enabled.
4768  */
4769 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4770 {
4771 #ifdef CONFIG_RPS
4772         struct softnet_data *remsd = sd->rps_ipi_list;
4773
4774         if (remsd) {
4775                 sd->rps_ipi_list = NULL;
4776
4777                 local_irq_enable();
4778
4779                 /* Send pending IPIs to kick RPS processing on remote CPUs. */
4780                 while (remsd) {
4781                         struct softnet_data *next = remsd->rps_ipi_next;
4782
4783                         if (cpu_online(remsd->cpu))
4784                                 smp_call_function_single_async(remsd->cpu,
4785                                                            &remsd->csd);
4786                         remsd = next;
4787                 }
4788         } else
4789 #endif
4790                 local_irq_enable();
4791 }
4792
4793 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4794 {
4795 #ifdef CONFIG_RPS
4796         return sd->rps_ipi_list != NULL;
4797 #else
4798         return false;
4799 #endif
4800 }
4801
4802 static int process_backlog(struct napi_struct *napi, int quota)
4803 {
4804         int work = 0;
4805         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4806
4807         /* Check if we have pending IPIs; it's better to send them now
4808          * than to wait for net_rx_action() to end.
4809          */
4810         if (sd_has_rps_ipi_waiting(sd)) {
4811                 local_irq_disable();
4812                 net_rps_action_and_irq_enable(sd);
4813         }
4814
4815         napi->weight = weight_p;
4816         local_irq_disable();
4817         while (1) {
4818                 struct sk_buff *skb;
4819
4820                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4821                         rcu_read_lock();
4822                         local_irq_enable();
4823                         __netif_receive_skb(skb);
4824                         rcu_read_unlock();
4825                         local_irq_disable();
4826                         input_queue_head_incr(sd);
4827                         if (++work >= quota) {
4828                                 local_irq_enable();
4829                                 return work;
4830                         }
4831                 }
4832
4833                 rps_lock(sd);
4834                 if (skb_queue_empty(&sd->input_pkt_queue)) {
4835                         /*
4836                          * Inline a custom version of __napi_complete().
4837                          * Only the current CPU owns and manipulates this NAPI,
4838                          * and NAPI_STATE_SCHED is the only possible flag set
4839                          * on the backlog.
4840                          * We can use a plain write instead of clear_bit(),
4841                          * and we don't need an smp_mb() memory barrier.
4842                          */
4843                         napi->state = 0;
4844                         rps_unlock(sd);
4845
4846                         break;
4847                 }
4848
4849                 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4850                                            &sd->process_queue);
4851                 rps_unlock(sd);
4852         }
4853         local_irq_enable();
4854
4855         return work;
4856 }
4857
4858 /**
4859  * __napi_schedule - schedule for receive
4860  * @n: entry to schedule
4861  *
4862  * The entry's receive function will be scheduled to run.
4863  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4864  */
4865 void __napi_schedule(struct napi_struct *n)
4866 {
4867         unsigned long flags;
4868
4869         local_irq_save(flags);
4870         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4871         local_irq_restore(flags);
4872 }
4873 EXPORT_SYMBOL(__napi_schedule);
4874
4875 /**
4876  * __napi_schedule_irqoff - schedule for receive
4877  * @n: entry to schedule
4878  *
4879  * Variant of __napi_schedule() assuming hard irqs are masked
4880  */
4881 void __napi_schedule_irqoff(struct napi_struct *n)
4882 {
4883         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4884 }
4885 EXPORT_SYMBOL(__napi_schedule_irqoff);
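/*
 * Editorial note (not in the original source): drivers normally call the
 * napi_schedule()/napi_schedule_irqoff() wrappers, which only invoke
 * __napi_schedule*() after napi_schedule_prep() wins NAPI_STATE_SCHED.
 * Typical interrupt handler sketch (device-specific parts hypothetical):
 *
 *	static irqreturn_t mydrv_irq(int irq, void *data)
 *	{
 *		struct mydrv_priv *priv = data;
 *
 *		mydrv_mask_rx_irq(priv);		// device specific
 *		napi_schedule_irqoff(&priv->napi);	// hard irqs are off here
 *		return IRQ_HANDLED;
 *	}
 */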
4886
4887 void __napi_complete(struct napi_struct *n)
4888 {
4889         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4890
4891         list_del_init(&n->poll_list);
4892         smp_mb__before_atomic();
4893         clear_bit(NAPI_STATE_SCHED, &n->state);
4894 }
4895 EXPORT_SYMBOL(__napi_complete);
4896
4897 void napi_complete_done(struct napi_struct *n, int work_done)
4898 {
4899         unsigned long flags;
4900
4901         /*
4902          * Don't let NAPI dequeue from the CPU poll list,
4903          * just in case it's running on a different CPU.
4904          */
4905         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4906                 return;
4907
4908         if (n->gro_list) {
4909                 unsigned long timeout = 0;
4910
4911                 if (work_done)
4912                         timeout = n->dev->gro_flush_timeout;
4913
4914                 if (timeout)
4915                         hrtimer_start(&n->timer, ns_to_ktime(timeout),
4916                                       HRTIMER_MODE_REL_PINNED);
4917                 else
4918                         napi_gro_flush(n, false);
4919         }
4920         if (likely(list_empty(&n->poll_list))) {
4921                 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4922         } else {
4923                 /* If n->poll_list is not empty, we need to mask irqs */
4924                 local_irq_save(flags);
4925                 __napi_complete(n);
4926                 local_irq_restore(flags);
4927         }
4928 }
4929 EXPORT_SYMBOL(napi_complete_done);
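/*
 * Editorial note (not in the original source): a poll routine reports how
 * much work it did and completes only when it polled less than its budget,
 * so net_rx_action() keeps rescheduling it while traffic is heavy. Sketch
 * with hypothetical helpers:
 *
 *	static int mydrv_poll(struct napi_struct *napi, int budget)
 *	{
 *		int work = mydrv_clean_rx_ring(napi, budget);
 *
 *		if (work < budget) {
 *			napi_complete_done(napi, work);
 *			mydrv_unmask_rx_irq(napi);	// device specific
 *		}
 *		return work;
 *	}
 */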
4930
4931 /* must be called under rcu_read_lock(), as we don't take a reference */
4932 static struct napi_struct *napi_by_id(unsigned int napi_id)
4933 {
4934         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4935         struct napi_struct *napi;
4936
4937         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4938                 if (napi->napi_id == napi_id)
4939                         return napi;
4940
4941         return NULL;
4942 }
4943
4944 #if defined(CONFIG_NET_RX_BUSY_POLL)
4945 #define BUSY_POLL_BUDGET 8
4946 bool sk_busy_loop(struct sock *sk, int nonblock)
4947 {
4948         unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
4949         int (*busy_poll)(struct napi_struct *dev);
4950         struct napi_struct *napi;
4951         int rc = false;
4952
4953         rcu_read_lock();
4954
4955         napi = napi_by_id(sk->sk_napi_id);
4956         if (!napi)
4957                 goto out;
4958
4959         /* Note: ndo_busy_poll method is optional in linux-4.5 */
4960         busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
4961
4962         do {
4963                 rc = 0;
4964                 local_bh_disable();
4965                 if (busy_poll) {
4966                         rc = busy_poll(napi);
4967                 } else if (napi_schedule_prep(napi)) {
4968                         void *have = netpoll_poll_lock(napi);
4969
4970                         if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
4971                                 rc = napi->poll(napi, BUSY_POLL_BUDGET);
4972                                 trace_napi_poll(napi);
4973                                 if (rc == BUSY_POLL_BUDGET) {
4974                                         napi_complete_done(napi, rc);
4975                                         napi_schedule(napi);
4976                                 }
4977                         }
4978                         netpoll_poll_unlock(have);
4979                 }
4980                 if (rc > 0)
4981                         __NET_ADD_STATS(sock_net(sk),
4982                                         LINUX_MIB_BUSYPOLLRXPACKETS, rc);
4983                 local_bh_enable();
4984
4985                 if (rc == LL_FLUSH_FAILED)
4986                         break; /* permanent failure */
4987
4988                 cpu_relax();
4989         } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) &&
4990                  !need_resched() && !busy_loop_timeout(end_time));
4991
4992         rc = !skb_queue_empty(&sk->sk_receive_queue);
4993 out:
4994         rcu_read_unlock();
4995         return rc;
4996 }
4997 EXPORT_SYMBOL(sk_busy_loop);
4998
4999 #endif /* CONFIG_NET_RX_BUSY_POLL */
5000
5001 void napi_hash_add(struct napi_struct *napi)
5002 {
5003         if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
5004             test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
5005                 return;
5006
5007         spin_lock(&napi_hash_lock);
5008
5009         /* 0..NR_CPUS+1 range is reserved for sender_cpu use */
5010         do {
5011                 if (unlikely(++napi_gen_id < NR_CPUS + 1))
5012                         napi_gen_id = NR_CPUS + 1;
5013         } while (napi_by_id(napi_gen_id));
5014         napi->napi_id = napi_gen_id;
5015
5016         hlist_add_head_rcu(&napi->napi_hash_node,
5017                            &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
5018
5019         spin_unlock(&napi_hash_lock);
5020 }
5021 EXPORT_SYMBOL_GPL(napi_hash_add);
5022
5023 /* Warning: the caller is responsible for making sure an RCU grace period
5024  * has elapsed before freeing the memory containing @napi.
5025  */
5026 bool napi_hash_del(struct napi_struct *napi)
5027 {
5028         bool rcu_sync_needed = false;
5029
5030         spin_lock(&napi_hash_lock);
5031
5032         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
5033                 rcu_sync_needed = true;
5034                 hlist_del_rcu(&napi->napi_hash_node);
5035         }
5036         spin_unlock(&napi_hash_lock);
5037         return rcu_sync_needed;
5038 }
5039 EXPORT_SYMBOL_GPL(napi_hash_del);
5040
5041 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
5042 {
5043         struct napi_struct *napi;
5044
5045         napi = container_of(timer, struct napi_struct, timer);
5046         if (napi->gro_list)
5047                 napi_schedule(napi);
5048
5049         return HRTIMER_NORESTART;
5050 }
5051
5052 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
5053                     int (*poll)(struct napi_struct *, int), int weight)
5054 {
5055         INIT_LIST_HEAD(&napi->poll_list);
5056         hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
5057         napi->timer.function = napi_watchdog;
5058         napi->gro_count = 0;
5059         napi->gro_list = NULL;
5060         napi->skb = NULL;
5061         napi->poll = poll;
5062         if (weight > NAPI_POLL_WEIGHT)
5063                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
5064                             weight, dev->name);
5065         napi->weight = weight;
5066         list_add(&napi->dev_list, &dev->napi_list);
5067         napi->dev = dev;
5068 #ifdef CONFIG_NETPOLL
5069         spin_lock_init(&napi->poll_lock);
5070         napi->poll_owner = -1;
5071 #endif
5072         set_bit(NAPI_STATE_SCHED, &napi->state);
5073         napi_hash_add(napi);
5074 }
5075 EXPORT_SYMBOL(netif_napi_add);
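/*
 * Editorial note (not in the original source): netif_napi_add() is called
 * once at probe/setup time; the NAPI instance starts with NAPI_STATE_SCHED
 * set and must be enabled before use. Sketch with hypothetical names:
 *
 *	netif_napi_add(netdev, &priv->napi, mydrv_poll, NAPI_POLL_WEIGHT);
 *	...
 *	napi_enable(&priv->napi);	// typically from ndo_open()
 */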
5076
5077 void napi_disable(struct napi_struct *n)
5078 {
5079         might_sleep();
5080         set_bit(NAPI_STATE_DISABLE, &n->state);
5081
5082         while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
5083                 msleep(1);
5084         while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
5085                 msleep(1);
5086
5087         hrtimer_cancel(&n->timer);
5088
5089         clear_bit(NAPI_STATE_DISABLE, &n->state);
5090 }
5091 EXPORT_SYMBOL(napi_disable);
5092
5093 /* Must be called in process context */
5094 void netif_napi_del(struct napi_struct *napi)
5095 {
5096         might_sleep();
5097         if (napi_hash_del(napi))
5098                 synchronize_net();
5099         list_del_init(&napi->dev_list);
5100         napi_free_frags(napi);
5101
5102         kfree_skb_list(napi->gro_list);
5103         napi->gro_list = NULL;
5104         napi->gro_count = 0;
5105 }
5106 EXPORT_SYMBOL(netif_napi_del);
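/*
 * Editorial note (not in the original source): teardown is the mirror image
 * of the setup above. Sketch with hypothetical names:
 *
 *	napi_disable(&priv->napi);	// typically from ndo_stop()
 *	...
 *	netif_napi_del(&priv->napi);	// from the remove/free path
 */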
5107
5108 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
5109 {
5110         void *have;
5111         int work, weight;
5112
5113         list_del_init(&n->poll_list);
5114
5115         have = netpoll_poll_lock(n);
5116
5117         weight = n->weight;
5118
5119         /* This NAPI_STATE_SCHED test is for avoiding a race
5120          * with netpoll's poll_napi().  Only the entity which
5121          * obtains the lock and sees NAPI_STATE_SCHED set will
5122          * actually make the ->poll() call.  Therefore we avoid
5123          * accidentally calling ->poll() when NAPI is not scheduled.
5124          */
5125         work = 0;
5126         if (test_bit(NAPI_STATE_SCHED, &n->state)) {
5127                 work = n->poll(n, weight);
5128                 trace_napi_poll(n);
5129         }
5130
5131         WARN_ON_ONCE(work > weight);
5132
5133         if (likely(work < weight))
5134                 goto out_unlock;
5135
5136         /* Drivers must not modify the NAPI state if they
5137          * consume the entire weight.  In such cases this code
5138          * still "owns" the NAPI instance and therefore can
5139          * move the instance around on the list at-will.
5140          */
5141         if (unlikely(napi_disable_pending(n))) {
5142                 napi_complete(n);
5143                 goto out_unlock;
5144         }
5145
5146         if (n->gro_list) {
5147                 /* Flush packets that are too old.
5148                  * If HZ < 1000, flush all packets.
5149                  */
5150                 napi_gro_flush(n, HZ >= 1000);
5151         }
5152
5153         /* Some drivers may have called napi_schedule
5154          * prior to exhausting their budget.
5155          */
5156         if (unlikely(!list_empty(&n->poll_list))) {
5157                 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
5158                              n->dev ? n->dev->name : "backlog");
5159                 goto out_unlock;
5160         }
5161
5162         list_add_tail(&n->poll_list, repoll);
5163
5164 out_unlock:
5165         netpoll_poll_unlock(have);
5166
5167         return work;
5168 }
5169
5170 static void net_rx_action(struct softirq_action *h)
5171 {
5172         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5173         unsigned long time_limit = jiffies + 2;
5174         int budget = netdev_budget;
5175         LIST_HEAD(list);
5176         LIST_HEAD(repoll);
5177
5178         local_irq_disable();
5179         list_splice_init(&sd->poll_list, &list);
5180         local_irq_enable();
5181
5182         for (;;) {
5183                 struct napi_struct *n;
5184
5185                 if (list_empty(&list)) {
5186                         if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
5187                                 return;
5188                         break;
5189                 }
5190
5191                 n = list_first_entry(&list, struct napi_struct, poll_list);
5192                 budget -= napi_poll(n, &repoll);
5193
5194                 /* If the softirq window is exhausted then punt.
5195                  * Allow this to run for 2 jiffies, which allows
5196                  * an average latency of 1.5/HZ.
5197                  */
5198                 if (unlikely(budget <= 0 ||
5199                              time_after_eq(jiffies, time_limit))) {
5200                         sd->time_squeeze++;
5201                         break;
5202                 }
5203         }
5204
5205         __kfree_skb_flush();
5206         local_irq_disable();
5207
5208         list_splice_tail_init(&sd->poll_list, &list);
5209         list_splice_tail(&repoll, &list);
5210         list_splice(&list, &sd->poll_list);
5211         if (!list_empty(&sd->poll_list))
5212                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
5213
5214         net_rps_action_and_irq_enable(sd);
5215 }
5216
5217 struct netdev_adjacent {
5218         struct net_device *dev;
5219
5220         /* upper master flag; there can only be one master device per list */
5221         bool master;
5222
5223         /* counter for the number of times this device was added to us */
5224         u16 ref_nr;
5225
5226         /* private field for the users */
5227         void *private;
5228
5229         struct list_head list;
5230         struct rcu_head rcu;
5231 };
5232
5233 static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5234                                                  struct list_head *adj_list)
5235 {
5236         struct netdev_adjacent *adj;
5237
5238         list_for_each_entry(adj, adj_list, list) {
5239                 if (adj->dev == adj_dev)
5240                         return adj;
5241         }
5242         return NULL;
5243 }
5244
5245 /**
5246  * netdev_has_upper_dev - Check if device is linked to an upper device
5247  * @dev: device
5248  * @upper_dev: upper device to check
5249  *
5250  * Find out if a device is linked to the specified upper device and return
5251  * true if it is. Note that this checks only the immediate upper device,
5252  * not the complete stack of devices. The caller must hold the RTNL lock.
5253  */
5254 bool netdev_has_upper_dev(struct net_device *dev,
5255                           struct net_device *upper_dev)
5256 {
5257         ASSERT_RTNL();
5258
5259         return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper);
5260 }
5261 EXPORT_SYMBOL(netdev_has_upper_dev);
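/*
 * Editorial note (not in the original source): a typical use is a stacking
 * driver refusing to enslave a port twice. Sketch with hypothetical names:
 *
 *	ASSERT_RTNL();
 *	if (netdev_has_upper_dev(port_dev, master_dev))
 *		return -EBUSY;	// already linked to this upper device
 */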
5262
5263 /**
5264  * netdev_has_any_upper_dev - Check if device is linked to some device
5265  * @dev: device
5266  *
5267  * Find out if a device is linked to an upper device and return true in case
5268  * it is. The caller must hold the RTNL lock.
5269  */
5270 static bool netdev_has_any_upper_dev(struct net_device *dev)
5271 {
5272         ASSERT_RTNL();
5273
5274         return !list_empty(&dev->all_adj_list.upper);
5275 }
5276
5277 /**
5278  * netdev_master_upper_dev_get - Get master upper device
5279  * @dev: device
5280  *
5281  * Find a master upper device and return pointer to it or NULL in case
5282  * it's not there. The caller must hold the RTNL lock.
5283  */
5284 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5285 {
5286         struct netdev_adjacent *upper;
5287
5288         ASSERT_RTNL();
5289
5290         if (list_empty(&dev->adj_list.upper))
5291                 return NULL;
5292
5293         upper = list_first_entry(&dev->adj_list.upper,
5294                                  struct netdev_adjacent, list);
5295         if (likely(upper->master))
5296                 return upper->dev;
5297         return NULL;
5298 }
5299 EXPORT_SYMBOL(netdev_master_upper_dev_get);
5300
5301 void *netdev_adjacent_get_private(struct list_head *adj_list)
5302 {
5303         struct netdev_adjacent *adj;
5304
5305         adj = list_entry(adj_list, struct netdev_adjacent, list);
5306
5307         return adj->private;
5308 }
5309 EXPORT_SYMBOL(netdev_adjacent_get_private);
5310
5311 /**
5312  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5313  * @dev: device
5314  * @iter: list_head ** of the current position
5315  *
5316  * Gets the next device from the dev's upper list, starting from iter
5317  * position. The caller must hold RCU read lock.
5318  */
5319 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5320                                                  struct list_head **iter)
5321 {
5322         struct netdev_adjacent *upper;
5323
5324         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5325
5326         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5327
5328         if (&upper->list == &dev->adj_list.upper)
5329                 return NULL;
5330
5331         *iter = &upper->list;
5332
5333         return upper->dev;
5334 }
5335 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
5336
5337 /**
5338  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
5339  * @dev: device
5340  * @iter: list_head ** of the current position
5341  *
5342  * Gets the next device from the dev's upper list, starting from iter
5343  * position. The caller must hold RCU read lock.
5344  */
5345 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
5346                                                      struct list_head **iter)
5347 {
5348         struct netdev_adjacent *upper;
5349
5350         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5351
5352         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5353
5354         if (&upper->list == &dev->all_adj_list.upper)
5355                 return NULL;
5356
5357         *iter = &upper->list;
5358
5359         return upper->dev;
5360 }
5361 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
5362
5363 /**
5364  * netdev_lower_get_next_private - Get the next ->private from the
5365  *                                 lower neighbour list
5366  * @dev: device
5367  * @iter: list_head ** of the current position
5368  *
5369  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5370  * list, starting from iter position. The caller must hold either the
5371  * RTNL lock or its own locking that guarantees that the neighbour lower
5372  * list will remain unchanged.
5373  */
5374 void *netdev_lower_get_next_private(struct net_device *dev,
5375                                     struct list_head **iter)
5376 {
5377         struct netdev_adjacent *lower;
5378
5379         lower = list_entry(*iter, struct netdev_adjacent, list);
5380
5381         if (&lower->list == &dev->adj_list.lower)
5382                 return NULL;
5383
5384         *iter = lower->list.next;
5385
5386         return lower->private;
5387 }
5388 EXPORT_SYMBOL(netdev_lower_get_next_private);
5389
5390 /**
5391  * netdev_lower_get_next_private_rcu - Get the next ->private from the
5392  *                                     lower neighbour list, RCU
5393  *                                     variant
5394  * @dev: device
5395  * @iter: list_head ** of the current position
5396  *
5397  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5398  * list, starting from iter position. The caller must hold RCU read lock.
5399  */
5400 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5401                                         struct list_head **iter)
5402 {
5403         struct netdev_adjacent *lower;
5404
5405         WARN_ON_ONCE(!rcu_read_lock_held());
5406
5407         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5408
5409         if (&lower->list == &dev->adj_list.lower)
5410                 return NULL;
5411
5412         *iter = &lower->list;
5413
5414         return lower->private;
5415 }
5416 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5417
5418 /**
5419  * netdev_lower_get_next - Get the next device from the lower neighbour
5420  *                         list
5421  * @dev: device
5422  * @iter: list_head ** of the current position
5423  *
5424  * Gets the next device from the dev's lower neighbour
5425  * list, starting from iter position. The caller must hold the RTNL lock or
5426  * its own locking that guarantees that the neighbour lower
5427  * list will remain unchanged.
5428  */
5429 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5430 {
5431         struct netdev_adjacent *lower;
5432
5433         lower = list_entry(*iter, struct netdev_adjacent, list);
5434
5435         if (&lower->list == &dev->adj_list.lower)
5436                 return NULL;
5437
5438         *iter = lower->list.next;
5439
5440         return lower->dev;
5441 }
5442 EXPORT_SYMBOL(netdev_lower_get_next);
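/*
 * Editorial note (not in the original source): this iterator backs the
 * netdev_for_each_lower_dev() macro, which is the usual way to walk the
 * immediate lower devices (dev_get_nest_level() below is an in-tree user).
 * Sketch:
 *
 *	struct net_device *lower;
 *	struct list_head *iter;
 *
 *	netdev_for_each_lower_dev(dev, lower, iter)
 *		handle_lower_dev(lower);	// hypothetical callback
 */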
5443
5444 /**
5445  * netdev_lower_get_first_private_rcu - Get the first ->private from the
5446  *                                     lower neighbour list, RCU
5447  *                                     variant
5448  * @dev: device
5449  *
5450  * Gets the first netdev_adjacent->private from the dev's lower neighbour
5451  * list. The caller must hold RCU read lock.
5452  */
5453 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5454 {
5455         struct netdev_adjacent *lower;
5456
5457         lower = list_first_or_null_rcu(&dev->adj_list.lower,
5458                         struct netdev_adjacent, list);
5459         if (lower)
5460                 return lower->private;
5461         return NULL;
5462 }
5463 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5464
5465 /**
5466  * netdev_master_upper_dev_get_rcu - Get master upper device
5467  * @dev: device
5468  *
5469  * Find a master upper device and return pointer to it or NULL in case
5470  * it's not there. The caller must hold the RCU read lock.
5471  */
5472 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5473 {
5474         struct netdev_adjacent *upper;
5475
5476         upper = list_first_or_null_rcu(&dev->adj_list.upper,
5477                                        struct netdev_adjacent, list);
5478         if (upper && likely(upper->master))
5479                 return upper->dev;
5480         return NULL;
5481 }
5482 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5483
5484 static int netdev_adjacent_sysfs_add(struct net_device *dev,
5485                               struct net_device *adj_dev,
5486                               struct list_head *dev_list)
5487 {
5488         char linkname[IFNAMSIZ+7];
5489         sprintf(linkname, dev_list == &dev->adj_list.upper ?
5490                 "upper_%s" : "lower_%s", adj_dev->name);
5491         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5492                                  linkname);
5493 }
5494 static void netdev_adjacent_sysfs_del(struct net_device *dev,
5495                                char *name,
5496                                struct list_head *dev_list)
5497 {
5498         char linkname[IFNAMSIZ+7];
5499         sprintf(linkname, dev_list == &dev->adj_list.upper ?
5500                 "upper_%s" : "lower_%s", name);
5501         sysfs_remove_link(&(dev->dev.kobj), linkname);
5502 }
5503
5504 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5505                                                  struct net_device *adj_dev,
5506                                                  struct list_head *dev_list)
5507 {
5508         return (dev_list == &dev->adj_list.upper ||
5509                 dev_list == &dev->adj_list.lower) &&
5510                 net_eq(dev_net(dev), dev_net(adj_dev));
5511 }
5512
5513 static int __netdev_adjacent_dev_insert(struct net_device *dev,
5514                                         struct net_device *adj_dev,
5515                                         struct list_head *dev_list,
5516                                         void *private, bool master)
5517 {
5518         struct netdev_adjacent *adj;
5519         int ret;
5520
5521         adj = __netdev_find_adj(adj_dev, dev_list);
5522
5523         if (adj) {
5524                 adj->ref_nr++;
5525                 return 0;
5526         }
5527
5528         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5529         if (!adj)
5530                 return -ENOMEM;
5531
5532         adj->dev = adj_dev;
5533         adj->master = master;
5534         adj->ref_nr = 1;
5535         adj->private = private;
5536         dev_hold(adj_dev);
5537
5538         pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5539                  adj_dev->name, dev->name, adj_dev->name);
5540
5541         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5542                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5543                 if (ret)
5544                         goto free_adj;
5545         }
5546
5547         /* Ensure that master link is always the first item in list. */
5548         if (master) {
5549                 ret = sysfs_create_link(&(dev->dev.kobj),
5550                                         &(adj_dev->dev.kobj), "master");
5551                 if (ret)
5552                         goto remove_symlinks;
5553
5554                 list_add_rcu(&adj->list, dev_list);
5555         } else {
5556                 list_add_tail_rcu(&adj->list, dev_list);
5557         }
5558
5559         return 0;
5560
5561 remove_symlinks:
5562         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5563                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5564 free_adj:
5565         kfree(adj);
5566         dev_put(adj_dev);
5567
5568         return ret;
5569 }
5570
5571 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5572                                          struct net_device *adj_dev,
5573                                          struct list_head *dev_list)
5574 {
5575         struct netdev_adjacent *adj;
5576
5577         adj = __netdev_find_adj(adj_dev, dev_list);
5578
5579         if (!adj) {
5580                 pr_err("tried to remove device %s from %s\n",
5581                        dev->name, adj_dev->name);
5582                 BUG();
5583         }
5584
5585         if (adj->ref_nr > 1) {
5586                 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
5587                          adj->ref_nr-1);
5588                 adj->ref_nr--;
5589                 return;
5590         }
5591
5592         if (adj->master)
5593                 sysfs_remove_link(&(dev->dev.kobj), "master");
5594
5595         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5596                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5597
5598         list_del_rcu(&adj->list);
5599         pr_debug("dev_put for %s, because link removed from %s to %s\n",
5600                  adj_dev->name, dev->name, adj_dev->name);
5601         dev_put(adj_dev);
5602         kfree_rcu(adj, rcu);
5603 }
5604
5605 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5606                                             struct net_device *upper_dev,
5607                                             struct list_head *up_list,
5608                                             struct list_head *down_list,
5609                                             void *private, bool master)
5610 {
5611         int ret;
5612
5613         ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5614                                            master);
5615         if (ret)
5616                 return ret;
5617
5618         ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5619                                            false);
5620         if (ret) {
5621                 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5622                 return ret;
5623         }
5624
5625         return 0;
5626 }
5627
5628 static int __netdev_adjacent_dev_link(struct net_device *dev,
5629                                       struct net_device *upper_dev)
5630 {
5631         return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5632                                                 &dev->all_adj_list.upper,
5633                                                 &upper_dev->all_adj_list.lower,
5634                                                 NULL, false);
5635 }
5636
5637 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5638                                                struct net_device *upper_dev,
5639                                                struct list_head *up_list,
5640                                                struct list_head *down_list)
5641 {
5642         __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5643         __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
5644 }
5645
5646 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5647                                          struct net_device *upper_dev)
5648 {
5649         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5650                                            &dev->all_adj_list.upper,
5651                                            &upper_dev->all_adj_list.lower);
5652 }
5653
5654 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5655                                                 struct net_device *upper_dev,
5656                                                 void *private, bool master)
5657 {
5658         int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5659
5660         if (ret)
5661                 return ret;
5662
5663         ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5664                                                &dev->adj_list.upper,
5665                                                &upper_dev->adj_list.lower,
5666                                                private, master);
5667         if (ret) {
5668                 __netdev_adjacent_dev_unlink(dev, upper_dev);
5669                 return ret;
5670         }
5671
5672         return 0;
5673 }
5674
5675 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5676                                                    struct net_device *upper_dev)
5677 {
5678         __netdev_adjacent_dev_unlink(dev, upper_dev);
5679         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5680                                            &dev->adj_list.upper,
5681                                            &upper_dev->adj_list.lower);
5682 }
5683
5684 static int __netdev_upper_dev_link(struct net_device *dev,
5685                                    struct net_device *upper_dev, bool master,
5686                                    void *upper_priv, void *upper_info)
5687 {
5688         struct netdev_notifier_changeupper_info changeupper_info;
5689         struct netdev_adjacent *i, *j, *to_i, *to_j;
5690         int ret = 0;
5691
5692         ASSERT_RTNL();
5693
5694         if (dev == upper_dev)
5695                 return -EBUSY;
5696
5697         /* To prevent loops, check that dev is not an upper device of upper_dev. */
5698         if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper))
5699                 return -EBUSY;
5700
5701         if (__netdev_find_adj(upper_dev, &dev->adj_list.upper))
5702                 return -EEXIST;
5703
5704         if (master && netdev_master_upper_dev_get(dev))
5705                 return -EBUSY;
5706
5707         changeupper_info.upper_dev = upper_dev;
5708         changeupper_info.master = master;
5709         changeupper_info.linking = true;
5710         changeupper_info.upper_info = upper_info;
5711
5712         ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5713                                             &changeupper_info.info);
5714         ret = notifier_to_errno(ret);
5715         if (ret)
5716                 return ret;
5717
5718         ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
5719                                                    master);
5720         if (ret)
5721                 return ret;
5722
5723         /* Now that we have linked these devices, make all of upper_dev's
5724          * all_adj_list.upper visible to every dev's all_adj_list.lower and
5725          * vice versa, and don't forget the devices themselves. All of these
5726          * links are non-neighbours.
5727          */
5728         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5729                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5730                         pr_debug("Interlinking %s with %s, non-neighbour\n",
5731                                  i->dev->name, j->dev->name);
5732                         ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5733                         if (ret)
5734                                 goto rollback_mesh;
5735                 }
5736         }
5737
5738         /* add dev to every upper_dev's upper device */
5739         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5740                 pr_debug("linking %s's upper device %s with %s\n",
5741                          upper_dev->name, i->dev->name, dev->name);
5742                 ret = __netdev_adjacent_dev_link(dev, i->dev);
5743                 if (ret)
5744                         goto rollback_upper_mesh;
5745         }
5746
5747         /* add upper_dev to every dev's lower device */
5748         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5749                 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5750                          i->dev->name, upper_dev->name);
5751                 ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5752                 if (ret)
5753                         goto rollback_lower_mesh;
5754         }
5755
5756         ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5757                                             &changeupper_info.info);
5758         ret = notifier_to_errno(ret);
5759         if (ret)
5760                 goto rollback_lower_mesh;
5761
5762         return 0;
5763
5764 rollback_lower_mesh:
5765         to_i = i;
5766         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5767                 if (i == to_i)
5768                         break;
5769                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5770         }
5771
5772         i = NULL;
5773
5774 rollback_upper_mesh:
5775         to_i = i;
5776         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5777                 if (i == to_i)
5778                         break;
5779                 __netdev_adjacent_dev_unlink(dev, i->dev);
5780         }
5781
5782         i = j = NULL;
5783
5784 rollback_mesh:
5785         to_i = i;
5786         to_j = j;
5787         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5788                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5789                         if (i == to_i && j == to_j)
5790                                 break;
5791                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5792                 }
5793                 if (i == to_i)
5794                         break;
5795         }
5796
5797         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5798
5799         return ret;
5800 }
5801
5802 /**
5803  * netdev_upper_dev_link - Add a link to the upper device
5804  * @dev: device
5805  * @upper_dev: new upper device
5806  *
5807  * Adds a link to device which is upper to this one. The caller must hold
5808  * the RTNL lock. On a failure a negative errno code is returned.
5809  * On success the reference counts are adjusted and the function
5810  * returns zero.
5811  */
5812 int netdev_upper_dev_link(struct net_device *dev,
5813                           struct net_device *upper_dev)
5814 {
5815         return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL);
5816 }
5817 EXPORT_SYMBOL(netdev_upper_dev_link);
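/*
 * Editorial note (not in the original source): VLAN-style stacking drivers
 * link their virtual device on top of the real device like this (names are
 * illustrative, error handling elided):
 *
 *	ASSERT_RTNL();
 *	err = netdev_upper_dev_link(real_dev, stacked_dev);
 *	if (err)
 *		goto unregister;	// hypothetical error path
 */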
5818
5819 /**
5820  * netdev_master_upper_dev_link - Add a master link to the upper device
5821  * @dev: device
5822  * @upper_dev: new upper device
5823  * @upper_priv: upper device private
5824  * @upper_info: upper info to be passed down via notifier
5825  *
5826  * Adds a link to device which is upper to this one. In this case, only
5827  * one master upper device can be linked, although other non-master devices
5828  * might be linked as well. The caller must hold the RTNL lock.
5829  * On a failure a negative errno code is returned. On success the reference
5830  * counts are adjusted and the function returns zero.
5831  */
5832 int netdev_master_upper_dev_link(struct net_device *dev,
5833                                  struct net_device *upper_dev,
5834                                  void *upper_priv, void *upper_info)
5835 {
5836         return __netdev_upper_dev_link(dev, upper_dev, true,
5837                                        upper_priv, upper_info);
5838 }
5839 EXPORT_SYMBOL(netdev_master_upper_dev_link);
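/*
 * Editorial note (not in the original source): bonding/team-style masters
 * use the master variant when enslaving a port, optionally passing private
 * data and info for the NETDEV_CHANGEUPPER notifiers (hypothetical names):
 *
 *	err = netdev_master_upper_dev_link(slave_dev, bond_dev,
 *					   slave_priv, &lag_upper_info);
 */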
5840
5841 /**
5842  * netdev_upper_dev_unlink - Removes a link to upper device
5843  * @dev: device
5844  * @upper_dev: upper device to unlink
5845  *
5846  * Removes a link to device which is upper to this one. The caller must hold
5847  * the RTNL lock.
5848  */
5849 void netdev_upper_dev_unlink(struct net_device *dev,
5850                              struct net_device *upper_dev)
5851 {
5852         struct netdev_notifier_changeupper_info changeupper_info;
5853         struct netdev_adjacent *i, *j;
5854         ASSERT_RTNL();
5855
5856         changeupper_info.upper_dev = upper_dev;
5857         changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
5858         changeupper_info.linking = false;
5859
5860         call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5861                                       &changeupper_info.info);
5862
5863         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5864
5865         /* Here is the tricky part. We must remove all dev's lower
5866          * devices from all upper_dev's upper devices and vice
5867          * versa, to maintain the graph relationship.
5868          */
5869         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5870                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5871                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5872
5873         /* Also remove the devices themselves from each other's
5874          * lower/upper device lists.
5875          */
5876         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5877                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5878
5879         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5880                 __netdev_adjacent_dev_unlink(dev, i->dev);
5881
5882         call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5883                                       &changeupper_info.info);
5884 }
5885 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5886
5887 /**
5888  * netdev_bonding_info_change - Dispatch event about slave change
5889  * @dev: device
5890  * @bonding_info: info to dispatch
5891  *
5892  * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5893  * The caller must hold the RTNL lock.
5894  */
5895 void netdev_bonding_info_change(struct net_device *dev,
5896                                 struct netdev_bonding_info *bonding_info)
5897 {
5898         struct netdev_notifier_bonding_info     info;
5899
5900         memcpy(&info.bonding_info, bonding_info,
5901                sizeof(struct netdev_bonding_info));
5902         call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5903                                       &info.info);
5904 }
5905 EXPORT_SYMBOL(netdev_bonding_info_change);
5906
5907 static void netdev_adjacent_add_links(struct net_device *dev)
5908 {
5909         struct netdev_adjacent *iter;
5910
5911         struct net *net = dev_net(dev);
5912
5913         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5914                 if (!net_eq(net, dev_net(iter->dev)))
5915                         continue;
5916                 netdev_adjacent_sysfs_add(iter->dev, dev,
5917                                           &iter->dev->adj_list.lower);
5918                 netdev_adjacent_sysfs_add(dev, iter->dev,
5919                                           &dev->adj_list.upper);
5920         }
5921
5922         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5923                 if (!net_eq(net, dev_net(iter->dev)))
5924                         continue;
5925                 netdev_adjacent_sysfs_add(iter->dev, dev,
5926                                           &iter->dev->adj_list.upper);
5927                 netdev_adjacent_sysfs_add(dev, iter->dev,
5928                                           &dev->adj_list.lower);
5929         }
5930 }
5931
5932 static void netdev_adjacent_del_links(struct net_device *dev)
5933 {
5934         struct netdev_adjacent *iter;
5935
5936         struct net *net = dev_net(dev);
5937
5938         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5939                 if (!net_eq(net, dev_net(iter->dev)))
5940                         continue;
5941                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5942                                           &iter->dev->adj_list.lower);
5943                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5944                                           &dev->adj_list.upper);
5945         }
5946
5947         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5948                 if (!net_eq(net, dev_net(iter->dev)))
5949                         continue;
5950                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5951                                           &iter->dev->adj_list.upper);
5952                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5953                                           &dev->adj_list.lower);
5954         }
5955 }
5956
5957 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5958 {
5959         struct netdev_adjacent *iter;
5960
5961         struct net *net = dev_net(dev);
5962
5963         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5964                 if (!net_eq(net, dev_net(iter->dev)))
5965                         continue;
5966                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5967                                           &iter->dev->adj_list.lower);
5968                 netdev_adjacent_sysfs_add(iter->dev, dev,
5969                                           &iter->dev->adj_list.lower);
5970         }
5971
5972         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5973                 if (!net_eq(net, dev_net(iter->dev)))
5974                         continue;
5975                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5976                                           &iter->dev->adj_list.upper);
5977                 netdev_adjacent_sysfs_add(iter->dev, dev,
5978                                           &iter->dev->adj_list.upper);
5979         }
5980 }
5981
5982 void *netdev_lower_dev_get_private(struct net_device *dev,
5983                                    struct net_device *lower_dev)
5984 {
5985         struct netdev_adjacent *lower;
5986
5987         if (!lower_dev)
5988                 return NULL;
5989         lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
5990         if (!lower)
5991                 return NULL;
5992
5993         return lower->private;
5994 }
5995 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5996
5997
5998 int dev_get_nest_level(struct net_device *dev,
5999                        bool (*type_check)(const struct net_device *dev))
6000 {
6001         struct net_device *lower = NULL;
6002         struct list_head *iter;
6003         int max_nest = -1;
6004         int nest;
6005
6006         ASSERT_RTNL();
6007
6008         netdev_for_each_lower_dev(dev, lower, iter) {
6009                 nest = dev_get_nest_level(lower, type_check);
6010                 if (max_nest < nest)
6011                         max_nest = nest;
6012         }
6013
6014         if (type_check(dev))
6015                 max_nest++;
6016
6017         return max_nest;
6018 }
6019 EXPORT_SYMBOL(dev_get_nest_level);
6020
6021 /**
6022  * netdev_lower_state_changed - Dispatch event about lower device state change
6023  * @lower_dev: device
6024  * @lower_state_info: state to dispatch
6025  *
6026  * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
6027  * The caller must hold the RTNL lock.
6028  */
6029 void netdev_lower_state_changed(struct net_device *lower_dev,
6030                                 void *lower_state_info)
6031 {
6032         struct netdev_notifier_changelowerstate_info changelowerstate_info;
6033
6034         ASSERT_RTNL();
6035         changelowerstate_info.lower_state_info = lower_state_info;
6036         call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
6037                                       &changelowerstate_info.info);
6038 }
6039 EXPORT_SYMBOL(netdev_lower_state_changed);
6040
6041 static void dev_change_rx_flags(struct net_device *dev, int flags)
6042 {
6043         const struct net_device_ops *ops = dev->netdev_ops;
6044
6045         if (ops->ndo_change_rx_flags)
6046                 ops->ndo_change_rx_flags(dev, flags);
6047 }
6048
6049 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
6050 {
6051         unsigned int old_flags = dev->flags;
6052         kuid_t uid;
6053         kgid_t gid;
6054
6055         ASSERT_RTNL();
6056
6057         dev->flags |= IFF_PROMISC;
6058         dev->promiscuity += inc;
6059         if (dev->promiscuity == 0) {
6060                 /*
6061                  * Avoid overflow.
6062                  * If inc causes overflow, leave promisc untouched and return an error.
6063                  */
6064                 if (inc < 0)
6065                         dev->flags &= ~IFF_PROMISC;
6066                 else {
6067                         dev->promiscuity -= inc;
6068                         pr_warn("%s: promiscuity counter overflowed, setting promiscuity failed. The promiscuity feature of the device might be broken.\n",
6069                                 dev->name);
6070                         return -EOVERFLOW;
6071                 }
6072         }
6073         if (dev->flags != old_flags) {
6074                 pr_info("device %s %s promiscuous mode\n",
6075                         dev->name,
6076                         dev->flags & IFF_PROMISC ? "entered" : "left");
6077                 if (audit_enabled) {
6078                         current_uid_gid(&uid, &gid);
6079                         audit_log(current->audit_context, GFP_ATOMIC,
6080                                 AUDIT_ANOM_PROMISCUOUS,
6081                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
6082                                 dev->name, (dev->flags & IFF_PROMISC),
6083                                 (old_flags & IFF_PROMISC),
6084                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
6085                                 from_kuid(&init_user_ns, uid),
6086                                 from_kgid(&init_user_ns, gid),
6087                                 audit_get_sessionid(current));
6088                 }
6089
6090                 dev_change_rx_flags(dev, IFF_PROMISC);
6091         }
6092         if (notify)
6093                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
6094         return 0;
6095 }
6096
6097 /**
6098  *      dev_set_promiscuity     - update promiscuity count on a device
6099  *      @dev: device
6100  *      @inc: modifier
6101  *
6102  *      Add or remove promiscuity from a device. While the count in the device
6103  *      remains above zero the interface remains promiscuous. Once it hits zero
6104  *      the device reverts back to normal filtering operation. A negative inc
6105  *      value is used to drop promiscuity on the device.
6106  *      Return 0 if successful or a negative errno code on error.
6107  */
6108 int dev_set_promiscuity(struct net_device *dev, int inc)
6109 {
6110         unsigned int old_flags = dev->flags;
6111         int err;
6112
6113         err = __dev_set_promiscuity(dev, inc, true);
6114         if (err < 0)
6115                 return err;
6116         if (dev->flags != old_flags)
6117                 dev_set_rx_mode(dev);
6118         return err;
6119 }
6120 EXPORT_SYMBOL(dev_set_promiscuity);
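
/* Example (illustrative sketch, not part of this file): a hypothetical
 * capture module entering promiscuous mode for the duration of a sniffing
 * session via the counter API above.  dev_set_promiscuity() must be
 * called with the RTNL held; the my_capture_*() names are made up.
 *
 *      static int my_capture_start(struct net_device *dev)
 *      {
 *              int err;
 *
 *              rtnl_lock();
 *              err = dev_set_promiscuity(dev, 1);
 *              rtnl_unlock();
 *              return err;
 *      }
 *
 *      static void my_capture_stop(struct net_device *dev)
 *      {
 *              rtnl_lock();
 *              dev_set_promiscuity(dev, -1);
 *              rtnl_unlock();
 *      }
 */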
6121
6122 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
6123 {
6124         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
6125
6126         ASSERT_RTNL();
6127
6128         dev->flags |= IFF_ALLMULTI;
6129         dev->allmulti += inc;
6130         if (dev->allmulti == 0) {
6131                 /*
6132                  * Avoid overflow.
6133                  * If inc causes overflow, untouch allmulti and return error.
6134                  */
6135                 if (inc < 0)
6136                         dev->flags &= ~IFF_ALLMULTI;
6137                 else {
6138                         dev->allmulti -= inc;
6139                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
6140                                 dev->name);
6141                         return -EOVERFLOW;
6142                 }
6143         }
6144         if (dev->flags ^ old_flags) {
6145                 dev_change_rx_flags(dev, IFF_ALLMULTI);
6146                 dev_set_rx_mode(dev);
6147                 if (notify)
6148                         __dev_notify_flags(dev, old_flags,
6149                                            dev->gflags ^ old_gflags);
6150         }
6151         return 0;
6152 }
6153
6154 /**
6155  *      dev_set_allmulti        - update allmulti count on a device
6156  *      @dev: device
6157  *      @inc: modifier
6158  *
6159  *      Add or remove reception of all multicast frames to a device. While the
6160  *      count in the device remains above zero the interface remains listening
6161  *      to all multicast frames. Once it hits zero the device reverts back to normal
6162  *      filtering operation. A negative @inc value is used to drop the counter
6163  *      when releasing a resource needing all multicasts.
6164  *      Return 0 if successful or a negative errno code on error.
6165  */
6166
6167 int dev_set_allmulti(struct net_device *dev, int inc)
6168 {
6169         return __dev_set_allmulti(dev, inc, true);
6170 }
6171 EXPORT_SYMBOL(dev_set_allmulti);
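
/* Example (illustrative sketch, not part of this file): a hypothetical
 * protocol that must see every multicast frame takes an allmulti
 * reference while its resource exists and drops it again on teardown.
 * Both calls must run under the RTNL.
 *
 *      rtnl_lock();
 *      err = dev_set_allmulti(dev, 1);
 *      rtnl_unlock();
 *
 *      ... use the device ...
 *
 *      rtnl_lock();
 *      dev_set_allmulti(dev, -1);
 *      rtnl_unlock();
 */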
6172
6173 /*
6174  *      Upload unicast and multicast address lists to device and
6175  *      configure RX filtering. When the device doesn't support unicast
6176  *      filtering it is put in promiscuous mode while unicast addresses
6177  *      are present.
6178  */
6179 void __dev_set_rx_mode(struct net_device *dev)
6180 {
6181         const struct net_device_ops *ops = dev->netdev_ops;
6182
6183         /* dev_open will call this function so the list will stay sane. */
6184         if (!(dev->flags&IFF_UP))
6185                 return;
6186
6187         if (!netif_device_present(dev))
6188                 return;
6189
6190         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
6191                 /* Unicast addresses changes may only happen under the rtnl,
6192                  * therefore calling __dev_set_promiscuity here is safe.
6193                  */
6194                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
6195                         __dev_set_promiscuity(dev, 1, false);
6196                         dev->uc_promisc = true;
6197                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
6198                         __dev_set_promiscuity(dev, -1, false);
6199                         dev->uc_promisc = false;
6200                 }
6201         }
6202
6203         if (ops->ndo_set_rx_mode)
6204                 ops->ndo_set_rx_mode(dev);
6205 }
6206
6207 void dev_set_rx_mode(struct net_device *dev)
6208 {
6209         netif_addr_lock_bh(dev);
6210         __dev_set_rx_mode(dev);
6211         netif_addr_unlock_bh(dev);
6212 }
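
/* Example (illustrative sketch, not part of this file): the shape of a
 * hypothetical driver's ndo_set_rx_mode() callback, which the functions
 * above end up invoking.  The my_hw_*() helpers stand in for
 * device-specific register writes and are not real APIs.
 *
 *      static void my_set_rx_mode(struct net_device *dev)
 *      {
 *              struct netdev_hw_addr *ha;
 *
 *              my_hw_clear_filters(dev);
 *              if (dev->flags & (IFF_PROMISC | IFF_ALLMULTI)) {
 *                      my_hw_accept_all(dev, dev->flags & IFF_PROMISC);
 *                      return;
 *              }
 *              netdev_for_each_mc_addr(ha, dev)
 *                      my_hw_add_mc_filter(dev, ha->addr);
 *      }
 */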
6213
6214 /**
6215  *      dev_get_flags - get flags reported to userspace
6216  *      @dev: device
6217  *
6218  *      Get the combination of flag bits exported through APIs to userspace.
6219  */
6220 unsigned int dev_get_flags(const struct net_device *dev)
6221 {
6222         unsigned int flags;
6223
6224         flags = (dev->flags & ~(IFF_PROMISC |
6225                                 IFF_ALLMULTI |
6226                                 IFF_RUNNING |
6227                                 IFF_LOWER_UP |
6228                                 IFF_DORMANT)) |
6229                 (dev->gflags & (IFF_PROMISC |
6230                                 IFF_ALLMULTI));
6231
6232         if (netif_running(dev)) {
6233                 if (netif_oper_up(dev))
6234                         flags |= IFF_RUNNING;
6235                 if (netif_carrier_ok(dev))
6236                         flags |= IFF_LOWER_UP;
6237                 if (netif_dormant(dev))
6238                         flags |= IFF_DORMANT;
6239         }
6240
6241         return flags;
6242 }
6243 EXPORT_SYMBOL(dev_get_flags);
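
/* Example (illustrative sketch, not part of this file): checking the
 * user-visible state of a device with the flags exported above.
 *
 *      unsigned int flags = dev_get_flags(dev);
 *
 *      if ((flags & (IFF_UP | IFF_RUNNING)) == (IFF_UP | IFF_RUNNING))
 *              pr_debug("%s is up and operational\n", dev->name);
 */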
6244
6245 int __dev_change_flags(struct net_device *dev, unsigned int flags)
6246 {
6247         unsigned int old_flags = dev->flags;
6248         int ret;
6249
6250         ASSERT_RTNL();
6251
6252         /*
6253          *      Set the flags on our device.
6254          */
6255
6256         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6257                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6258                                IFF_AUTOMEDIA)) |
6259                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6260                                     IFF_ALLMULTI));
6261
6262         /*
6263          *      Load in the correct multicast list now the flags have changed.
6264          */
6265
6266         if ((old_flags ^ flags) & IFF_MULTICAST)
6267                 dev_change_rx_flags(dev, IFF_MULTICAST);
6268
6269         dev_set_rx_mode(dev);
6270
6271         /*
6272          *      Have we downed the interface? We handle IFF_UP ourselves
6273          *      according to user attempts to set it, rather than blindly
6274          *      setting it.
6275          */
6276
6277         ret = 0;
6278         if ((old_flags ^ flags) & IFF_UP)
6279                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
6280
6281         if ((flags ^ dev->gflags) & IFF_PROMISC) {
6282                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
6283                 unsigned int old_flags = dev->flags;
6284
6285                 dev->gflags ^= IFF_PROMISC;
6286
6287                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
6288                         if (dev->flags != old_flags)
6289                                 dev_set_rx_mode(dev);
6290         }
6291
6292         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6293            is important. Some (broken) drivers set IFF_PROMISC when
6294            IFF_ALLMULTI is requested, without asking us and without reporting it.
6295          */
6296         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6297                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6298
6299                 dev->gflags ^= IFF_ALLMULTI;
6300                 __dev_set_allmulti(dev, inc, false);
6301         }
6302
6303         return ret;
6304 }
6305
6306 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6307                         unsigned int gchanges)
6308 {
6309         unsigned int changes = dev->flags ^ old_flags;
6310
6311         if (gchanges)
6312                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
6313
6314         if (changes & IFF_UP) {
6315                 if (dev->flags & IFF_UP)
6316                         call_netdevice_notifiers(NETDEV_UP, dev);
6317                 else
6318                         call_netdevice_notifiers(NETDEV_DOWN, dev);
6319         }
6320
6321         if (dev->flags & IFF_UP &&
6322             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
6323                 struct netdev_notifier_change_info change_info;
6324
6325                 change_info.flags_changed = changes;
6326                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
6327                                               &change_info.info);
6328         }
6329 }
6330
6331 /**
6332  *      dev_change_flags - change device settings
6333  *      @dev: device
6334  *      @flags: device state flags
6335  *
6336  *      Change settings on a device based on the given state flags. The flags are
6337  *      in the userspace exported format.
6338  */
6339 int dev_change_flags(struct net_device *dev, unsigned int flags)
6340 {
6341         int ret;
6342         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
6343
6344         ret = __dev_change_flags(dev, flags);
6345         if (ret < 0)
6346                 return ret;
6347
6348         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
6349         __dev_notify_flags(dev, old_flags, changes);
6350         return ret;
6351 }
6352 EXPORT_SYMBOL(dev_change_flags);
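
/* Example (illustrative sketch, not part of this file): administratively
 * bringing an interface up from kernel code, much as the SIOCSIFFLAGS
 * path does.  Must be called with the RTNL held.
 *
 *      rtnl_lock();
 *      err = dev_change_flags(dev, dev->flags | IFF_UP);
 *      rtnl_unlock();
 */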
6353
6354 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
6355 {
6356         const struct net_device_ops *ops = dev->netdev_ops;
6357
6358         if (ops->ndo_change_mtu)
6359                 return ops->ndo_change_mtu(dev, new_mtu);
6360
6361         dev->mtu = new_mtu;
6362         return 0;
6363 }
6364
6365 /**
6366  *      dev_set_mtu - Change maximum transfer unit
6367  *      @dev: device
6368  *      @new_mtu: new transfer unit
6369  *
6370  *      Change the maximum transfer size of the network device.
6371  */
6372 int dev_set_mtu(struct net_device *dev, int new_mtu)
6373 {
6374         int err, orig_mtu;
6375
6376         if (new_mtu == dev->mtu)
6377                 return 0;
6378
6379         /*      MTU must not be negative.        */
6380         if (new_mtu < 0)
6381                 return -EINVAL;
6382
6383         if (!netif_device_present(dev))
6384                 return -ENODEV;
6385
6386         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6387         err = notifier_to_errno(err);
6388         if (err)
6389                 return err;
6390
6391         orig_mtu = dev->mtu;
6392         err = __dev_set_mtu(dev, new_mtu);
6393
6394         if (!err) {
6395                 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6396                 err = notifier_to_errno(err);
6397                 if (err) {
6398                         /* setting mtu back and notifying everyone again,
6399                          * so that they have a chance to revert changes.
6400                          */
6401                         __dev_set_mtu(dev, orig_mtu);
6402                         call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6403                 }
6404         }
6405         return err;
6406 }
6407 EXPORT_SYMBOL(dev_set_mtu);
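
/* Example (illustrative sketch, not part of this file): changing the MTU
 * from kernel code.  In-tree callers hold the RTNL; a negative errno is
 * returned if a notifier or the driver rejects the new value.
 *
 *      rtnl_lock();
 *      err = dev_set_mtu(dev, 1400);
 *      if (err)
 *              netdev_warn(dev, "could not set MTU: %d\n", err);
 *      rtnl_unlock();
 */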
6408
6409 /**
6410  *      dev_set_group - Change group this device belongs to
6411  *      @dev: device
6412  *      @new_group: group this device should belong to
6413  */
6414 void dev_set_group(struct net_device *dev, int new_group)
6415 {
6416         dev->group = new_group;
6417 }
6418 EXPORT_SYMBOL(dev_set_group);
6419
6420 /**
6421  *      dev_set_mac_address - Change Media Access Control Address
6422  *      @dev: device
6423  *      @sa: new address
6424  *
6425  *      Change the hardware (MAC) address of the device
6426  */
6427 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6428 {
6429         const struct net_device_ops *ops = dev->netdev_ops;
6430         int err;
6431
6432         if (!ops->ndo_set_mac_address)
6433                 return -EOPNOTSUPP;
6434         if (sa->sa_family != dev->type)
6435                 return -EINVAL;
6436         if (!netif_device_present(dev))
6437                 return -ENODEV;
6438         err = ops->ndo_set_mac_address(dev, sa);
6439         if (err)
6440                 return err;
6441         dev->addr_assign_type = NET_ADDR_SET;
6442         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6443         add_device_randomness(dev->dev_addr, dev->addr_len);
6444         return 0;
6445 }
6446 EXPORT_SYMBOL(dev_set_mac_address);
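
/* Example (illustrative sketch, not part of this file): setting a new
 * hardware address from kernel code under the RTNL.  The sockaddr family
 * must match dev->type; the address bytes here are arbitrary.
 *
 *      struct sockaddr sa;
 *      static const u8 new_addr[ETH_ALEN] = {
 *              0x02, 0x00, 0x00, 0x00, 0x00, 0x01
 *      };
 *
 *      sa.sa_family = dev->type;
 *      memcpy(sa.sa_data, new_addr, ETH_ALEN);
 *      rtnl_lock();
 *      err = dev_set_mac_address(dev, &sa);
 *      rtnl_unlock();
 */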
6447
6448 /**
6449  *      dev_change_carrier - Change device carrier
6450  *      @dev: device
6451  *      @new_carrier: new value
6452  *
6453  *      Change device carrier
6454  */
6455 int dev_change_carrier(struct net_device *dev, bool new_carrier)
6456 {
6457         const struct net_device_ops *ops = dev->netdev_ops;
6458
6459         if (!ops->ndo_change_carrier)
6460                 return -EOPNOTSUPP;
6461         if (!netif_device_present(dev))
6462                 return -ENODEV;
6463         return ops->ndo_change_carrier(dev, new_carrier);
6464 }
6465 EXPORT_SYMBOL(dev_change_carrier);
6466
6467 /**
6468  *      dev_get_phys_port_id - Get device physical port ID
6469  *      @dev: device
6470  *      @ppid: port ID
6471  *
6472  *      Get device physical port ID
6473  */
6474 int dev_get_phys_port_id(struct net_device *dev,
6475                          struct netdev_phys_item_id *ppid)
6476 {
6477         const struct net_device_ops *ops = dev->netdev_ops;
6478
6479         if (!ops->ndo_get_phys_port_id)
6480                 return -EOPNOTSUPP;
6481         return ops->ndo_get_phys_port_id(dev, ppid);
6482 }
6483 EXPORT_SYMBOL(dev_get_phys_port_id);
6484
6485 /**
6486  *      dev_get_phys_port_name - Get device physical port name
6487  *      @dev: device
6488  *      @name: port name
6489  *      @len: limit of bytes to copy to name
6490  *
6491  *      Get device physical port name
6492  */
6493 int dev_get_phys_port_name(struct net_device *dev,
6494                            char *name, size_t len)
6495 {
6496         const struct net_device_ops *ops = dev->netdev_ops;
6497
6498         if (!ops->ndo_get_phys_port_name)
6499                 return -EOPNOTSUPP;
6500         return ops->ndo_get_phys_port_name(dev, name, len);
6501 }
6502 EXPORT_SYMBOL(dev_get_phys_port_name);
6503
6504 /**
6505  *      dev_change_proto_down - update protocol port state information
6506  *      @dev: device
6507  *      @proto_down: new value
6508  *
6509  *      This info can be used by switch drivers to set the phys state of the
6510  *      port.
6511  */
6512 int dev_change_proto_down(struct net_device *dev, bool proto_down)
6513 {
6514         const struct net_device_ops *ops = dev->netdev_ops;
6515
6516         if (!ops->ndo_change_proto_down)
6517                 return -EOPNOTSUPP;
6518         if (!netif_device_present(dev))
6519                 return -ENODEV;
6520         return ops->ndo_change_proto_down(dev, proto_down);
6521 }
6522 EXPORT_SYMBOL(dev_change_proto_down);
6523
6524 /**
6525  *      dev_new_index   -       allocate an ifindex
6526  *      @net: the applicable net namespace
6527  *
6528  *      Returns a suitable unique value for a new device interface
6529  *      number.  The caller must hold the rtnl semaphore or the
6530  *      dev_base_lock to be sure it remains unique.
6531  */
6532 static int dev_new_index(struct net *net)
6533 {
6534         int ifindex = net->ifindex;
6535         for (;;) {
6536                 if (++ifindex <= 0)
6537                         ifindex = 1;
6538                 if (!__dev_get_by_index(net, ifindex))
6539                         return net->ifindex = ifindex;
6540         }
6541 }
6542
6543 /* Delayed registration/unregistration */
6544 static LIST_HEAD(net_todo_list);
6545 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
6546
6547 static void net_set_todo(struct net_device *dev)
6548 {
6549         list_add_tail(&dev->todo_list, &net_todo_list);
6550         dev_net(dev)->dev_unreg_count++;
6551 }
6552
6553 static void rollback_registered_many(struct list_head *head)
6554 {
6555         struct net_device *dev, *tmp;
6556         LIST_HEAD(close_head);
6557
6558         BUG_ON(dev_boot_phase);
6559         ASSERT_RTNL();
6560
6561         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6562                 /* Some devices call this without ever having been
6563                  * registered, as part of initialization unwind. Remove
6564                  * those devices and proceed with the remaining ones.
6565                  */
6566                 if (dev->reg_state == NETREG_UNINITIALIZED) {
6567                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6568                                  dev->name, dev);
6569
6570                         WARN_ON(1);
6571                         list_del(&dev->unreg_list);
6572                         continue;
6573                 }
6574                 dev->dismantle = true;
6575                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
6576         }
6577
6578         /* If device is running, close it first. */
6579         list_for_each_entry(dev, head, unreg_list)
6580                 list_add_tail(&dev->close_list, &close_head);
6581         dev_close_many(&close_head, true);
6582
6583         list_for_each_entry(dev, head, unreg_list) {
6584                 /* And unlink it from device chain. */
6585                 unlist_netdevice(dev);
6586
6587                 dev->reg_state = NETREG_UNREGISTERING;
6588                 on_each_cpu(flush_backlog, dev, 1);
6589         }
6590
6591         synchronize_net();
6592
6593         list_for_each_entry(dev, head, unreg_list) {
6594                 struct sk_buff *skb = NULL;
6595
6596                 /* Shutdown queueing discipline. */
6597                 dev_shutdown(dev);
6598
6599
6600                 /* Notify protocols that we are about to destroy
6601                    this device. They should clean up all of their state.
6602                 */
6603                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6604
6605                 if (!dev->rtnl_link_ops ||
6606                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6607                         skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6608                                                      GFP_KERNEL);
6609
6610                 /*
6611                  *      Flush the unicast and multicast chains
6612                  */
6613                 dev_uc_flush(dev);
6614                 dev_mc_flush(dev);
6615
6616                 if (dev->netdev_ops->ndo_uninit)
6617                         dev->netdev_ops->ndo_uninit(dev);
6618
6619                 if (skb)
6620                         rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6621
6622                 /* Notifier chain MUST have detached all upper devices from us. */
6623                 WARN_ON(netdev_has_any_upper_dev(dev));
6624
6625                 /* Remove entries from kobject tree */
6626                 netdev_unregister_kobject(dev);
6627 #ifdef CONFIG_XPS
6628                 /* Remove XPS queueing entries */
6629                 netif_reset_xps_queues_gt(dev, 0);
6630 #endif
6631         }
6632
6633         synchronize_net();
6634
6635         list_for_each_entry(dev, head, unreg_list)
6636                 dev_put(dev);
6637 }
6638
6639 static void rollback_registered(struct net_device *dev)
6640 {
6641         LIST_HEAD(single);
6642
6643         list_add(&dev->unreg_list, &single);
6644         rollback_registered_many(&single);
6645         list_del(&single);
6646 }
6647
6648 static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6649         struct net_device *upper, netdev_features_t features)
6650 {
6651         netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6652         netdev_features_t feature;
6653         int feature_bit;
6654
6655         for_each_netdev_feature(&upper_disables, feature_bit) {
6656                 feature = __NETIF_F_BIT(feature_bit);
6657                 if (!(upper->wanted_features & feature)
6658                     && (features & feature)) {
6659                         netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6660                                    &feature, upper->name);
6661                         features &= ~feature;
6662                 }
6663         }
6664
6665         return features;
6666 }
6667
6668 static void netdev_sync_lower_features(struct net_device *upper,
6669         struct net_device *lower, netdev_features_t features)
6670 {
6671         netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6672         netdev_features_t feature;
6673         int feature_bit;
6674
6675         for_each_netdev_feature(&upper_disables, feature_bit) {
6676                 feature = __NETIF_F_BIT(feature_bit);
6677                 if (!(features & feature) && (lower->features & feature)) {
6678                         netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6679                                    &feature, lower->name);
6680                         lower->wanted_features &= ~feature;
6681                         netdev_update_features(lower);
6682
6683                         if (unlikely(lower->features & feature))
6684                                 netdev_WARN(upper, "failed to disable %pNF on %s!\n",
6685                                             &feature, lower->name);
6686                 }
6687         }
6688 }
6689
6690 static netdev_features_t netdev_fix_features(struct net_device *dev,
6691         netdev_features_t features)
6692 {
6693         /* Fix illegal checksum combinations */
6694         if ((features & NETIF_F_HW_CSUM) &&
6695             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6696                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6697                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6698         }
6699
6700         /* TSO requires that SG is present as well. */
6701         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6702                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6703                 features &= ~NETIF_F_ALL_TSO;
6704         }
6705
6706         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6707                                         !(features & NETIF_F_IP_CSUM)) {
6708                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6709                 features &= ~NETIF_F_TSO;
6710                 features &= ~NETIF_F_TSO_ECN;
6711         }
6712
6713         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6714                                          !(features & NETIF_F_IPV6_CSUM)) {
6715                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6716                 features &= ~NETIF_F_TSO6;
6717         }
6718
6719         /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
6720         if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
6721                 features &= ~NETIF_F_TSO_MANGLEID;
6722
6723         /* TSO ECN requires that TSO is present as well. */
6724         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6725                 features &= ~NETIF_F_TSO_ECN;
6726
6727         /* Software GSO depends on SG. */
6728         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6729                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6730                 features &= ~NETIF_F_GSO;
6731         }
6732
6733         /* UFO needs SG and checksumming */
6734         if (features & NETIF_F_UFO) {
6735                 /* maybe split UFO into V4 and V6? */
6736                 if (!(features & NETIF_F_HW_CSUM) &&
6737                     ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
6738                      (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
6739                         netdev_dbg(dev,
6740                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
6741                         features &= ~NETIF_F_UFO;
6742                 }
6743
6744                 if (!(features & NETIF_F_SG)) {
6745                         netdev_dbg(dev,
6746                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6747                         features &= ~NETIF_F_UFO;
6748                 }
6749         }
6750
6751         /* GSO partial features require GSO partial be set */
6752         if ((features & dev->gso_partial_features) &&
6753             !(features & NETIF_F_GSO_PARTIAL)) {
6754                 netdev_dbg(dev,
6755                            "Dropping partially supported GSO features since no GSO partial.\n");
6756                 features &= ~dev->gso_partial_features;
6757         }
6758
6759 #ifdef CONFIG_NET_RX_BUSY_POLL
6760         if (dev->netdev_ops->ndo_busy_poll)
6761                 features |= NETIF_F_BUSY_POLL;
6762         else
6763 #endif
6764                 features &= ~NETIF_F_BUSY_POLL;
6765
6766         return features;
6767 }
6768
6769 int __netdev_update_features(struct net_device *dev)
6770 {
6771         struct net_device *upper, *lower;
6772         netdev_features_t features;
6773         struct list_head *iter;
6774         int err = -1;
6775
6776         ASSERT_RTNL();
6777
6778         features = netdev_get_wanted_features(dev);
6779
6780         if (dev->netdev_ops->ndo_fix_features)
6781                 features = dev->netdev_ops->ndo_fix_features(dev, features);
6782
6783         /* driver might be less strict about feature dependencies */
6784         features = netdev_fix_features(dev, features);
6785
6786         /* some features can't be enabled if they're off on an upper device */
6787         netdev_for_each_upper_dev_rcu(dev, upper, iter)
6788                 features = netdev_sync_upper_features(dev, upper, features);
6789
6790         if (dev->features == features)
6791                 goto sync_lower;
6792
6793         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6794                 &dev->features, &features);
6795
6796         if (dev->netdev_ops->ndo_set_features)
6797                 err = dev->netdev_ops->ndo_set_features(dev, features);
6798         else
6799                 err = 0;
6800
6801         if (unlikely(err < 0)) {
6802                 netdev_err(dev,
6803                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
6804                         err, &features, &dev->features);
6805                 /* return non-0 since some features might have changed and
6806                  * it's better to fire a spurious notification than miss it
6807                  */
6808                 return -1;
6809         }
6810
6811 sync_lower:
6812         /* some features must be disabled on lower devices when disabled
6813          * on an upper device (think: bonding master or bridge)
6814          */
6815         netdev_for_each_lower_dev(dev, lower, iter)
6816                 netdev_sync_lower_features(dev, lower, features);
6817
6818         if (!err)
6819                 dev->features = features;
6820
6821         return err < 0 ? 0 : 1;
6822 }
6823
6824 /**
6825  *      netdev_update_features - recalculate device features
6826  *      @dev: the device to check
6827  *
6828  *      Recalculate dev->features set and send notifications if it
6829  *      has changed. Should be called after driver or hardware dependent
6830  *      conditions might have changed that influence the features.
6831  */
6832 void netdev_update_features(struct net_device *dev)
6833 {
6834         if (__netdev_update_features(dev))
6835                 netdev_features_change(dev);
6836 }
6837 EXPORT_SYMBOL(netdev_update_features);
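
/* Example (illustrative sketch, not part of this file): a driver whose
 * offload capabilities depend on a configuration knob re-evaluates the
 * feature set after that knob changes; its ndo_fix_features() callback
 * (not shown) masks out whatever the new configuration cannot support.
 * my_priv and offloads_allowed are made-up names.
 *
 *      rtnl_lock();
 *      my_priv->offloads_allowed = false;
 *      netdev_update_features(dev);
 *      rtnl_unlock();
 */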
6838
6839 /**
6840  *      netdev_change_features - recalculate device features
6841  *      @dev: the device to check
6842  *
6843  *      Recalculate dev->features set and send notifications even
6844  *      if they have not changed. Should be called instead of
6845  *      netdev_update_features() if also dev->vlan_features might
6846  *      have changed to allow the changes to be propagated to stacked
6847  *      VLAN devices.
6848  */
6849 void netdev_change_features(struct net_device *dev)
6850 {
6851         __netdev_update_features(dev);
6852         netdev_features_change(dev);
6853 }
6854 EXPORT_SYMBOL(netdev_change_features);
6855
6856 /**
6857  *      netif_stacked_transfer_operstate -      transfer operstate
6858  *      @rootdev: the root or lower level device to transfer state from
6859  *      @dev: the device to transfer operstate to
6860  *
6861  *      Transfer operational state from root to device. This is normally
6862  *      called when a stacking relationship exists between the root
6863  *      device and the device (a leaf device).
6864  */
6865 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6866                                         struct net_device *dev)
6867 {
6868         if (rootdev->operstate == IF_OPER_DORMANT)
6869                 netif_dormant_on(dev);
6870         else
6871                 netif_dormant_off(dev);
6872
6873         if (netif_carrier_ok(rootdev)) {
6874                 if (!netif_carrier_ok(dev))
6875                         netif_carrier_on(dev);
6876         } else {
6877                 if (netif_carrier_ok(dev))
6878                         netif_carrier_off(dev);
6879         }
6880 }
6881 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6882
6883 #ifdef CONFIG_SYSFS
6884 static int netif_alloc_rx_queues(struct net_device *dev)
6885 {
6886         unsigned int i, count = dev->num_rx_queues;
6887         struct netdev_rx_queue *rx;
6888         size_t sz = count * sizeof(*rx);
6889
6890         BUG_ON(count < 1);
6891
6892         rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6893         if (!rx) {
6894                 rx = vzalloc(sz);
6895                 if (!rx)
6896                         return -ENOMEM;
6897         }
6898         dev->_rx = rx;
6899
6900         for (i = 0; i < count; i++)
6901                 rx[i].dev = dev;
6902         return 0;
6903 }
6904 #endif
6905
6906 static void netdev_init_one_queue(struct net_device *dev,
6907                                   struct netdev_queue *queue, void *_unused)
6908 {
6909         /* Initialize queue lock */
6910         spin_lock_init(&queue->_xmit_lock);
6911         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6912         queue->xmit_lock_owner = -1;
6913         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6914         queue->dev = dev;
6915 #ifdef CONFIG_BQL
6916         dql_init(&queue->dql, HZ);
6917 #endif
6918 }
6919
6920 static void netif_free_tx_queues(struct net_device *dev)
6921 {
6922         kvfree(dev->_tx);
6923 }
6924
6925 static int netif_alloc_netdev_queues(struct net_device *dev)
6926 {
6927         unsigned int count = dev->num_tx_queues;
6928         struct netdev_queue *tx;
6929         size_t sz = count * sizeof(*tx);
6930
6931         if (count < 1 || count > 0xffff)
6932                 return -EINVAL;
6933
6934         tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6935         if (!tx) {
6936                 tx = vzalloc(sz);
6937                 if (!tx)
6938                         return -ENOMEM;
6939         }
6940         dev->_tx = tx;
6941
6942         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6943         spin_lock_init(&dev->tx_global_lock);
6944
6945         return 0;
6946 }
6947
6948 void netif_tx_stop_all_queues(struct net_device *dev)
6949 {
6950         unsigned int i;
6951
6952         for (i = 0; i < dev->num_tx_queues; i++) {
6953                 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
6954                 netif_tx_stop_queue(txq);
6955         }
6956 }
6957 EXPORT_SYMBOL(netif_tx_stop_all_queues);
6958
6959 /**
6960  *      register_netdevice      - register a network device
6961  *      @dev: device to register
6962  *
6963  *      Take a completed network device structure and add it to the kernel
6964  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6965  *      chain. 0 is returned on success. A negative errno code is returned
6966  *      on a failure to set up the device, or if the name is a duplicate.
6967  *
6968  *      Callers must hold the rtnl semaphore. You may want
6969  *      register_netdev() instead of this.
6970  *
6971  *      BUGS:
6972  *      The locking appears insufficient to guarantee two parallel registers
6973  *      will not get the same name.
6974  */
6975
6976 int register_netdevice(struct net_device *dev)
6977 {
6978         int ret;
6979         struct net *net = dev_net(dev);
6980
6981         BUG_ON(dev_boot_phase);
6982         ASSERT_RTNL();
6983
6984         might_sleep();
6985
6986         /* When net_device's are persistent, this will be fatal. */
6987         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6988         BUG_ON(!net);
6989
6990         spin_lock_init(&dev->addr_list_lock);
6991         netdev_set_addr_lockdep_class(dev);
6992
6993         ret = dev_get_valid_name(net, dev, dev->name);
6994         if (ret < 0)
6995                 goto out;
6996
6997         /* Init, if this function is available */
6998         if (dev->netdev_ops->ndo_init) {
6999                 ret = dev->netdev_ops->ndo_init(dev);
7000                 if (ret) {
7001                         if (ret > 0)
7002                                 ret = -EIO;
7003                         goto out;
7004                 }
7005         }
7006
7007         if (((dev->hw_features | dev->features) &
7008              NETIF_F_HW_VLAN_CTAG_FILTER) &&
7009             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
7010              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
7011                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
7012                 ret = -EINVAL;
7013                 goto err_uninit;
7014         }
7015
7016         ret = -EBUSY;
7017         if (!dev->ifindex)
7018                 dev->ifindex = dev_new_index(net);
7019         else if (__dev_get_by_index(net, dev->ifindex))
7020                 goto err_uninit;
7021
7022         /* Transfer changeable features to wanted_features and enable
7023          * software offloads (GSO and GRO).
7024          */
7025         dev->hw_features |= NETIF_F_SOFT_FEATURES;
7026         dev->features |= NETIF_F_SOFT_FEATURES;
7027         dev->wanted_features = dev->features & dev->hw_features;
7028
7029         if (!(dev->flags & IFF_LOOPBACK))
7030                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
7031
7032         /* If IPv4 TCP segmentation offload is supported we should also
7033          * allow the device to enable segmenting the frame with the option
7034          * of ignoring a static IP ID value.  This doesn't enable the
7035          * feature itself but allows the user to enable it later.
7036          */
7037         if (dev->hw_features & NETIF_F_TSO)
7038                 dev->hw_features |= NETIF_F_TSO_MANGLEID;
7039         if (dev->vlan_features & NETIF_F_TSO)
7040                 dev->vlan_features |= NETIF_F_TSO_MANGLEID;
7041         if (dev->mpls_features & NETIF_F_TSO)
7042                 dev->mpls_features |= NETIF_F_TSO_MANGLEID;
7043         if (dev->hw_enc_features & NETIF_F_TSO)
7044                 dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
7045
7046         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
7047          */
7048         dev->vlan_features |= NETIF_F_HIGHDMA;
7049
7050         /* Make NETIF_F_SG inheritable to tunnel devices.
7051          */
7052         dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
7053
7054         /* Make NETIF_F_SG inheritable to MPLS.
7055          */
7056         dev->mpls_features |= NETIF_F_SG;
7057
7058         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
7059         ret = notifier_to_errno(ret);
7060         if (ret)
7061                 goto err_uninit;
7062
7063         ret = netdev_register_kobject(dev);
7064         if (ret)
7065                 goto err_uninit;
7066         dev->reg_state = NETREG_REGISTERED;
7067
7068         __netdev_update_features(dev);
7069
7070         /*
7071          *      Default initial state at registry is that the
7072          *      device is present.
7073          */
7074
7075         set_bit(__LINK_STATE_PRESENT, &dev->state);
7076
7077         linkwatch_init_dev(dev);
7078
7079         dev_init_scheduler(dev);
7080         dev_hold(dev);
7081         list_netdevice(dev);
7082         add_device_randomness(dev->dev_addr, dev->addr_len);
7083
7084         /* If the device has permanent device address, driver should
7085          * set dev_addr and also addr_assign_type should be set to
7086          * NET_ADDR_PERM (default value).
7087          */
7088         if (dev->addr_assign_type == NET_ADDR_PERM)
7089                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
7090
7091         /* Notify protocols that a new device appeared. */
7092         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
7093         ret = notifier_to_errno(ret);
7094         if (ret) {
7095                 rollback_registered(dev);
7096                 dev->reg_state = NETREG_UNREGISTERED;
7097         }
7098         /*
7099          *      Prevent userspace races by waiting until the network
7100          *      device is fully set up before sending notifications.
7101          */
7102         if (!dev->rtnl_link_ops ||
7103             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7104                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7105
7106 out:
7107         return ret;
7108
7109 err_uninit:
7110         if (dev->netdev_ops->ndo_uninit)
7111                 dev->netdev_ops->ndo_uninit(dev);
7112         goto out;
7113 }
7114 EXPORT_SYMBOL(register_netdevice);
7115
7116 /**
7117  *      init_dummy_netdev       - init a dummy network device for NAPI
7118  *      @dev: device to init
7119  *
7120  *      This takes a network device structure and initializes the minimum
7121  *      number of fields so it can be used to schedule NAPI polls without
7122  *      registering a full blown interface. This is to be used by drivers
7123  *      that need to tie several hardware interfaces to a single NAPI
7124  *      poll scheduler due to HW limitations.
7125  */
7126 int init_dummy_netdev(struct net_device *dev)
7127 {
7128         /* Clear everything. Note we don't initialize spinlocks
7129          * as they aren't supposed to be taken by any of the
7130          * NAPI code and this dummy netdev is supposed to be
7131          * only ever used for NAPI polls
7132          */
7133         memset(dev, 0, sizeof(struct net_device));
7134
7135         /* make sure we BUG if trying to hit standard
7136          * register/unregister code path
7137          */
7138         dev->reg_state = NETREG_DUMMY;
7139
7140         /* NAPI wants this */
7141         INIT_LIST_HEAD(&dev->napi_list);
7142
7143         /* a dummy interface is started by default */
7144         set_bit(__LINK_STATE_PRESENT, &dev->state);
7145         set_bit(__LINK_STATE_START, &dev->state);
7146
7147         /* Note : We don't allocate pcpu_refcnt for dummy devices,
7148          * because users of this 'device' don't need to change
7149          * its refcount.
7150          */
7151
7152         return 0;
7153 }
7154 EXPORT_SYMBOL_GPL(init_dummy_netdev);
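
/* Example (illustrative sketch, not part of this file): a hypothetical
 * driver whose hardware funnels several ports through one interrupt
 * backs its shared NAPI context with a dummy netdev rather than a
 * registered interface.  my_adapter and my_poll are made-up names.
 *
 *      struct my_adapter {
 *              struct net_device napi_dev;
 *              struct napi_struct napi;
 *      };
 *
 *      init_dummy_netdev(&adapter->napi_dev);
 *      netif_napi_add(&adapter->napi_dev, &adapter->napi, my_poll,
 *                     NAPI_POLL_WEIGHT);
 */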
7155
7156
7157 /**
7158  *      register_netdev - register a network device
7159  *      @dev: device to register
7160  *
7161  *      Take a completed network device structure and add it to the kernel
7162  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7163  *      chain. 0 is returned on success. A negative errno code is returned
7164  *      on a failure to set up the device, or if the name is a duplicate.
7165  *
7166  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
7167  *      and expands the device name if you passed a format string to
7168  *      alloc_netdev.
7169  */
7170 int register_netdev(struct net_device *dev)
7171 {
7172         int err;
7173
7174         rtnl_lock();
7175         err = register_netdevice(dev);
7176         rtnl_unlock();
7177         return err;
7178 }
7179 EXPORT_SYMBOL(register_netdev);
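
/* Example (illustrative sketch, not part of this file): the tail end of a
 * hypothetical Ethernet driver's probe routine.  my_priv and
 * my_netdev_ops stand in for the driver's own structures.
 *
 *      dev = alloc_etherdev(sizeof(struct my_priv));
 *      if (!dev)
 *              return -ENOMEM;
 *      dev->netdev_ops = &my_netdev_ops;
 *      err = register_netdev(dev);
 *      if (err) {
 *              free_netdev(dev);
 *              return err;
 *      }
 */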
7180
7181 int netdev_refcnt_read(const struct net_device *dev)
7182 {
7183         int i, refcnt = 0;
7184
7185         for_each_possible_cpu(i)
7186                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
7187         return refcnt;
7188 }
7189 EXPORT_SYMBOL(netdev_refcnt_read);
7190
7191 /**
7192  * netdev_wait_allrefs - wait until all references are gone.
7193  * @dev: target net_device
7194  *
7195  * This is called when unregistering network devices.
7196  *
7197  * Any protocol or device that holds a reference should register
7198  * for netdevice notification, and cleanup and put back the
7199  * reference if they receive an UNREGISTER event.
7200  * We can get stuck here if buggy protocols don't correctly
7201  * call dev_put.
7202  */
7203 static void netdev_wait_allrefs(struct net_device *dev)
7204 {
7205         unsigned long rebroadcast_time, warning_time;
7206         int refcnt;
7207
7208         linkwatch_forget_dev(dev);
7209
7210         rebroadcast_time = warning_time = jiffies;
7211         refcnt = netdev_refcnt_read(dev);
7212
7213         while (refcnt != 0) {
7214                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
7215                         rtnl_lock();
7216
7217                         /* Rebroadcast unregister notification */
7218                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7219
7220                         __rtnl_unlock();
7221                         rcu_barrier();
7222                         rtnl_lock();
7223
7224                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7225                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
7226                                      &dev->state)) {
7227                                 /* We must not have linkwatch events
7228                                  * pending on unregister. If this
7229                                  * happens, we simply run the queue
7230                                  * unscheduled, resulting in a noop
7231                                  * for this device.
7232                                  */
7233                                 linkwatch_run_queue();
7234                         }
7235
7236                         __rtnl_unlock();
7237
7238                         rebroadcast_time = jiffies;
7239                 }
7240
7241                 msleep(250);
7242
7243                 refcnt = netdev_refcnt_read(dev);
7244
7245                 if (time_after(jiffies, warning_time + 10 * HZ)) {
7246                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
7247                                  dev->name, refcnt);
7248                         warning_time = jiffies;
7249                 }
7250         }
7251 }
7252
7253 /* The sequence is:
7254  *
7255  *      rtnl_lock();
7256  *      ...
7257  *      register_netdevice(x1);
7258  *      register_netdevice(x2);
7259  *      ...
7260  *      unregister_netdevice(y1);
7261  *      unregister_netdevice(y2);
7262  *      ...
7263  *      rtnl_unlock();
7264  *      free_netdev(y1);
7265  *      free_netdev(y2);
7266  *
7267  * We are invoked by rtnl_unlock().
7268  * This allows us to deal with problems:
7269  * 1) We can delete sysfs objects which invoke hotplug
7270  *    without deadlocking with linkwatch via keventd.
7271  * 2) Since we run with the RTNL semaphore not held, we can sleep
7272  *    safely in order to wait for the netdev refcnt to drop to zero.
7273  *
7274  * We must not return until all unregister events added during
7275  * the interval the lock was held have been completed.
7276  */
7277 void netdev_run_todo(void)
7278 {
7279         struct list_head list;
7280
7281         /* Snapshot list, allow later requests */
7282         list_replace_init(&net_todo_list, &list);
7283
7284         __rtnl_unlock();
7285
7286
7287         /* Wait for rcu callbacks to finish before next phase */
7288         if (!list_empty(&list))
7289                 rcu_barrier();
7290
7291         while (!list_empty(&list)) {
7292                 struct net_device *dev
7293                         = list_first_entry(&list, struct net_device, todo_list);
7294                 list_del(&dev->todo_list);
7295
7296                 rtnl_lock();
7297                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7298                 __rtnl_unlock();
7299
7300                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7301                         pr_err("network todo '%s' but state %d\n",
7302                                dev->name, dev->reg_state);
7303                         dump_stack();
7304                         continue;
7305                 }
7306
7307                 dev->reg_state = NETREG_UNREGISTERED;
7308
7309                 netdev_wait_allrefs(dev);
7310
7311                 /* paranoia */
7312                 BUG_ON(netdev_refcnt_read(dev));
7313                 BUG_ON(!list_empty(&dev->ptype_all));
7314                 BUG_ON(!list_empty(&dev->ptype_specific));
7315                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
7316                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
7317                 WARN_ON(dev->dn_ptr);
7318
7319                 if (dev->destructor)
7320                         dev->destructor(dev);
7321
7322                 /* Report a network device has been unregistered */
7323                 rtnl_lock();
7324                 dev_net(dev)->dev_unreg_count--;
7325                 __rtnl_unlock();
7326                 wake_up(&netdev_unregistering_wq);
7327
7328                 /* Free network device */
7329                 kobject_put(&dev->dev.kobj);
7330         }
7331 }
7332
7333 /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
7334  * all the same fields in the same order as net_device_stats, with only
7335  * the type differing, but rtnl_link_stats64 may have additional fields
7336  * at the end for newer counters.
7337  */
7338 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
7339                              const struct net_device_stats *netdev_stats)
7340 {
7341 #if BITS_PER_LONG == 64
7342         BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
7343         memcpy(stats64, netdev_stats, sizeof(*stats64));
7344         /* zero out counters that only exist in rtnl_link_stats64 */
7345         memset((char *)stats64 + sizeof(*netdev_stats), 0,
7346                sizeof(*stats64) - sizeof(*netdev_stats));
7347 #else
7348         size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
7349         const unsigned long *src = (const unsigned long *)netdev_stats;
7350         u64 *dst = (u64 *)stats64;
7351
7352         BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
7353         for (i = 0; i < n; i++)
7354                 dst[i] = src[i];
7355         /* zero out counters that only exist in rtnl_link_stats64 */
7356         memset((char *)stats64 + n * sizeof(u64), 0,
7357                sizeof(*stats64) - n * sizeof(u64));
7358 #endif
7359 }
7360 EXPORT_SYMBOL(netdev_stats_to_stats64);
7361
7362 /**
7363  *      dev_get_stats   - get network device statistics
7364  *      @dev: device to get statistics from
7365  *      @storage: place to store stats
7366  *
7367  *      Get network statistics from device. Return @storage.
7368  *      The device driver may provide its own method by setting
7369  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
7370  *      otherwise the internal statistics structure is used.
7371  */
7372 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
7373                                         struct rtnl_link_stats64 *storage)
7374 {
7375         const struct net_device_ops *ops = dev->netdev_ops;
7376
7377         if (ops->ndo_get_stats64) {
7378                 memset(storage, 0, sizeof(*storage));
7379                 ops->ndo_get_stats64(dev, storage);
7380         } else if (ops->ndo_get_stats) {
7381                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
7382         } else {
7383                 netdev_stats_to_stats64(storage, &dev->stats);
7384         }
7385         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
7386         storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
7387         storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler);
7388         return storage;
7389 }
7390 EXPORT_SYMBOL(dev_get_stats);
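
/* Example (illustrative sketch, not part of this file): snapshotting a
 * device's counters into a caller-provided structure, as the rtnetlink
 * and procfs paths do.
 *
 *      struct rtnl_link_stats64 stats;
 *
 *      dev_get_stats(dev, &stats);
 *      pr_debug("%s: %llu rx packets, %llu tx packets\n", dev->name,
 *               (unsigned long long)stats.rx_packets,
 *               (unsigned long long)stats.tx_packets);
 */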
7391
7392 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
7393 {
7394         struct netdev_queue *queue = dev_ingress_queue(dev);
7395
7396 #ifdef CONFIG_NET_CLS_ACT
7397         if (queue)
7398                 return queue;
7399         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
7400         if (!queue)
7401                 return NULL;
7402         netdev_init_one_queue(dev, queue, NULL);
7403         RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
7404         queue->qdisc_sleeping = &noop_qdisc;
7405         rcu_assign_pointer(dev->ingress_queue, queue);
7406 #endif
7407         return queue;
7408 }
7409
7410 static const struct ethtool_ops default_ethtool_ops;
7411
7412 void netdev_set_default_ethtool_ops(struct net_device *dev,
7413                                     const struct ethtool_ops *ops)
7414 {
7415         if (dev->ethtool_ops == &default_ethtool_ops)
7416                 dev->ethtool_ops = ops;
7417 }
7418 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
7419
7420 void netdev_freemem(struct net_device *dev)
7421 {
7422         char *addr = (char *)dev - dev->padded;
7423
7424         kvfree(addr);
7425 }
7426
7427 /**
7428  *      alloc_netdev_mqs - allocate network device
7429  *      @sizeof_priv:           size of private data to allocate space for
7430  *      @name:                  device name format string
7431  *      @name_assign_type:      origin of device name
7432  *      @setup:                 callback to initialize device
7433  *      @txqs:                  the number of TX subqueues to allocate
7434  *      @rxqs:                  the number of RX subqueues to allocate
7435  *
7436  *      Allocates a struct net_device with private data area for driver use
7437  *      and performs basic initialization.  Also allocates subqueue structs
7438  *      for each queue on the device.
7439  */
7440 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7441                 unsigned char name_assign_type,
7442                 void (*setup)(struct net_device *),
7443                 unsigned int txqs, unsigned int rxqs)
7444 {
7445         struct net_device *dev;
7446         size_t alloc_size;
7447         struct net_device *p;
7448
7449         BUG_ON(strlen(name) >= sizeof(dev->name));
7450
7451         if (txqs < 1) {
7452                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
7453                 return NULL;
7454         }
7455
7456 #ifdef CONFIG_SYSFS
7457         if (rxqs < 1) {
7458                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
7459                 return NULL;
7460         }
7461 #endif
7462
7463         alloc_size = sizeof(struct net_device);
7464         if (sizeof_priv) {
7465                 /* ensure 32-byte alignment of private area */
7466                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
7467                 alloc_size += sizeof_priv;
7468         }
7469         /* ensure 32-byte alignment of whole construct */
7470         alloc_size += NETDEV_ALIGN - 1;
7471
7472         p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7473         if (!p)
7474                 p = vzalloc(alloc_size);
7475         if (!p)
7476                 return NULL;
7477
7478         dev = PTR_ALIGN(p, NETDEV_ALIGN);
7479         dev->padded = (char *)dev - (char *)p;
7480
7481         dev->pcpu_refcnt = alloc_percpu(int);
7482         if (!dev->pcpu_refcnt)
7483                 goto free_dev;
7484
7485         if (dev_addr_init(dev))
7486                 goto free_pcpu;
7487
7488         dev_mc_init(dev);
7489         dev_uc_init(dev);
7490
7491         dev_net_set(dev, &init_net);
7492
7493         dev->gso_max_size = GSO_MAX_SIZE;
7494         dev->gso_max_segs = GSO_MAX_SEGS;
7495
7496         INIT_LIST_HEAD(&dev->napi_list);
7497         INIT_LIST_HEAD(&dev->unreg_list);
7498         INIT_LIST_HEAD(&dev->close_list);
7499         INIT_LIST_HEAD(&dev->link_watch_list);
7500         INIT_LIST_HEAD(&dev->adj_list.upper);
7501         INIT_LIST_HEAD(&dev->adj_list.lower);
7502         INIT_LIST_HEAD(&dev->all_adj_list.upper);
7503         INIT_LIST_HEAD(&dev->all_adj_list.lower);
7504         INIT_LIST_HEAD(&dev->ptype_all);
7505         INIT_LIST_HEAD(&dev->ptype_specific);
7506         dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
7507         setup(dev);
7508
7509         if (!dev->tx_queue_len) {
7510                 dev->priv_flags |= IFF_NO_QUEUE;
7511                 dev->tx_queue_len = 1;
7512         }
7513
7514         dev->num_tx_queues = txqs;
7515         dev->real_num_tx_queues = txqs;
7516         if (netif_alloc_netdev_queues(dev))
7517                 goto free_all;
7518
7519 #ifdef CONFIG_SYSFS
7520         dev->num_rx_queues = rxqs;
7521         dev->real_num_rx_queues = rxqs;
7522         if (netif_alloc_rx_queues(dev))
7523                 goto free_all;
7524 #endif
7525
7526         strcpy(dev->name, name);
7527         dev->name_assign_type = name_assign_type;
7528         dev->group = INIT_NETDEV_GROUP;
7529         if (!dev->ethtool_ops)
7530                 dev->ethtool_ops = &default_ethtool_ops;
7531
7532         nf_hook_ingress_init(dev);
7533
7534         return dev;
7535
7536 free_all:
7537         free_netdev(dev);
7538         return NULL;
7539
7540 free_pcpu:
7541         free_percpu(dev->pcpu_refcnt);
7542 free_dev:
7543         netdev_freemem(dev);
7544         return NULL;
7545 }
7546 EXPORT_SYMBOL(alloc_netdev_mqs);
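
/* Example (illustrative sketch, not part of this file): allocating a
 * four-queue Ethernet-like device with private data.  ether_setup()
 * fills in the Ethernet defaults and the "myeth%d" format string is
 * expanded when the device is registered; struct my_priv is made up.
 *
 *      dev = alloc_netdev_mqs(sizeof(struct my_priv), "myeth%d",
 *                             NET_NAME_UNKNOWN, ether_setup, 4, 4);
 *      if (!dev)
 *              return -ENOMEM;
 */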
7547
7548 /**
7549  *      free_netdev - free network device
7550  *      @dev: device
7551  *
7552  *      This function does the last stage of destroying an allocated device
7553  *      interface. The reference to the device object is released.
7554  *      If this is the last reference then it will be freed.
7555  *      Must be called in process context.
7556  */
7557 void free_netdev(struct net_device *dev)
7558 {
7559         struct napi_struct *p, *n;
7560
7561         might_sleep();
7562         netif_free_tx_queues(dev);
7563 #ifdef CONFIG_SYSFS
7564         kvfree(dev->_rx);
7565 #endif
7566
7567         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7568
7569         /* Flush device addresses */
7570         dev_addr_flush(dev);
7571
7572         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7573                 netif_napi_del(p);
7574
7575         free_percpu(dev->pcpu_refcnt);
7576         dev->pcpu_refcnt = NULL;
7577
7578         /*  Compatibility with error handling in drivers */
7579         if (dev->reg_state == NETREG_UNINITIALIZED) {
7580                 netdev_freemem(dev);
7581                 return;
7582         }
7583
7584         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7585         dev->reg_state = NETREG_RELEASED;
7586
7587         /* will free via device release */
7588         put_device(&dev->dev);
7589 }
7590 EXPORT_SYMBOL(free_netdev);
7591
7592 /**
7593  *      synchronize_net -  Synchronize with packet receive processing
7594  *
7595  *      Wait for packets currently being received to be done.
7596  *      Does not block later packets from starting.
7597  */
7598 void synchronize_net(void)
7599 {
7600         might_sleep();
7601         if (rtnl_is_locked())
7602                 synchronize_rcu_expedited();
7603         else
7604                 synchronize_rcu();
7605 }
7606 EXPORT_SYMBOL(synchronize_net);
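
/*
 * Minimal usage sketch: unpublishing an object referenced from the packet
 * receive path.  Once synchronize_net() returns, no receive that was
 * already in progress can still see the old pointer, so it is safe to
 * free it.  "example_hook" and friends are made-up names.
 */
struct example_hook {
	void (*rx)(struct sk_buff *skb);
};

static struct example_hook __rcu *example_hook_ptr;

static void example_hook_remove(void)
{
	struct example_hook *old;

	old = rcu_dereference_protected(example_hook_ptr, 1);
	RCU_INIT_POINTER(example_hook_ptr, NULL);
	synchronize_net();		/* wait for in-flight receivers */
	kfree(old);
}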
7607
7608 /**
7609  *      unregister_netdevice_queue - remove device from the kernel
7610  *      @dev: device
7611  *      @head: list
7612  *
7613  *      This function shuts down a device interface and removes it
7614  *      from the kernel tables.
7615  *      If head is not NULL, the device is queued to be unregistered later.
7616  *
7617  *      Callers must hold the rtnl semaphore.  You may want
7618  *      unregister_netdev() instead of this.
7619  */
7620
7621 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7622 {
7623         ASSERT_RTNL();
7624
7625         if (head) {
7626                 list_move_tail(&dev->unreg_list, head);
7627         } else {
7628                 rollback_registered(dev);
7629                 /* Finish processing unregister after unlock */
7630                 net_set_todo(dev);
7631         }
7632 }
7633 EXPORT_SYMBOL(unregister_netdevice_queue);
7634
7635 /**
7636  *      unregister_netdevice_many - unregister many devices
7637  *      @head: list of devices
7638  *
7639  *  Note: As most callers use a stack-allocated list_head,
7640  *  we force a list_del() to make sure the stack won't be corrupted later.
7641  */
7642 void unregister_netdevice_many(struct list_head *head)
7643 {
7644         struct net_device *dev;
7645
7646         if (!list_empty(head)) {
7647                 rollback_registered_many(head);
7648                 list_for_each_entry(dev, head, unreg_list)
7649                         net_set_todo(dev);
7650                 list_del(head);
7651         }
7652 }
7653 EXPORT_SYMBOL(unregister_netdevice_many);
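
/*
 * Minimal usage sketch: batching several unregistrations so the expensive
 * synchronization in rollback_registered_many() is paid once for the
 * whole group.  The list head lives on the caller's stack, which is why
 * unregister_netdevice_many() ends with list_del().
 * "example_destroy_all" and the devs[] array are made up for illustration.
 */
static void example_destroy_all(struct net_device *devs[], unsigned int n)
{
	LIST_HEAD(kill_list);
	unsigned int i;

	rtnl_lock();
	for (i = 0; i < n; i++)
		unregister_netdevice_queue(devs[i], &kill_list);
	unregister_netdevice_many(&kill_list);
	rtnl_unlock();
}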
7654
7655 /**
7656  *      unregister_netdev - remove device from the kernel
7657  *      @dev: device
7658  *
7659  *      This function shuts down a device interface and removes it
7660  *      from the kernel tables.
7661  *
7662  *      This is just a wrapper for unregister_netdevice that takes
7663  *      the rtnl semaphore.  In general you want to use this and not
7664  *      unregister_netdevice.
7665  */
7666 void unregister_netdev(struct net_device *dev)
7667 {
7668         rtnl_lock();
7669         unregister_netdevice(dev);
7670         rtnl_unlock();
7671 }
7672 EXPORT_SYMBOL(unregister_netdev);
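
/*
 * Minimal usage sketch: a typical driver teardown.  unregister_netdev()
 * takes the rtnl lock itself; free_netdev() then drops the reference to
 * the device object.  Both must be called from process context.
 * "example_driver_exit" is a made-up name.
 */
static void example_driver_exit(struct net_device *dev)
{
	unregister_netdev(dev);
	free_netdev(dev);
}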
7673
7674 /**
7675  *      dev_change_net_namespace - move a device to a different network namespace
7676  *      @dev: device
7677  *      @net: network namespace
7678  *      @pat: If not NULL name pattern to try if the current device name
7679  *            is already taken in the destination network namespace.
7680  *
7681  *      This function shuts down a device interface and moves it
7682  *      to a new network namespace. On success 0 is returned, on
7683  *      failure a negative errno code is returned.
7684  *
7685  *      Callers must hold the rtnl semaphore.
7686  */
7687
7688 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7689 {
7690         int err;
7691
7692         ASSERT_RTNL();
7693
7694         /* Don't allow namespace local devices to be moved. */
7695         err = -EINVAL;
7696         if (dev->features & NETIF_F_NETNS_LOCAL)
7697                 goto out;
7698
7699         /* Ensure the device has been registered */
7700         if (dev->reg_state != NETREG_REGISTERED)
7701                 goto out;
7702
7703         /* Get out if there is nothing to do */
7704         err = 0;
7705         if (net_eq(dev_net(dev), net))
7706                 goto out;
7707
7708         /* Pick the destination device name, and ensure
7709          * we can use it in the destination network namespace.
7710          */
7711         err = -EEXIST;
7712         if (__dev_get_by_name(net, dev->name)) {
7713                 /* We get here if we can't use the current device name */
7714                 if (!pat)
7715                         goto out;
7716                 if (dev_get_valid_name(net, dev, pat) < 0)
7717                         goto out;
7718         }
7719
7720         /*
7721          * And now a mini version of register_netdevice and unregister_netdevice.
7722          */
7723
7724         /* If the device is running, close it first. */
7725         dev_close(dev);
7726
7727         /* And unlink it from device chain */
7728         err = -ENODEV;
7729         unlist_netdevice(dev);
7730
7731         synchronize_net();
7732
7733         /* Shutdown queueing discipline. */
7734         dev_shutdown(dev);
7735
7736         /* Notify protocols that we are about to destroy this device.
7737            They should clean up all of their state.
7738
7739            Note that dev->reg_state stays at NETREG_REGISTERED.
7740            This is intentional: it lets 8021q and macvlan know that
7741            the device is just moving, so they can keep their slaves up.
7742         */
7743         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7744         rcu_barrier();
7745         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7746         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7747
7748         /*
7749          *      Flush the unicast and multicast chains
7750          */
7751         dev_uc_flush(dev);
7752         dev_mc_flush(dev);
7753
7754         /* Send a netdev-removed uevent to the old namespace */
7755         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7756         netdev_adjacent_del_links(dev);
7757
7758         /* Actually switch the network namespace */
7759         dev_net_set(dev, net);
7760
7761         /* If there is an ifindex conflict, assign a new one */
7762         if (__dev_get_by_index(net, dev->ifindex))
7763                 dev->ifindex = dev_new_index(net);
7764
7765         /* Send a netdev-add uevent to the new namespace */
7766         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7767         netdev_adjacent_add_links(dev);
7768
7769         /* Fixup kobjects */
7770         err = device_rename(&dev->dev, dev->name);
7771         WARN_ON(err);
7772
7773         /* Add the device back in the hashes */
7774         list_netdevice(dev);
7775
7776         /* Notify protocols, that a new device appeared. */
7777         call_netdevice_notifiers(NETDEV_REGISTER, dev);
7778
7779         /*
7780          *      Prevent userspace races by waiting until the network
7781          *      device is fully setup before sending notifications.
7782          */
7783         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7784
7785         synchronize_net();
7786         err = 0;
7787 out:
7788         return err;
7789 }
7790 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
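
/*
 * Minimal usage sketch: moving a device into a namespace identified by an
 * open netns file descriptor (e.g. /proc/<pid>/ns/net), assuming the
 * caller is allowed to do so.  RTNL must be held around the call; the
 * "moved%d" pattern is only used if the current name is already taken in
 * the target namespace.  "example_move_dev" is a made-up helper name.
 */
static int example_move_dev(struct net_device *dev, int netns_fd)
{
	struct net *net = get_net_ns_by_fd(netns_fd);
	int err;

	if (IS_ERR(net))
		return PTR_ERR(net);

	rtnl_lock();
	err = dev_change_net_namespace(dev, net, "moved%d");
	rtnl_unlock();

	put_net(net);
	return err;
}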
7791
7792 static int dev_cpu_callback(struct notifier_block *nfb,
7793                             unsigned long action,
7794                             void *ocpu)
7795 {
7796         struct sk_buff **list_skb;
7797         struct sk_buff *skb;
7798         unsigned int cpu, oldcpu = (unsigned long)ocpu;
7799         struct softnet_data *sd, *oldsd;
7800
7801         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7802                 return NOTIFY_OK;
7803
7804         local_irq_disable();
7805         cpu = smp_processor_id();
7806         sd = &per_cpu(softnet_data, cpu);
7807         oldsd = &per_cpu(softnet_data, oldcpu);
7808
7809         /* Find end of our completion_queue. */
7810         list_skb = &sd->completion_queue;
7811         while (*list_skb)
7812                 list_skb = &(*list_skb)->next;
7813         /* Append completion queue from offline CPU. */
7814         *list_skb = oldsd->completion_queue;
7815         oldsd->completion_queue = NULL;
7816
7817         /* Append output queue from offline CPU. */
7818         if (oldsd->output_queue) {
7819                 *sd->output_queue_tailp = oldsd->output_queue;
7820                 sd->output_queue_tailp = oldsd->output_queue_tailp;
7821                 oldsd->output_queue = NULL;
7822                 oldsd->output_queue_tailp = &oldsd->output_queue;
7823         }
7824         /* Append NAPI poll list from offline CPU, with one exception:
7825          * process_backlog() must be called by the CPU owning the per-CPU backlog.
7826          * We properly handle process_queue & input_pkt_queue later.
7827          */
7828         while (!list_empty(&oldsd->poll_list)) {
7829                 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7830                                                             struct napi_struct,
7831                                                             poll_list);
7832
7833                 list_del_init(&napi->poll_list);
7834                 if (napi->poll == process_backlog)
7835                         napi->state = 0;
7836                 else
7837                         ____napi_schedule(sd, napi);
7838         }
7839
7840         raise_softirq_irqoff(NET_TX_SOFTIRQ);
7841         local_irq_enable();
7842
7843         /* Process offline CPU's input_pkt_queue */
7844         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7845                 netif_rx_ni(skb);
7846                 input_queue_head_incr(oldsd);
7847         }
7848         while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
7849                 netif_rx_ni(skb);
7850                 input_queue_head_incr(oldsd);
7851         }
7852
7853         return NOTIFY_OK;
7854 }
7855
7856
7857 /**
7858  *      netdev_increment_features - increment feature set by one
7859  *      @all: current feature set
7860  *      @one: new feature set
7861  *      @mask: mask feature set
7862  *
7863  *      Computes a new feature set after adding a device with feature set
7864  *      @one to the master device with current feature set @all.  Will not
7865  *      enable anything that is off in @mask. Returns the new feature set.
7866  */
7867 netdev_features_t netdev_increment_features(netdev_features_t all,
7868         netdev_features_t one, netdev_features_t mask)
7869 {
7870         if (mask & NETIF_F_HW_CSUM)
7871                 mask |= NETIF_F_CSUM_MASK;
7872         mask |= NETIF_F_VLAN_CHALLENGED;
7873
7874         all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
7875         all &= one | ~NETIF_F_ALL_FOR_ALL;
7876
7877         /* If one device supports hw checksumming, set for all. */
7878         if (all & NETIF_F_HW_CSUM)
7879                 all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
7880
7881         return all;
7882 }
7883 EXPORT_SYMBOL(netdev_increment_features);
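
/*
 * Minimal usage sketch: how a bonding/team-style master might fold its
 * slaves' feature sets into one, in the spirit of bond_compute_features().
 * "example_master_features" and the slaves[] array are made up; the mask
 * limits which bits the slaves may enable on the master.
 */
static netdev_features_t
example_master_features(struct net_device *slaves[], unsigned int n,
			netdev_features_t mask)
{
	netdev_features_t all = NETIF_F_ALL_FOR_ALL & mask;
	unsigned int i;

	for (i = 0; i < n; i++)
		all = netdev_increment_features(all, slaves[i]->features, mask);

	return all;
}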
7884
7885 static struct hlist_head * __net_init netdev_create_hash(void)
7886 {
7887         int i;
7888         struct hlist_head *hash;
7889
7890         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7891         if (hash != NULL)
7892                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
7893                         INIT_HLIST_HEAD(&hash[i]);
7894
7895         return hash;
7896 }
7897
7898 /* Initialize per network namespace state */
7899 static int __net_init netdev_init(struct net *net)
7900 {
7901         if (net != &init_net)
7902                 INIT_LIST_HEAD(&net->dev_base_head);
7903
7904         net->dev_name_head = netdev_create_hash();
7905         if (net->dev_name_head == NULL)
7906                 goto err_name;
7907
7908         net->dev_index_head = netdev_create_hash();
7909         if (net->dev_index_head == NULL)
7910                 goto err_idx;
7911
7912         return 0;
7913
7914 err_idx:
7915         kfree(net->dev_name_head);
7916 err_name:
7917         return -ENOMEM;
7918 }
7919
7920 /**
7921  *      netdev_drivername - network driver for the device
7922  *      @dev: network device
7923  *
7924  *      Determine network driver for device.
7925  */
7926 const char *netdev_drivername(const struct net_device *dev)
7927 {
7928         const struct device_driver *driver;
7929         const struct device *parent;
7930         const char *empty = "";
7931
7932         parent = dev->dev.parent;
7933         if (!parent)
7934                 return empty;
7935
7936         driver = parent->driver;
7937         if (driver && driver->name)
7938                 return driver->name;
7939         return empty;
7940 }
7941
7942 static void __netdev_printk(const char *level, const struct net_device *dev,
7943                             struct va_format *vaf)
7944 {
7945         if (dev && dev->dev.parent) {
7946                 dev_printk_emit(level[1] - '0',
7947                                 dev->dev.parent,
7948                                 "%s %s %s%s: %pV",
7949                                 dev_driver_string(dev->dev.parent),
7950                                 dev_name(dev->dev.parent),
7951                                 netdev_name(dev), netdev_reg_state(dev),
7952                                 vaf);
7953         } else if (dev) {
7954                 printk("%s%s%s: %pV",
7955                        level, netdev_name(dev), netdev_reg_state(dev), vaf);
7956         } else {
7957                 printk("%s(NULL net_device): %pV", level, vaf);
7958         }
7959 }
7960
7961 void netdev_printk(const char *level, const struct net_device *dev,
7962                    const char *format, ...)
7963 {
7964         struct va_format vaf;
7965         va_list args;
7966
7967         va_start(args, format);
7968
7969         vaf.fmt = format;
7970         vaf.va = &args;
7971
7972         __netdev_printk(level, dev, &vaf);
7973
7974         va_end(args);
7975 }
7976 EXPORT_SYMBOL(netdev_printk);
7977
7978 #define define_netdev_printk_level(func, level)                 \
7979 void func(const struct net_device *dev, const char *fmt, ...)   \
7980 {                                                               \
7981         struct va_format vaf;                                   \
7982         va_list args;                                           \
7983                                                                 \
7984         va_start(args, fmt);                                    \
7985                                                                 \
7986         vaf.fmt = fmt;                                          \
7987         vaf.va = &args;                                         \
7988                                                                 \
7989         __netdev_printk(level, dev, &vaf);                      \
7990                                                                 \
7991         va_end(args);                                           \
7992 }                                                               \
7993 EXPORT_SYMBOL(func);
7994
7995 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7996 define_netdev_printk_level(netdev_alert, KERN_ALERT);
7997 define_netdev_printk_level(netdev_crit, KERN_CRIT);
7998 define_netdev_printk_level(netdev_err, KERN_ERR);
7999 define_netdev_printk_level(netdev_warn, KERN_WARNING);
8000 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
8001 define_netdev_printk_level(netdev_info, KERN_INFO);
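
/*
 * Minimal usage sketch: the level helpers generated above behave like
 * dev_info()/dev_err() but prefix the message with the driver, bus and
 * interface name.  "example_report_link" is a made-up helper.
 */
static void example_report_link(struct net_device *dev, bool up)
{
	if (up)
		netdev_info(dev, "link is up\n");
	else
		netdev_warn(dev, "link is down\n");
}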
8002
8003 static void __net_exit netdev_exit(struct net *net)
8004 {
8005         kfree(net->dev_name_head);
8006         kfree(net->dev_index_head);
8007 }
8008
8009 static struct pernet_operations __net_initdata netdev_net_ops = {
8010         .init = netdev_init,
8011         .exit = netdev_exit,
8012 };
8013
8014 static void __net_exit default_device_exit(struct net *net)
8015 {
8016         struct net_device *dev, *aux;
8017         /*
8018          * Push all migratable network devices back to the
8019          * initial network namespace
8020          */
8021         rtnl_lock();
8022         for_each_netdev_safe(net, dev, aux) {
8023                 int err;
8024                 char fb_name[IFNAMSIZ];
8025
8026                 /* Ignore unmovable devices (e.g. loopback) */
8027                 if (dev->features & NETIF_F_NETNS_LOCAL)
8028                         continue;
8029
8030                 /* Leave virtual devices for the generic cleanup */
8031                 if (dev->rtnl_link_ops)
8032                         continue;
8033
8034                 /* Push remaining network devices to init_net */
8035                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
8036                 err = dev_change_net_namespace(dev, &init_net, fb_name);
8037                 if (err) {
8038                         pr_emerg("%s: failed to move %s to init_net: %d\n",
8039                                  __func__, dev->name, err);
8040                         BUG();
8041                 }
8042         }
8043         rtnl_unlock();
8044 }
8045
8046 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
8047 {
8048         /* Return with the rtnl_lock held when there are no network
8049          * devices unregistering in any network namespace in net_list.
8050          */
8051         struct net *net;
8052         bool unregistering;
8053         DEFINE_WAIT_FUNC(wait, woken_wake_function);
8054
8055         add_wait_queue(&netdev_unregistering_wq, &wait);
8056         for (;;) {
8057                 unregistering = false;
8058                 rtnl_lock();
8059                 list_for_each_entry(net, net_list, exit_list) {
8060                         if (net->dev_unreg_count > 0) {
8061                                 unregistering = true;
8062                                 break;
8063                         }
8064                 }
8065                 if (!unregistering)
8066                         break;
8067                 __rtnl_unlock();
8068
8069                 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
8070         }
8071         remove_wait_queue(&netdev_unregistering_wq, &wait);
8072 }
8073
8074 static void __net_exit default_device_exit_batch(struct list_head *net_list)
8075 {
8076         /* At exit all network devices must be removed from a network
8077          * namespace.  Do this in the reverse order of registration.
8078          * Do this across as many network namespaces as possible to
8079          * improve batching efficiency.
8080          */
8081         struct net_device *dev;
8082         struct net *net;
8083         LIST_HEAD(dev_kill_list);
8084
8085         /* To prevent network device cleanup code from dereferencing
8086          * loopback devices or network devices that have been freed,
8087          * wait here for all pending unregistrations to complete
8088          * before unregistering the loopback device and allowing the
8089          * network namespace to be freed.
8090          *
8091          * The netdev todo list containing all network device
8092          * unregistrations that happen in default_device_exit_batch
8093          * will run in the rtnl_unlock() at the end of
8094          * default_device_exit_batch.
8095          */
8096         rtnl_lock_unregistering(net_list);
8097         list_for_each_entry(net, net_list, exit_list) {
8098                 for_each_netdev_reverse(net, dev) {
8099                         if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
8100                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
8101                         else
8102                                 unregister_netdevice_queue(dev, &dev_kill_list);
8103                 }
8104         }
8105         unregister_netdevice_many(&dev_kill_list);
8106         rtnl_unlock();
8107 }
8108
8109 static struct pernet_operations __net_initdata default_device_ops = {
8110         .exit = default_device_exit,
8111         .exit_batch = default_device_exit_batch,
8112 };
8113
8114 /*
8115  *      Initialize the DEV module. At boot time this walks the device list and
8116  *      unhooks any devices that fail to initialise (normally hardware not
8117  *      present) and leaves us with a valid list of present and active devices.
8118  *
8119  */
8120
8121 /*
8122  *       This is called single threaded during boot, so no need
8123  *       to take the rtnl semaphore.
8124  */
8125 static int __init net_dev_init(void)
8126 {
8127         int i, rc = -ENOMEM;
8128
8129         BUG_ON(!dev_boot_phase);
8130
8131         if (dev_proc_init())
8132                 goto out;
8133
8134         if (netdev_kobject_init())
8135                 goto out;
8136
8137         INIT_LIST_HEAD(&ptype_all);
8138         for (i = 0; i < PTYPE_HASH_SIZE; i++)
8139                 INIT_LIST_HEAD(&ptype_base[i]);
8140
8141         INIT_LIST_HEAD(&offload_base);
8142
8143         if (register_pernet_subsys(&netdev_net_ops))
8144                 goto out;
8145
8146         /*
8147          *      Initialise the packet receive queues.
8148          */
8149
8150         for_each_possible_cpu(i) {
8151                 struct softnet_data *sd = &per_cpu(softnet_data, i);
8152
8153                 skb_queue_head_init(&sd->input_pkt_queue);
8154                 skb_queue_head_init(&sd->process_queue);
8155                 INIT_LIST_HEAD(&sd->poll_list);
8156                 sd->output_queue_tailp = &sd->output_queue;
8157 #ifdef CONFIG_RPS
8158                 sd->csd.func = rps_trigger_softirq;
8159                 sd->csd.info = sd;
8160                 sd->cpu = i;
8161 #endif
8162
8163                 sd->backlog.poll = process_backlog;
8164                 sd->backlog.weight = weight_p;
8165         }
8166
8167         dev_boot_phase = 0;
8168
8169         /* The loopback device is special: if any other network device
8170          * is present in a network namespace, the loopback device must
8171          * be present as well. Since we now dynamically allocate and free
8172          * the loopback device, maintain this invariant by keeping the
8173          * loopback device as the first device on the list of network
8174          * devices.  This ensures that the loopback device is the first
8175          * device that appears and the last network device that
8176          * disappears.
8177          */
8178         if (register_pernet_device(&loopback_net_ops))
8179                 goto out;
8180
8181         if (register_pernet_device(&default_device_ops))
8182                 goto out;
8183
8184         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
8185         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
8186
8187         hotcpu_notifier(dev_cpu_callback, 0);
8188         dst_subsys_init();
8189         rc = 0;
8190 out:
8191         return rc;
8192 }
8193
8194 subsys_initcall(net_dev_init);