]> git.karo-electronics.de Git - karo-tx-linux.git/blob - net/core/dev.c
ip6gre: Add support for basic offloads offloads excluding GSO
[karo-tx-linux.git] / net / core / dev.c
1 /*
2  *      NET3    Protocol independent device support routines.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  *      Derived from the non IP parts of dev.c 1.0.19
10  *              Authors:        Ross Biro
11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *      Additional Authors:
15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
17  *              David Hinds <dahinds@users.sourceforge.net>
18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *              Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *      Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *                                      to 2 if register_netdev gets called
25  *                                      before net_dev_init & also removed a
26  *                                      few lines of code in the process.
27  *              Alan Cox        :       device private ioctl copies fields back.
28  *              Alan Cox        :       Transmit queue code does relevant
29  *                                      stunts to keep the queue safe.
30  *              Alan Cox        :       Fixed double lock.
31  *              Alan Cox        :       Fixed promisc NULL pointer trap
32  *              ????????        :       Support the full private ioctl range
33  *              Alan Cox        :       Moved ioctl permission check into
34  *                                      drivers
35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
36  *              Alan Cox        :       100 backlog just doesn't cut it when
37  *                                      you start doing multicast video 8)
38  *              Alan Cox        :       Rewrote net_bh and list manager.
39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
40  *              Alan Cox        :       Took out transmit every packet pass
41  *                                      Saved a few bytes in the ioctl handler
42  *              Alan Cox        :       Network driver sets packet type before
43  *                                      calling netif_rx. Saves a function
44  *                                      call a packet.
45  *              Alan Cox        :       Hashed net_bh()
46  *              Richard Kooijman:       Timestamp fixes.
47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
48  *              Alan Cox        :       Device lock protection.
49  *              Alan Cox        :       Fixed nasty side effect of device close
50  *                                      changes.
51  *              Rudi Cilibrasi  :       Pass the right thing to
52  *                                      set_mac_address()
53  *              Dave Miller     :       32bit quantity for the device lock to
54  *                                      make it work out on a Sparc.
55  *              Bjorn Ekwall    :       Added KERNELD hack.
56  *              Alan Cox        :       Cleaned up the backlog initialise.
57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
58  *                                      1 device.
59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
60  *                                      is no device open function.
61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
63  *              Cyrus Durgin    :       Cleaned for KMOD
64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
65  *                                      A network device unload needs to purge
66  *                                      the backlog queue.
67  *      Paul Rusty Russell      :       SIOCSIFNAME
68  *              Pekka Riikonen  :       Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *                                      indefinitely on dev->refcnt
71  *              J Hadi Salim    :       - Backlog queue sampling
72  *                                      - netif_rx() feedback
73  */
74
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
98 #include <net/sock.h>
99 #include <net/busy_poll.h>
100 #include <linux/rtnetlink.h>
101 #include <linux/stat.h>
102 #include <net/dst.h>
103 #include <net/dst_metadata.h>
104 #include <net/pkt_sched.h>
105 #include <net/checksum.h>
106 #include <net/xfrm.h>
107 #include <linux/highmem.h>
108 #include <linux/init.h>
109 #include <linux/module.h>
110 #include <linux/netpoll.h>
111 #include <linux/rcupdate.h>
112 #include <linux/delay.h>
113 #include <net/iw_handler.h>
114 #include <asm/current.h>
115 #include <linux/audit.h>
116 #include <linux/dmaengine.h>
117 #include <linux/err.h>
118 #include <linux/ctype.h>
119 #include <linux/if_arp.h>
120 #include <linux/if_vlan.h>
121 #include <linux/ip.h>
122 #include <net/ip.h>
123 #include <net/mpls.h>
124 #include <linux/ipv6.h>
125 #include <linux/in.h>
126 #include <linux/jhash.h>
127 #include <linux/random.h>
128 #include <trace/events/napi.h>
129 #include <trace/events/net.h>
130 #include <trace/events/skb.h>
131 #include <linux/pci.h>
132 #include <linux/inetdevice.h>
133 #include <linux/cpu_rmap.h>
134 #include <linux/static_key.h>
135 #include <linux/hashtable.h>
136 #include <linux/vmalloc.h>
137 #include <linux/if_macvlan.h>
138 #include <linux/errqueue.h>
139 #include <linux/hrtimer.h>
140 #include <linux/netfilter_ingress.h>
141 #include <linux/sctp.h>
142
143 #include "net-sysfs.h"
144
145 /* Instead of increasing this, you should create a hash table. */
146 #define MAX_GRO_SKBS 8
147
148 /* This should be increased if a protocol with a bigger head is added. */
149 #define GRO_MAX_HEAD (MAX_HEADER + 128)
150
151 static DEFINE_SPINLOCK(ptype_lock);
152 static DEFINE_SPINLOCK(offload_lock);
153 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
154 struct list_head ptype_all __read_mostly;       /* Taps */
155 static struct list_head offload_base __read_mostly;
156
157 static int netif_rx_internal(struct sk_buff *skb);
158 static int call_netdevice_notifiers_info(unsigned long val,
159                                          struct net_device *dev,
160                                          struct netdev_notifier_info *info);
161
162 /*
163  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
164  * semaphore.
165  *
166  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
167  *
168  * Writers must hold the rtnl semaphore while they loop through the
169  * dev_base_head list, and hold dev_base_lock for writing when they do the
170  * actual updates.  This allows pure readers to access the list even
171  * while a writer is preparing to update it.
172  *
173  * To put it another way, dev_base_lock is held for writing only to
174  * protect against pure readers; the rtnl semaphore provides the
175  * protection against other writers.
176  *
177  * See, for example usages, register_netdevice() and
178  * unregister_netdevice(), which must be called with the rtnl
179  * semaphore held.
180  */
181 DEFINE_RWLOCK(dev_base_lock);
182 EXPORT_SYMBOL(dev_base_lock);
183
184 /* protects napi_hash addition/deletion and napi_gen_id */
185 static DEFINE_SPINLOCK(napi_hash_lock);
186
187 static unsigned int napi_gen_id = NR_CPUS;
188 static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
189
190 static seqcount_t devnet_rename_seq;
191
192 static inline void dev_base_seq_inc(struct net *net)
193 {
194         while (++net->dev_base_seq == 0);
195 }
196
197 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
198 {
199         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
200
201         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
202 }
203
204 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
205 {
206         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
207 }
208
/*
 * Take the spinlock embedded in @sd's input_pkt_queue.  Compiles to
 * nothing when CONFIG_RPS is not set.
 */
static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        spinlock_t *queue_lock = &sd->input_pkt_queue.lock;

        spin_lock(queue_lock);
#endif
}
215
/*
 * Release the spinlock embedded in @sd's input_pkt_queue.  Compiles to
 * nothing when CONFIG_RPS is not set.
 */
static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        spinlock_t *queue_lock = &sd->input_pkt_queue.lock;

        spin_unlock(queue_lock);
#endif
}
222
223 /* Device list insertion */
224 static void list_netdevice(struct net_device *dev)
225 {
226         struct net *net = dev_net(dev);
227
228         ASSERT_RTNL();
229
230         write_lock_bh(&dev_base_lock);
231         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
232         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
233         hlist_add_head_rcu(&dev->index_hlist,
234                            dev_index_hash(net, dev->ifindex));
235         write_unlock_bh(&dev_base_lock);
236
237         dev_base_seq_inc(net);
238 }
239
240 /* Device list removal
241  * caller must respect a RCU grace period before freeing/reusing dev
242  */
243 static void unlist_netdevice(struct net_device *dev)
244 {
245         ASSERT_RTNL();
246
247         /* Unlink dev from the device chain */
248         write_lock_bh(&dev_base_lock);
249         list_del_rcu(&dev->dev_list);
250         hlist_del_rcu(&dev->name_hlist);
251         hlist_del_rcu(&dev->index_hlist);
252         write_unlock_bh(&dev_base_lock);
253
254         dev_base_seq_inc(dev_net(dev));
255 }
256
257 /*
258  *      Our notifier list
259  */
260
261 static RAW_NOTIFIER_HEAD(netdev_chain);
262
263 /*
264  *      Device drivers call our routines to queue packets here. We empty the
265  *      queue in the local softnet handler.
266  */
267
268 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
269 EXPORT_PER_CPU_SYMBOL(softnet_data);
270
271 #ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 *
 * netdev_lock_type[] and netdev_lock_name[] are parallel tables: the
 * entry at a given index in one corresponds to the entry at the same
 * index in the other, and the same index selects the key in
 * netdev_xmit_lock_key[] / netdev_addr_lock_key[].  Any addition must be
 * made to both tables at the same position.  The final entry
 * (ARPHRD_NONE / "_xmit_NONE") is the fallback used by netdev_lock_pos()
 * for device types that are not listed.
 */
static const unsigned short netdev_lock_type[] =
        {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
         ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
         ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
         ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
         ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
         ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
         ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
         ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
         ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
         ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
         ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
         ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
         ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
         ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
         ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

/* Lockdep class names, index-aligned with netdev_lock_type[]. */
static const char *const netdev_lock_name[] =
        {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
         "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
         "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
         "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
         "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
         "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
         "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
         "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
         "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
         "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
         "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
         "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
         "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
         "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
         "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

/* One lockdep key per table entry, for the xmit lock and the address list lock. */
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
312
313 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
314 {
315         int i;
316
317         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
318                 if (netdev_lock_type[i] == dev_type)
319                         return i;
320         /* the last key is used by default */
321         return ARRAY_SIZE(netdev_lock_type) - 1;
322 }
323
324 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
325                                                  unsigned short dev_type)
326 {
327         int i;
328
329         i = netdev_lock_pos(dev_type);
330         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
331                                    netdev_lock_name[i]);
332 }
333
334 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
335 {
336         int i;
337
338         i = netdev_lock_pos(dev->type);
339         lockdep_set_class_and_name(&dev->addr_list_lock,
340                                    &netdev_addr_lock_key[i],
341                                    netdev_lock_name[i]);
342 }
343 #else
/* Without CONFIG_LOCKDEP there is no lock class to set: empty stub. */
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
                                                 unsigned short dev_type)
{
}
/* Without CONFIG_LOCKDEP there is no lock class to set: empty stub. */
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
351 #endif
352
353 /*******************************************************************************
354
355                 Protocol management and registration routines
356
357 *******************************************************************************/
358
359 /*
360  *      Add a protocol ID to the list. Now that the input handler is
361  *      smarter we can dispense with all the messy stuff that used to be
362  *      here.
363  *
364  *      BEWARE!!! Protocol handlers, mangling input packets,
365  *      MUST BE last in hash buckets and checking protocol handlers
366  *      MUST start from promiscuous ptype_all chain in net_bh.
367  *      It is true now, do not change it.
368  *      Explanation follows: if protocol handler, mangling packet, will
369  *      be the first on list, it is not able to sense, that packet
370  *      is cloned and should be copied-on-write, so that it will
371  *      change it and subsequent readers will get broken packet.
372  *                                                      --ANK (980803)
373  */
374
375 static inline struct list_head *ptype_head(const struct packet_type *pt)
376 {
377         if (pt->type == htons(ETH_P_ALL))
378                 return pt->dev ? &pt->dev->ptype_all : &ptype_all;
379         else
380                 return pt->dev ? &pt->dev->ptype_specific :
381                                  &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
382 }
383
384 /**
385  *      dev_add_pack - add packet handler
386  *      @pt: packet type declaration
387  *
388  *      Add a protocol handler to the networking stack. The passed &packet_type
389  *      is linked into kernel lists and may not be freed until it has been
390  *      removed from the kernel lists.
391  *
392  *      This call does not sleep therefore it can not
393  *      guarantee all CPU's that are in middle of receiving packets
394  *      will see the new packet type (until the next received packet).
395  */
396
397 void dev_add_pack(struct packet_type *pt)
398 {
399         struct list_head *head = ptype_head(pt);
400
401         spin_lock(&ptype_lock);
402         list_add_rcu(&pt->list, head);
403         spin_unlock(&ptype_lock);
404 }
405 EXPORT_SYMBOL(dev_add_pack);
406
407 /**
408  *      __dev_remove_pack        - remove packet handler
409  *      @pt: packet type declaration
410  *
411  *      Remove a protocol handler that was previously added to the kernel
412  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
413  *      from the kernel lists and can be freed or reused once this function
414  *      returns.
415  *
416  *      The packet type might still be in use by receivers
417  *      and must not be freed until after all the CPU's have gone
418  *      through a quiescent state.
419  */
420 void __dev_remove_pack(struct packet_type *pt)
421 {
422         struct list_head *head = ptype_head(pt);
423         struct packet_type *pt1;
424
425         spin_lock(&ptype_lock);
426
427         list_for_each_entry(pt1, head, list) {
428                 if (pt == pt1) {
429                         list_del_rcu(&pt->list);
430                         goto out;
431                 }
432         }
433
434         pr_warn("dev_remove_pack: %p not found\n", pt);
435 out:
436         spin_unlock(&ptype_lock);
437 }
438 EXPORT_SYMBOL(__dev_remove_pack);
439
/**
 *      dev_remove_pack  - remove packet handler
 *      @pt: packet type declaration
 *
 *      Unlink a protocol handler that was previously registered with
 *      dev_add_pack(), via __dev_remove_pack(), and then call
 *      synchronize_net().  Once this function returns, @pt can be freed
 *      or reused.
 *
 *      This call sleeps to guarantee that no CPU is looking at the
 *      packet type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
        /* Unlink first ... */
        __dev_remove_pack(pt);

        /* ... then wait until no receiver can still hold a reference. */
        synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
459
460
461 /**
462  *      dev_add_offload - register offload handlers
463  *      @po: protocol offload declaration
464  *
465  *      Add protocol offload handlers to the networking stack. The passed
466  *      &proto_offload is linked into kernel lists and may not be freed until
467  *      it has been removed from the kernel lists.
468  *
469  *      This call does not sleep therefore it can not
470  *      guarantee all CPU's that are in middle of receiving packets
471  *      will see the new offload handlers (until the next received packet).
472  */
473 void dev_add_offload(struct packet_offload *po)
474 {
475         struct packet_offload *elem;
476
477         spin_lock(&offload_lock);
478         list_for_each_entry(elem, &offload_base, list) {
479                 if (po->priority < elem->priority)
480                         break;
481         }
482         list_add_rcu(&po->list, elem->list.prev);
483         spin_unlock(&offload_lock);
484 }
485 EXPORT_SYMBOL(dev_add_offload);
486
487 /**
488  *      __dev_remove_offload     - remove offload handler
489  *      @po: packet offload declaration
490  *
491  *      Remove a protocol offload handler that was previously added to the
492  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
493  *      is removed from the kernel lists and can be freed or reused once this
494  *      function returns.
495  *
496  *      The packet type might still be in use by receivers
497  *      and must not be freed until after all the CPU's have gone
498  *      through a quiescent state.
499  */
500 static void __dev_remove_offload(struct packet_offload *po)
501 {
502         struct list_head *head = &offload_base;
503         struct packet_offload *po1;
504
505         spin_lock(&offload_lock);
506
507         list_for_each_entry(po1, head, list) {
508                 if (po == po1) {
509                         list_del_rcu(&po->list);
510                         goto out;
511                 }
512         }
513
514         pr_warn("dev_remove_offload: %p not found\n", po);
515 out:
516         spin_unlock(&offload_lock);
517 }
518
/**
 *      dev_remove_offload       - remove packet offload handler
 *      @po: packet offload declaration
 *
 *      Unlink a packet offload handler that was previously registered
 *      with dev_add_offload(), via __dev_remove_offload(), and then call
 *      synchronize_net().  Once this function returns, @po can be freed
 *      or reused.
 *
 *      This call sleeps to guarantee that no CPU is looking at the
 *      offload after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
        /* Unlink first ... */
        __dev_remove_offload(po);

        /* ... then wait until no receiver can still hold a reference. */
        synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);
538
539 /******************************************************************************
540
541                       Device Boot-time Settings Routines
542
543 *******************************************************************************/
544
545 /* Boot time configuration table */
546 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
547
548 /**
549  *      netdev_boot_setup_add   - add new setup entry
550  *      @name: name of the device
551  *      @map: configured settings for the device
552  *
553  *      Adds new setup entry to the dev_boot_setup list.  The function
554  *      returns 0 on error and 1 on success.  This is a generic routine to
555  *      all netdevices.
556  */
557 static int netdev_boot_setup_add(char *name, struct ifmap *map)
558 {
559         struct netdev_boot_setup *s;
560         int i;
561
562         s = dev_boot_setup;
563         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
564                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
565                         memset(s[i].name, 0, sizeof(s[i].name));
566                         strlcpy(s[i].name, name, IFNAMSIZ);
567                         memcpy(&s[i].map, map, sizeof(s[i].map));
568                         break;
569                 }
570         }
571
572         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
573 }
574
575 /**
576  *      netdev_boot_setup_check - check boot time settings
577  *      @dev: the netdevice
578  *
579  *      Check boot time settings for the device.
580  *      The found settings are set for the device to be used
581  *      later in the device probing.
582  *      Returns 0 if no settings found, 1 if they are.
583  */
584 int netdev_boot_setup_check(struct net_device *dev)
585 {
586         struct netdev_boot_setup *s = dev_boot_setup;
587         int i;
588
589         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
590                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
591                     !strcmp(dev->name, s[i].name)) {
592                         dev->irq        = s[i].map.irq;
593                         dev->base_addr  = s[i].map.base_addr;
594                         dev->mem_start  = s[i].map.mem_start;
595                         dev->mem_end    = s[i].map.mem_end;
596                         return 1;
597                 }
598         }
599         return 0;
600 }
601 EXPORT_SYMBOL(netdev_boot_setup_check);
602
603
604 /**
605  *      netdev_boot_base        - get address from boot time settings
606  *      @prefix: prefix for network device
607  *      @unit: id for network device
608  *
609  *      Check boot time settings for the base address of device.
610  *      The found settings are set for the device to be used
611  *      later in the device probing.
612  *      Returns 0 if no settings found.
613  */
614 unsigned long netdev_boot_base(const char *prefix, int unit)
615 {
616         const struct netdev_boot_setup *s = dev_boot_setup;
617         char name[IFNAMSIZ];
618         int i;
619
620         sprintf(name, "%s%d", prefix, unit);
621
622         /*
623          * If device already registered then return base of 1
624          * to indicate not to probe for this interface
625          */
626         if (__dev_get_by_name(&init_net, name))
627                 return 1;
628
629         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
630                 if (!strcmp(name, s[i].name))
631                         return s[i].map.base_addr;
632         return 0;
633 }
634
635 /*
636  * Saves at boot time configured settings for any netdevice.
637  */
638 int __init netdev_boot_setup(char *str)
639 {
640         int ints[5];
641         struct ifmap map;
642
643         str = get_options(str, ARRAY_SIZE(ints), ints);
644         if (!str || !*str)
645                 return 0;
646
647         /* Save settings */
648         memset(&map, 0, sizeof(map));
649         if (ints[0] > 0)
650                 map.irq = ints[1];
651         if (ints[0] > 1)
652                 map.base_addr = ints[2];
653         if (ints[0] > 2)
654                 map.mem_start = ints[3];
655         if (ints[0] > 3)
656                 map.mem_end = ints[4];
657
658         /* Add new entry to the list */
659         return netdev_boot_setup_add(str, &map);
660 }
661
662 __setup("netdev=", netdev_boot_setup);
663
664 /*******************************************************************************
665
666                             Device Interface Subroutines
667
668 *******************************************************************************/
669
670 /**
671  *      dev_get_iflink  - get 'iflink' value of a interface
672  *      @dev: targeted interface
673  *
674  *      Indicates the ifindex the interface is linked to.
675  *      Physical interfaces have the same 'ifindex' and 'iflink' values.
676  */
677
678 int dev_get_iflink(const struct net_device *dev)
679 {
680         if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
681                 return dev->netdev_ops->ndo_get_iflink(dev);
682
683         return dev->ifindex;
684 }
685 EXPORT_SYMBOL(dev_get_iflink);
686
687 /**
688  *      dev_fill_metadata_dst - Retrieve tunnel egress information.
689  *      @dev: targeted interface
690  *      @skb: The packet.
691  *
692  *      For better visibility of tunnel traffic OVS needs to retrieve
693  *      egress tunnel information for a packet. Following API allows
694  *      user to get this info.
695  */
696 int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
697 {
698         struct ip_tunnel_info *info;
699
700         if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
701                 return -EINVAL;
702
703         info = skb_tunnel_info_unclone(skb);
704         if (!info)
705                 return -ENOMEM;
706         if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
707                 return -EINVAL;
708
709         return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
710 }
711 EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
712
713 /**
714  *      __dev_get_by_name       - find a device by its name
715  *      @net: the applicable net namespace
716  *      @name: name to find
717  *
718  *      Find an interface by name. Must be called under RTNL semaphore
719  *      or @dev_base_lock. If the name is found a pointer to the device
720  *      is returned. If the name is not found then %NULL is returned. The
721  *      reference counters are not incremented so the caller must be
722  *      careful with locks.
723  */
724
725 struct net_device *__dev_get_by_name(struct net *net, const char *name)
726 {
727         struct net_device *dev;
728         struct hlist_head *head = dev_name_hash(net, name);
729
730         hlist_for_each_entry(dev, head, name_hlist)
731                 if (!strncmp(dev->name, name, IFNAMSIZ))
732                         return dev;
733
734         return NULL;
735 }
736 EXPORT_SYMBOL(__dev_get_by_name);
737
738 /**
739  *      dev_get_by_name_rcu     - find a device by its name
740  *      @net: the applicable net namespace
741  *      @name: name to find
742  *
743  *      Find an interface by name.
744  *      If the name is found a pointer to the device is returned.
745  *      If the name is not found then %NULL is returned.
746  *      The reference counters are not incremented so the caller must be
747  *      careful with locks. The caller must hold RCU lock.
748  */
749
750 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
751 {
752         struct net_device *dev;
753         struct hlist_head *head = dev_name_hash(net, name);
754
755         hlist_for_each_entry_rcu(dev, head, name_hlist)
756                 if (!strncmp(dev->name, name, IFNAMSIZ))
757                         return dev;
758
759         return NULL;
760 }
761 EXPORT_SYMBOL(dev_get_by_name_rcu);
762
763 /**
764  *      dev_get_by_name         - find a device by its name
765  *      @net: the applicable net namespace
766  *      @name: name to find
767  *
768  *      Find an interface by name. This can be called from any
769  *      context and does its own locking. The returned handle has
770  *      the usage count incremented and the caller must use dev_put() to
771  *      release it when it is no longer needed. %NULL is returned if no
772  *      matching device is found.
773  */
774
775 struct net_device *dev_get_by_name(struct net *net, const char *name)
776 {
777         struct net_device *dev;
778
779         rcu_read_lock();
780         dev = dev_get_by_name_rcu(net, name);
781         if (dev)
782                 dev_hold(dev);
783         rcu_read_unlock();
784         return dev;
785 }
786 EXPORT_SYMBOL(dev_get_by_name);
787
788 /**
789  *      __dev_get_by_index - find a device by its ifindex
790  *      @net: the applicable net namespace
791  *      @ifindex: index of device
792  *
793  *      Search for an interface by index. Returns %NULL if the device
794  *      is not found or a pointer to the device. The device has not
795  *      had its reference counter increased so the caller must be careful
796  *      about locking. The caller must hold either the RTNL semaphore
797  *      or @dev_base_lock.
798  */
799
800 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
801 {
802         struct net_device *dev;
803         struct hlist_head *head = dev_index_hash(net, ifindex);
804
805         hlist_for_each_entry(dev, head, index_hlist)
806                 if (dev->ifindex == ifindex)
807                         return dev;
808
809         return NULL;
810 }
811 EXPORT_SYMBOL(__dev_get_by_index);
812
813 /**
814  *      dev_get_by_index_rcu - find a device by its ifindex
815  *      @net: the applicable net namespace
816  *      @ifindex: index of device
817  *
818  *      Search for an interface by index. Returns %NULL if the device
819  *      is not found or a pointer to the device. The device has not
820  *      had its reference counter increased so the caller must be careful
821  *      about locking. The caller must hold RCU lock.
822  */
823
824 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
825 {
826         struct net_device *dev;
827         struct hlist_head *head = dev_index_hash(net, ifindex);
828
829         hlist_for_each_entry_rcu(dev, head, index_hlist)
830                 if (dev->ifindex == ifindex)
831                         return dev;
832
833         return NULL;
834 }
835 EXPORT_SYMBOL(dev_get_by_index_rcu);
836
837
838 /**
839  *      dev_get_by_index - find a device by its ifindex
840  *      @net: the applicable net namespace
841  *      @ifindex: index of device
842  *
843  *      Search for an interface by index. Returns NULL if the device
844  *      is not found or a pointer to the device. The device returned has
845  *      had a reference added and the pointer is safe until the user calls
846  *      dev_put to indicate they have finished with it.
847  */
848
849 struct net_device *dev_get_by_index(struct net *net, int ifindex)
850 {
851         struct net_device *dev;
852
853         rcu_read_lock();
854         dev = dev_get_by_index_rcu(net, ifindex);
855         if (dev)
856                 dev_hold(dev);
857         rcu_read_unlock();
858         return dev;
859 }
860 EXPORT_SYMBOL(dev_get_by_index);
861
862 /**
863  *      netdev_get_name - get a netdevice name, knowing its ifindex.
864  *      @net: network namespace
865  *      @name: a pointer to the buffer where the name will be stored.
866  *      @ifindex: the ifindex of the interface to get the name from.
867  *
868  *      The use of raw_seqcount_begin() and cond_resched() before
869  *      retrying is required as we want to give the writers a chance
870  *      to complete when CONFIG_PREEMPT is not set.
871  */
872 int netdev_get_name(struct net *net, char *name, int ifindex)
873 {
874         struct net_device *dev;
875         unsigned int seq;
876
877 retry:
878         seq = raw_seqcount_begin(&devnet_rename_seq);
879         rcu_read_lock();
880         dev = dev_get_by_index_rcu(net, ifindex);
881         if (!dev) {
882                 rcu_read_unlock();
883                 return -ENODEV;
884         }
885
886         strcpy(name, dev->name);
887         rcu_read_unlock();
888         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
889                 cond_resched();
890                 goto retry;
891         }
892
893         return 0;
894 }
895
896 /**
897  *      dev_getbyhwaddr_rcu - find a device by its hardware address
898  *      @net: the applicable net namespace
899  *      @type: media type of device
900  *      @ha: hardware address
901  *
902  *      Search for an interface by MAC address. Returns NULL if the device
903  *      is not found or a pointer to the device.
904  *      The caller must hold RCU or RTNL.
905  *      The returned device has not had its ref count increased
906  *      and the caller must therefore be careful about locking
907  *
908  */
909
910 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
911                                        const char *ha)
912 {
913         struct net_device *dev;
914
915         for_each_netdev_rcu(net, dev)
916                 if (dev->type == type &&
917                     !memcmp(dev->dev_addr, ha, dev->addr_len))
918                         return dev;
919
920         return NULL;
921 }
922 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
923
924 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
925 {
926         struct net_device *dev;
927
928         ASSERT_RTNL();
929         for_each_netdev(net, dev)
930                 if (dev->type == type)
931                         return dev;
932
933         return NULL;
934 }
935 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
936
937 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
938 {
939         struct net_device *dev, *ret = NULL;
940
941         rcu_read_lock();
942         for_each_netdev_rcu(net, dev)
943                 if (dev->type == type) {
944                         dev_hold(dev);
945                         ret = dev;
946                         break;
947                 }
948         rcu_read_unlock();
949         return ret;
950 }
951 EXPORT_SYMBOL(dev_getfirstbyhwtype);
952
953 /**
954  *      __dev_get_by_flags - find any device with given flags
955  *      @net: the applicable net namespace
956  *      @if_flags: IFF_* values
957  *      @mask: bitmask of bits in if_flags to check
958  *
959  *      Search for any interface with the given flags. Returns NULL if a device
960  *      is not found or a pointer to the device. Must be called inside
961  *      rtnl_lock(), and result refcount is unchanged.
962  */
963
964 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
965                                       unsigned short mask)
966 {
967         struct net_device *dev, *ret;
968
969         ASSERT_RTNL();
970
971         ret = NULL;
972         for_each_netdev(net, dev) {
973                 if (((dev->flags ^ if_flags) & mask) == 0) {
974                         ret = dev;
975                         break;
976                 }
977         }
978         return ret;
979 }
980 EXPORT_SYMBOL(__dev_get_by_flags);
981
/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: candidate name string
 *
 *	Network device names need to be valid file names to allow sysfs
 *	to work, so the empty string, "." and ".." are rejected, as is any
 *	name containing '/'. Names containing ':' or any kind of
 *	whitespace are disallowed as well, and the name including its
 *	terminating NUL has to fit in IFNAMSIZ bytes.
 *
 *	Returns true if @name is acceptable, false otherwise.
 */
bool dev_valid_name(const char *name)
{
	if (*name == '\0')
		return false;
	/*
	 * Bounded length check: a name that is too long (or not NUL
	 * terminated at all) is rejected without scanning more than
	 * IFNAMSIZ bytes of it.
	 */
	if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
		return false;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return false;

	while (*name) {
		if (*name == '/' || *name == ':' || isspace(*name))
			return false;
		name++;
	}
	return true;
}
EXPORT_SYMBOL(dev_valid_name);
1007
/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf:  scratch buffer and result name string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *
 *	On success the chosen name is left in @buf and the unit number is
 *	returned (0 when @name contains no '%'). On failure a negative errno
 *	is returned: -EINVAL for a malformed format, -ENOMEM if the bitmap
 *	page cannot be allocated, -ENFILE if the resulting name is already
 *	in use.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	/* Only the first IFNAMSIZ-1 characters are searched for a '%'. */
	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be exactly one "%d" and no other
		 * "%" characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		/* Mark the unit number of every existing device that fits the pattern. */
		for_each_netdev(net, d) {
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/*  avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		/* Lowest unused unit; equals max_netdevices when every bit is set. */
		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	/* Render the final name unless the caller passed @name itself as @buf. */
	if (buf != name)
		snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}
1073
1074 /**
1075  *      dev_alloc_name - allocate a name for a device
1076  *      @dev: device
1077  *      @name: name format string
1078  *
1079  *      Passed a format string - eg "lt%d" it will try and find a suitable
1080  *      id. It scans list of devices to build up a free map, then chooses
1081  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1082  *      while allocating the name and adding the device in order to avoid
1083  *      duplicates.
1084  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1085  *      Returns the number of the unit assigned or a negative errno code.
1086  */
1087
1088 int dev_alloc_name(struct net_device *dev, const char *name)
1089 {
1090         char buf[IFNAMSIZ];
1091         struct net *net;
1092         int ret;
1093
1094         BUG_ON(!dev_net(dev));
1095         net = dev_net(dev);
1096         ret = __dev_alloc_name(net, name, buf);
1097         if (ret >= 0)
1098                 strlcpy(dev->name, buf, IFNAMSIZ);
1099         return ret;
1100 }
1101 EXPORT_SYMBOL(dev_alloc_name);
1102
1103 static int dev_alloc_name_ns(struct net *net,
1104                              struct net_device *dev,
1105                              const char *name)
1106 {
1107         char buf[IFNAMSIZ];
1108         int ret;
1109
1110         ret = __dev_alloc_name(net, name, buf);
1111         if (ret >= 0)
1112                 strlcpy(dev->name, buf, IFNAMSIZ);
1113         return ret;
1114 }
1115
1116 static int dev_get_valid_name(struct net *net,
1117                               struct net_device *dev,
1118                               const char *name)
1119 {
1120         BUG_ON(!net);
1121
1122         if (!dev_valid_name(name))
1123                 return -EINVAL;
1124
1125         if (strchr(name, '%'))
1126                 return dev_alloc_name_ns(net, dev, name);
1127         else if (__dev_get_by_name(net, name))
1128                 return -EEXIST;
1129         else if (dev->name != name)
1130                 strlcpy(dev->name, name, IFNAMSIZ);
1131
1132         return 0;
1133 }
1134
/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: new name (or format string); the buffer must be at least
 *		  IFNAMSIZ bytes long, because IFNAMSIZ bytes are copied out
 *		  of it when the rename has to be rolled back
 *
 *	Change name of a device, can pass format strings "eth%d"
 *	for wildcarding. Must be called under RTNL. Fails with -EBUSY
 *	while the device is up.
 *
 *	Returns a non-negative value on success (0 when @newname already
 *	is the current name) or a negative errno. If a NETDEV_CHANGENAME
 *	notifier rejects the change, the old name is restored and the
 *	notifier's errno is returned.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	unsigned char old_assign_type;
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	if (dev->flags & IFF_UP)
		return -EBUSY;

	/*
	 * devnet_rename_seq is the sequence counter netdev_get_name()
	 * samples; every exit below that happens before the rename is
	 * committed has to end the write section again.
	 */
	write_seqcount_begin(&devnet_rename_seq);

	/* Renaming to the current name is a successful no-op. */
	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
		write_seqcount_end(&devnet_rename_seq);
		return 0;
	}

	/* Remember the current name for logging and for a possible rollback. */
	memcpy(oldname, dev->name, IFNAMSIZ);

	/* Validates @newname and, on success, writes the new name into dev->name. */
	err = dev_get_valid_name(net, dev, newname);
	if (err < 0) {
		write_seqcount_end(&devnet_rename_seq);
		return err;
	}

	if (oldname[0] && !strchr(oldname, '%'))
		netdev_info(dev, "renamed from %s\n", oldname);

	old_assign_type = dev->name_assign_type;
	dev->name_assign_type = NET_NAME_RENAMED;

	/*
	 * Entered a second time, via the goto below, when a notifier
	 * rejected the new name. On that pass dev->name already holds the
	 * name being restored and oldname holds the rejected one.
	 */
rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		dev->name_assign_type = old_assign_type;
		write_seqcount_end(&devnet_rename_seq);
		return ret;
	}

	write_seqcount_end(&devnet_rename_seq);

	netdev_adjacent_rename_links(dev, oldname);

	/*
	 * Move the device to the name hash bucket of its new name: unhash,
	 * wait for an RCU grace period, then hash it under dev->name.
	 */
	write_lock_bh(&dev_base_lock);
	hlist_del_rcu(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			/*
			 * First notifier failure: record it in err (which
			 * also stops a second rollback attempt), swap the
			 * old and new names and assign types, and run the
			 * rename sequence again in reverse.
			 */
			err = ret;
			write_seqcount_begin(&devnet_rename_seq);
			memcpy(dev->name, oldname, IFNAMSIZ);
			memcpy(oldname, newname, IFNAMSIZ);
			dev->name_assign_type = old_assign_type;
			old_assign_type = NET_NAME_RENAMED;
			goto rollback;
		} else {
			/* The notifiers rejected the rollback as well; give up. */
			pr_err("%s: name change rollback failed: %d\n",
			       dev->name, ret);
		}
	}

	return err;
}
1223
1224 /**
1225  *      dev_set_alias - change ifalias of a device
1226  *      @dev: device
1227  *      @alias: name up to IFALIASZ
1228  *      @len: limit of bytes to copy from info
1229  *
1230  *      Set ifalias for a device,
1231  */
1232 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1233 {
1234         char *new_ifalias;
1235
1236         ASSERT_RTNL();
1237
1238         if (len >= IFALIASZ)
1239                 return -EINVAL;
1240
1241         if (!len) {
1242                 kfree(dev->ifalias);
1243                 dev->ifalias = NULL;
1244                 return 0;
1245         }
1246
1247         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1248         if (!new_ifalias)
1249                 return -ENOMEM;
1250         dev->ifalias = new_ifalias;
1251
1252         strlcpy(dev->ifalias, alias, len+1);
1253         return len;
1254 }
1255
1256
1257 /**
1258  *      netdev_features_change - device changes features
1259  *      @dev: device to cause notification
1260  *
1261  *      Called to indicate a device has changed features.
1262  */
1263 void netdev_features_change(struct net_device *dev)
1264 {
1265         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1266 }
1267 EXPORT_SYMBOL(netdev_features_change);
1268
1269 /**
1270  *      netdev_state_change - device changes state
1271  *      @dev: device to cause notification
1272  *
1273  *      Called to indicate a device has changed state. This function calls
1274  *      the notifier chains for netdev_chain and sends a NEWLINK message
1275  *      to the routing socket.
1276  */
1277 void netdev_state_change(struct net_device *dev)
1278 {
1279         if (dev->flags & IFF_UP) {
1280                 struct netdev_notifier_change_info change_info;
1281
1282                 change_info.flags_changed = 0;
1283                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1284                                               &change_info.info);
1285                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1286         }
1287 }
1288 EXPORT_SYMBOL(netdev_state_change);
1289
1290 /**
1291  *      netdev_notify_peers - notify network peers about existence of @dev
1292  *      @dev: network device
1293  *
1294  * Generate traffic such that interested network peers are aware of
1295  * @dev, such as by generating a gratuitous ARP. This may be used when
1296  * a device wants to inform the rest of the network about some sort of
1297  * reconfiguration such as a failover event or virtual machine
1298  * migration.
1299  */
1300 void netdev_notify_peers(struct net_device *dev)
1301 {
1302         rtnl_lock();
1303         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1304         rtnl_unlock();
1305 }
1306 EXPORT_SYMBOL(netdev_notify_peers);
1307
1308 static int __dev_open(struct net_device *dev)
1309 {
1310         const struct net_device_ops *ops = dev->netdev_ops;
1311         int ret;
1312
1313         ASSERT_RTNL();
1314
1315         if (!netif_device_present(dev))
1316                 return -ENODEV;
1317
1318         /* Block netpoll from trying to do any rx path servicing.
1319          * If we don't do this there is a chance ndo_poll_controller
1320          * or ndo_poll may be running while we open the device
1321          */
1322         netpoll_poll_disable(dev);
1323
1324         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1325         ret = notifier_to_errno(ret);
1326         if (ret)
1327                 return ret;
1328
1329         set_bit(__LINK_STATE_START, &dev->state);
1330
1331         if (ops->ndo_validate_addr)
1332                 ret = ops->ndo_validate_addr(dev);
1333
1334         if (!ret && ops->ndo_open)
1335                 ret = ops->ndo_open(dev);
1336
1337         netpoll_poll_enable(dev);
1338
1339         if (ret)
1340                 clear_bit(__LINK_STATE_START, &dev->state);
1341         else {
1342                 dev->flags |= IFF_UP;
1343                 dev_set_rx_mode(dev);
1344                 dev_activate(dev);
1345                 add_device_randomness(dev->dev_addr, dev->addr_len);
1346         }
1347
1348         return ret;
1349 }
1350
1351 /**
1352  *      dev_open        - prepare an interface for use.
1353  *      @dev:   device to open
1354  *
1355  *      Takes a device from down to up state. The device's private open
1356  *      function is invoked and then the multicast lists are loaded. Finally
1357  *      the device is moved into the up state and a %NETDEV_UP message is
1358  *      sent to the netdev notifier chain.
1359  *
1360  *      Calling this function on an active interface is a nop. On a failure
1361  *      a negative errno code is returned.
1362  */
1363 int dev_open(struct net_device *dev)
1364 {
1365         int ret;
1366
1367         if (dev->flags & IFF_UP)
1368                 return 0;
1369
1370         ret = __dev_open(dev);
1371         if (ret < 0)
1372                 return ret;
1373
1374         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1375         call_netdevice_notifiers(NETDEV_UP, dev);
1376
1377         return ret;
1378 }
1379 EXPORT_SYMBOL(dev_open);
1380
/*
 * __dev_close_many - take every device on @head down
 * @head: list of devices, linked through their close_list member
 *
 * Works in three passes: announce NETDEV_GOING_DOWN and clear
 * __LINK_STATE_START for each device, deactivate all of them with one
 * dev_deactivate_many() call, then call each driver's ndo_stop (if any)
 * and clear IFF_UP. Netpoll is disabled for a device in the first pass
 * and re-enabled in the last. NETDEV_DOWN is not sent from here.
 *
 * The RTNL must be held and the caller must be able to sleep.
 * Always returns 0.
 */
static int __dev_close_many(struct list_head *head)
{
	struct net_device *dev;

	ASSERT_RTNL();
	might_sleep();

	list_for_each_entry(dev, head, close_list) {
		/* Temporarily disable netpoll until the interface is down */
		netpoll_poll_disable(dev);

		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

		clear_bit(__LINK_STATE_START, &dev->state);

		/* Synchronize to scheduled poll. We cannot touch poll list, it
		 * can be even on different cpu. So just clear netif_running().
		 *
		 * dev->stop() will invoke napi_disable() on all of its
		 * napi_struct instances on this device.
		 */
		smp_mb__after_atomic(); /* Commit netif_running(). */
	}

	dev_deactivate_many(head);

	list_for_each_entry(dev, head, close_list) {
		const struct net_device_ops *ops = dev->netdev_ops;

		/*
		 *	Call the device specific close. This cannot fail.
		 *	Only if device is UP
		 *
		 *	We allow it to be called even after a DETACH hot-plug
		 *	event.
		 */
		if (ops->ndo_stop)
			ops->ndo_stop(dev);

		dev->flags &= ~IFF_UP;
		netpoll_poll_enable(dev);
	}

	return 0;
}
1426
1427 static int __dev_close(struct net_device *dev)
1428 {
1429         int retval;
1430         LIST_HEAD(single);
1431
1432         list_add(&dev->close_list, &single);
1433         retval = __dev_close_many(&single);
1434         list_del(&single);
1435
1436         return retval;
1437 }
1438
1439 int dev_close_many(struct list_head *head, bool unlink)
1440 {
1441         struct net_device *dev, *tmp;
1442
1443         /* Remove the devices that don't need to be closed */
1444         list_for_each_entry_safe(dev, tmp, head, close_list)
1445                 if (!(dev->flags & IFF_UP))
1446                         list_del_init(&dev->close_list);
1447
1448         __dev_close_many(head);
1449
1450         list_for_each_entry_safe(dev, tmp, head, close_list) {
1451                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1452                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1453                 if (unlink)
1454                         list_del_init(&dev->close_list);
1455         }
1456
1457         return 0;
1458 }
1459 EXPORT_SYMBOL(dev_close_many);
1460
1461 /**
1462  *      dev_close - shutdown an interface.
1463  *      @dev: device to shutdown
1464  *
1465  *      This function moves an active device into down state. A
1466  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1467  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1468  *      chain.
1469  */
1470 int dev_close(struct net_device *dev)
1471 {
1472         if (dev->flags & IFF_UP) {
1473                 LIST_HEAD(single);
1474
1475                 list_add(&dev->close_list, &single);
1476                 dev_close_many(&single, true);
1477                 list_del(&single);
1478         }
1479         return 0;
1480 }
1481 EXPORT_SYMBOL(dev_close);
1482
1483
1484 /**
1485  *      dev_disable_lro - disable Large Receive Offload on a device
1486  *      @dev: device
1487  *
1488  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1489  *      called under RTNL.  This is needed if received packets may be
1490  *      forwarded to another interface.
1491  */
1492 void dev_disable_lro(struct net_device *dev)
1493 {
1494         struct net_device *lower_dev;
1495         struct list_head *iter;
1496
1497         dev->wanted_features &= ~NETIF_F_LRO;
1498         netdev_update_features(dev);
1499
1500         if (unlikely(dev->features & NETIF_F_LRO))
1501                 netdev_WARN(dev, "failed to disable LRO!\n");
1502
1503         netdev_for_each_lower_dev(dev, lower_dev, iter)
1504                 dev_disable_lro(lower_dev);
1505 }
1506 EXPORT_SYMBOL(dev_disable_lro);
1507
1508 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1509                                    struct net_device *dev)
1510 {
1511         struct netdev_notifier_info info;
1512
1513         netdev_notifier_info_init(&info, dev);
1514         return nb->notifier_call(nb, val, &info);
1515 }
1516
/*
 * Starts out as 1. While it is non-zero, register_netdevice_notifier()
 * only links the notifier into the chain and skips replaying
 * REGISTER/UP events for existing devices.
 */
static int dev_boot_phase = 1;
1518
/**
 *	register_netdevice_notifier - register a network notifier block
 *	@nb: notifier
 *
 *	Register a notifier to be called when network device events occur.
 *	The notifier passed is linked into the kernel structures and must
 *	not be reused until it has been unregistered. A negative errno code
 *	is returned on a failure.
 *
 *	When registered all registration and up events are replayed
 *	to the new notifier to allow device to have a race free
 *	view of the network device list.
 *
 *	If the notifier rejects a replayed NETDEV_REGISTER event, the
 *	devices that were already replayed get matching GOING_DOWN/DOWN/
 *	UNREGISTER events, the notifier is removed from the chain again and
 *	the error is returned. Takes the RTNL itself.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net_device *last;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_register(&netdev_chain, nb);
	if (err)
		goto unlock;
	/* No replay while dev_boot_phase is still set. */
	if (dev_boot_phase)
		goto unlock;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
			err = notifier_to_errno(err);
			if (err)
				goto rollback;

			if (!(dev->flags & IFF_UP))
				continue;

			/* The result of the replayed NETDEV_UP is ignored. */
			call_netdevice_notifier(nb, NETDEV_UP, dev);
		}
	}

unlock:
	rtnl_unlock();
	return err;

rollback:
	/*
	 * @dev is the device whose REGISTER replay failed. Walk the lists
	 * again in the same order and undo the replay for every device
	 * seen before it.
	 */
	last = dev;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev == last)
				goto outroll;

			if (dev->flags & IFF_UP) {
				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
							dev);
				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
			}
			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
		}
	}

outroll:
	raw_notifier_chain_unregister(&netdev_chain, nb);
	goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);
1585
1586 /**
1587  *      unregister_netdevice_notifier - unregister a network notifier block
1588  *      @nb: notifier
1589  *
1590  *      Unregister a notifier previously registered by
1591  *      register_netdevice_notifier(). The notifier is unlinked into the
1592  *      kernel structures and may then be reused. A negative errno code
1593  *      is returned on a failure.
1594  *
1595  *      After unregistering unregister and down device events are synthesized
1596  *      for all devices on the device list to the removed notifier to remove
1597  *      the need for special case cleanup code.
1598  */
1599
1600 int unregister_netdevice_notifier(struct notifier_block *nb)
1601 {
1602         struct net_device *dev;
1603         struct net *net;
1604         int err;
1605
1606         rtnl_lock();
1607         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1608         if (err)
1609                 goto unlock;
1610
1611         for_each_net(net) {
1612                 for_each_netdev(net, dev) {
1613                         if (dev->flags & IFF_UP) {
1614                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1615                                                         dev);
1616                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1617                         }
1618                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1619                 }
1620         }
1621 unlock:
1622         rtnl_unlock();
1623         return err;
1624 }
1625 EXPORT_SYMBOL(unregister_netdevice_notifier);
1626
1627 /**
1628  *      call_netdevice_notifiers_info - call all network notifier blocks
1629  *      @val: value passed unmodified to notifier function
1630  *      @dev: net_device pointer passed unmodified to notifier function
1631  *      @info: notifier information data
1632  *
1633  *      Call all network notifier blocks.  Parameters and return value
1634  *      are as for raw_notifier_call_chain().
1635  */
1636
1637 static int call_netdevice_notifiers_info(unsigned long val,
1638                                          struct net_device *dev,
1639                                          struct netdev_notifier_info *info)
1640 {
1641         ASSERT_RTNL();
1642         netdev_notifier_info_init(info, dev);
1643         return raw_notifier_call_chain(&netdev_chain, val, info);
1644 }
1645
1646 /**
1647  *      call_netdevice_notifiers - call all network notifier blocks
1648  *      @val: value passed unmodified to notifier function
1649  *      @dev: net_device pointer passed unmodified to notifier function
1650  *
1651  *      Call all network notifier blocks.  Parameters and return value
1652  *      are as for raw_notifier_call_chain().
1653  */
1654
1655 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1656 {
1657         struct netdev_notifier_info info;
1658
1659         return call_netdevice_notifiers_info(val, dev, &info);
1660 }
1661 EXPORT_SYMBOL(call_netdevice_notifiers);
1662
#ifdef CONFIG_NET_INGRESS
/*
 * Static key counted up and down by the two helpers below.
 * NOTE(review): presumably gates the ingress hook on the receive path;
 * the users are outside this part of the file - confirm there.
 */
static struct static_key ingress_needed __read_mostly;

/* Take one reference on the ingress_needed static key. */
void net_inc_ingress_queue(void)
{
	static_key_slow_inc(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

/* Drop one reference on the ingress_needed static key. */
void net_dec_ingress_queue(void)
{
	static_key_slow_dec(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif
1678
#ifdef CONFIG_NET_EGRESS
/*
 * Static key counted up and down by the two helpers below.
 * NOTE(review): presumably gates the egress hook on the transmit path;
 * the users are outside this part of the file - confirm there.
 */
static struct static_key egress_needed __read_mostly;

/* Take one reference on the egress_needed static key. */
void net_inc_egress_queue(void)
{
	static_key_slow_inc(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_egress_queue);

/* Drop one reference on the egress_needed static key. */
void net_dec_egress_queue(void)
{
	static_key_slow_dec(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif
1694
/*
 * Static key that tells net_timestamp_set() and net_timestamp_check()
 * whether packets need to be timestamped. Counted up by
 * net_enable_timestamp() and down by net_disable_timestamp().
 */
static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
/* We are not allowed to call static_key_slow_dec() from irq context
 * If net_disable_timestamp() is called from irq context, defer the
 * static_key_slow_dec() calls.
 *
 * This counts the deferred decrements; net_enable_timestamp() settles
 * them.
 */
static atomic_t netstamp_needed_deferred;
#endif
1703
/*
 * net_enable_timestamp - take one reference on packet timestamping
 *
 * With HAVE_JUMP_LABEL this is also the place where decrements deferred
 * by net_disable_timestamp() are settled. The deferred count is read and
 * reset atomically; if it was N > 0, this enable cancels one of the
 * pending decrements, so only N - 1 static_key_slow_dec() calls are made
 * and the key is not incremented.
 */
void net_enable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);

	if (deferred) {
		/* Pre-decrement: the loop body runs deferred - 1 times. */
		while (--deferred)
			static_key_slow_dec(&netstamp_needed);
		return;
	}
#endif
	static_key_slow_inc(&netstamp_needed);
}
EXPORT_SYMBOL(net_enable_timestamp);
1718
/*
 * net_disable_timestamp - drop one reference on packet timestamping
 *
 * With HAVE_JUMP_LABEL, a call made in interrupt context does not touch
 * the static key; it only bumps netstamp_needed_deferred, and the
 * decrement is settled by a later net_enable_timestamp().
 */
void net_disable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	if (in_interrupt()) {
		atomic_inc(&netstamp_needed_deferred);
		return;
	}
#endif
	static_key_slow_dec(&netstamp_needed);
}
EXPORT_SYMBOL(net_disable_timestamp);
1730
/*
 * Reset the timestamp of @skb, then stamp it through __net_timestamp()
 * if the netstamp_needed static key is enabled.
 */
static inline void net_timestamp_set(struct sk_buff *skb)
{
	skb->tstamp.tv64 = 0;
	if (static_key_false(&netstamp_needed))
		__net_timestamp(skb);
}
1737
/*
 * Stamp SKB through __net_timestamp() if the netstamp_needed static key
 * is enabled, COND is true and SKB carries no timestamp yet.
 *
 * NOTE(review): this expands to a bare if statement rather than a
 * do { } while (0) block, so do not use it as the unbraced body of an
 * if that has an else. The last line of the definition ends in a line
 * continuation, so the line following the macro has to stay empty.
 */
#define net_timestamp_check(COND, SKB)			\
	if (static_key_false(&netstamp_needed)) {		\
		if ((COND) && !(SKB)->tstamp.tv64)	\
			__net_timestamp(SKB);		\
	}						\
1743
1744 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1745 {
1746         unsigned int len;
1747
1748         if (!(dev->flags & IFF_UP))
1749                 return false;
1750
1751         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1752         if (skb->len <= len)
1753                 return true;
1754
1755         /* if TSO is enabled, we don't care about the length as the packet
1756          * could be forwarded without being segmented before
1757          */
1758         if (skb_is_gso(skb))
1759                 return true;
1760
1761         return false;
1762 }
1763 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1764
1765 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1766 {
1767         if (skb_orphan_frags(skb, GFP_ATOMIC) ||
1768             unlikely(!is_skb_forwardable(dev, skb))) {
1769                 atomic_long_inc(&dev->rx_dropped);
1770                 kfree_skb(skb);
1771                 return NET_RX_DROP;
1772         }
1773
1774         skb_scrub_packet(skb, true);
1775         skb->priority = 0;
1776         skb->protocol = eth_type_trans(skb, dev);
1777         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1778
1779         return 0;
1780 }
1781 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1782
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped, but freed)
 *
 * dev_forward_skb can be used to inject an skb from the start_xmit
 * function of one device into the receive queue of another device.
 *
 * Because the receiving device may live in another namespace, all
 * information in the skb that could impact namespace isolation has
 * to be cleared; this is done by __dev_forward_skb() before the skb
 * is handed to netif_rx_internal().
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	int rc = __dev_forward_skb(dev, skb);

	if (rc)
		return rc;

	return netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
1806
1807 static inline int deliver_skb(struct sk_buff *skb,
1808                               struct packet_type *pt_prev,
1809                               struct net_device *orig_dev)
1810 {
1811         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1812                 return -ENOMEM;
1813         atomic_inc(&skb->users);
1814         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1815 }
1816
1817 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1818                                           struct packet_type **pt,
1819                                           struct net_device *orig_dev,
1820                                           __be16 type,
1821                                           struct list_head *ptype_list)
1822 {
1823         struct packet_type *ptype, *pt_prev = *pt;
1824
1825         list_for_each_entry_rcu(ptype, ptype_list, list) {
1826                 if (ptype->type != type)
1827                         continue;
1828                 if (pt_prev)
1829                         deliver_skb(skb, pt_prev, orig_dev);
1830                 pt_prev = ptype;
1831         }
1832         *pt = pt_prev;
1833 }
1834
1835 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1836 {
1837         if (!ptype->af_packet_priv || !skb->sk)
1838                 return false;
1839
1840         if (ptype->id_match)
1841                 return ptype->id_match(ptype, skb->sk);
1842         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1843                 return true;
1844
1845         return false;
1846 }
1847
1848 /*
1849  *      Support routine. Sends outgoing frames to any network
1850  *      taps currently in use.
1851  */
1852
1853 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1854 {
1855         struct packet_type *ptype;
1856         struct sk_buff *skb2 = NULL;
1857         struct packet_type *pt_prev = NULL;
1858         struct list_head *ptype_list = &ptype_all;
1859
1860         rcu_read_lock();
1861 again:
1862         list_for_each_entry_rcu(ptype, ptype_list, list) {
1863                 /* Never send packets back to the socket
1864                  * they originated from - MvS (miquels@drinkel.ow.org)
1865                  */
1866                 if (skb_loop_sk(ptype, skb))
1867                         continue;
1868
1869                 if (pt_prev) {
1870                         deliver_skb(skb2, pt_prev, skb->dev);
1871                         pt_prev = ptype;
1872                         continue;
1873                 }
1874
1875                 /* need to clone skb, done only once */
1876                 skb2 = skb_clone(skb, GFP_ATOMIC);
1877                 if (!skb2)
1878                         goto out_unlock;
1879
1880                 net_timestamp_set(skb2);
1881
1882                 /* skb->nh should be correctly
1883                  * set by sender, so that the second statement is
1884                  * just protection against buggy protocols.
1885                  */
1886                 skb_reset_mac_header(skb2);
1887
1888                 if (skb_network_header(skb2) < skb2->data ||
1889                     skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1890                         net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1891                                              ntohs(skb2->protocol),
1892                                              dev->name);
1893                         skb_reset_network_header(skb2);
1894                 }
1895
1896                 skb2->transport_header = skb2->network_header;
1897                 skb2->pkt_type = PACKET_OUTGOING;
1898                 pt_prev = ptype;
1899         }
1900
1901         if (ptype_list == &ptype_all) {
1902                 ptype_list = &dev->ptype_all;
1903                 goto again;
1904         }
1905 out_unlock:
1906         if (pt_prev)
1907                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1908         rcu_read_unlock();
1909 }
1910
1911 /**
1912  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1913  * @dev: Network device
1914  * @txq: number of queues available
1915  *
1916  * If real_num_tx_queues is changed the tc mappings may no longer be
1917  * valid. To resolve this verify the tc mapping remains valid and if
1918  * not NULL the mapping. With no priorities mapping to this
1919  * offset/count pair it will no longer be used. In the worst case TC0
1920  * is invalid nothing can be done so disable priority mappings. If is
1921  * expected that drivers will fix this mapping if they can before
1922  * calling netif_set_real_num_tx_queues.
1923  */
1924 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1925 {
1926         int i;
1927         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1928
1929         /* If TC0 is invalidated disable TC mapping */
1930         if (tc->offset + tc->count > txq) {
1931                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1932                 dev->num_tc = 0;
1933                 return;
1934         }
1935
1936         /* Invalidated prio to tc mappings set to TC0 */
1937         for (i = 1; i < TC_BITMASK + 1; i++) {
1938                 int q = netdev_get_prio_tc_map(dev, i);
1939
1940                 tc = &dev->tc_to_txq[q];
1941                 if (tc->offset + tc->count > txq) {
1942                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1943                                 i, q);
1944                         netdev_set_prio_tc_map(dev, i, 0);
1945                 }
1946         }
1947 }
1948
1949 #ifdef CONFIG_XPS
/* Mutex taken by the XPS map update paths in this file. */
static DEFINE_MUTEX(xps_map_mutex);

/* Dereference an RCU-protected XPS pointer from the update side; lockdep
 * checks that xps_map_mutex is held.
 */
#define xmap_dereference(P)		\
	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1953
1954 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1955                                         int cpu, u16 index)
1956 {
1957         struct xps_map *map = NULL;
1958         int pos;
1959
1960         if (dev_maps)
1961                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1962
1963         for (pos = 0; map && pos < map->len; pos++) {
1964                 if (map->queues[pos] == index) {
1965                         if (map->len > 1) {
1966                                 map->queues[pos] = map->queues[--map->len];
1967                         } else {
1968                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1969                                 kfree_rcu(map, rcu);
1970                                 map = NULL;
1971                         }
1972                         break;
1973                 }
1974         }
1975
1976         return map;
1977 }
1978
1979 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1980 {
1981         struct xps_dev_maps *dev_maps;
1982         int cpu, i;
1983         bool active = false;
1984
1985         mutex_lock(&xps_map_mutex);
1986         dev_maps = xmap_dereference(dev->xps_maps);
1987
1988         if (!dev_maps)
1989                 goto out_no_maps;
1990
1991         for_each_possible_cpu(cpu) {
1992                 for (i = index; i < dev->num_tx_queues; i++) {
1993                         if (!remove_xps_queue(dev_maps, cpu, i))
1994                                 break;
1995                 }
1996                 if (i == dev->num_tx_queues)
1997                         active = true;
1998         }
1999
2000         if (!active) {
2001                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2002                 kfree_rcu(dev_maps, rcu);
2003         }
2004
2005         for (i = index; i < dev->num_tx_queues; i++)
2006                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2007                                              NUMA_NO_NODE);
2008
2009 out_no_maps:
2010         mutex_unlock(&xps_map_mutex);
2011 }
2012
2013 static struct xps_map *expand_xps_map(struct xps_map *map,
2014                                       int cpu, u16 index)
2015 {
2016         struct xps_map *new_map;
2017         int alloc_len = XPS_MIN_MAP_ALLOC;
2018         int i, pos;
2019
2020         for (pos = 0; map && pos < map->len; pos++) {
2021                 if (map->queues[pos] != index)
2022                         continue;
2023                 return map;
2024         }
2025
2026         /* Need to add queue to this CPU's existing map */
2027         if (map) {
2028                 if (pos < map->alloc_len)
2029                         return map;
2030
2031                 alloc_len = map->alloc_len * 2;
2032         }
2033
2034         /* Need to allocate new map to store queue on this CPU's map */
2035         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2036                                cpu_to_node(cpu));
2037         if (!new_map)
2038                 return NULL;
2039
2040         for (i = 0; i < pos; i++)
2041                 new_map->queues[i] = map->queues[i];
2042         new_map->alloc_len = alloc_len;
2043         new_map->len = pos;
2044
2045         return new_map;
2046 }
2047
2048 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2049                         u16 index)
2050 {
2051         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2052         struct xps_map *map, *new_map;
2053         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
2054         int cpu, numa_node_id = -2;
2055         bool active = false;
2056
2057         mutex_lock(&xps_map_mutex);
2058
2059         dev_maps = xmap_dereference(dev->xps_maps);
2060
2061         /* allocate memory for queue storage */
2062         for_each_online_cpu(cpu) {
2063                 if (!cpumask_test_cpu(cpu, mask))
2064                         continue;
2065
2066                 if (!new_dev_maps)
2067                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2068                 if (!new_dev_maps) {
2069                         mutex_unlock(&xps_map_mutex);
2070                         return -ENOMEM;
2071                 }
2072
2073                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2074                                  NULL;
2075
2076                 map = expand_xps_map(map, cpu, index);
2077                 if (!map)
2078                         goto error;
2079
2080                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2081         }
2082
2083         if (!new_dev_maps)
2084                 goto out_no_new_maps;
2085
2086         for_each_possible_cpu(cpu) {
2087                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2088                         /* add queue to CPU maps */
2089                         int pos = 0;
2090
2091                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2092                         while ((pos < map->len) && (map->queues[pos] != index))
2093                                 pos++;
2094
2095                         if (pos == map->len)
2096                                 map->queues[map->len++] = index;
2097 #ifdef CONFIG_NUMA
2098                         if (numa_node_id == -2)
2099                                 numa_node_id = cpu_to_node(cpu);
2100                         else if (numa_node_id != cpu_to_node(cpu))
2101                                 numa_node_id = -1;
2102 #endif
2103                 } else if (dev_maps) {
2104                         /* fill in the new device map from the old device map */
2105                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2106                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2107                 }
2108
2109         }
2110
2111         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2112
2113         /* Cleanup old maps */
2114         if (dev_maps) {
2115                 for_each_possible_cpu(cpu) {
2116                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2117                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2118                         if (map && map != new_map)
2119                                 kfree_rcu(map, rcu);
2120                 }
2121
2122                 kfree_rcu(dev_maps, rcu);
2123         }
2124
2125         dev_maps = new_dev_maps;
2126         active = true;
2127
2128 out_no_new_maps:
2129         /* update Tx queue numa node */
2130         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2131                                      (numa_node_id >= 0) ? numa_node_id :
2132                                      NUMA_NO_NODE);
2133
2134         if (!dev_maps)
2135                 goto out_no_maps;
2136
2137         /* removes queue from unused CPUs */
2138         for_each_possible_cpu(cpu) {
2139                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2140                         continue;
2141
2142                 if (remove_xps_queue(dev_maps, cpu, index))
2143                         active = true;
2144         }
2145
2146         /* free map if not active */
2147         if (!active) {
2148                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2149                 kfree_rcu(dev_maps, rcu);
2150         }
2151
2152 out_no_maps:
2153         mutex_unlock(&xps_map_mutex);
2154
2155         return 0;
2156 error:
2157         /* remove any maps that we added */
2158         for_each_possible_cpu(cpu) {
2159                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2160                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2161                                  NULL;
2162                 if (new_map && new_map != map)
2163                         kfree(new_map);
2164         }
2165
2166         mutex_unlock(&xps_map_mutex);
2167
2168         kfree(new_dev_maps);
2169         return -ENOMEM;
2170 }
2171 EXPORT_SYMBOL(netif_set_xps_queue);
2172
2173 #endif
2174 /*
2175  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2176  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2177  */
2178 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2179 {
2180         int rc;
2181
2182         if (txq < 1 || txq > dev->num_tx_queues)
2183                 return -EINVAL;
2184
2185         if (dev->reg_state == NETREG_REGISTERED ||
2186             dev->reg_state == NETREG_UNREGISTERING) {
2187                 ASSERT_RTNL();
2188
2189                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2190                                                   txq);
2191                 if (rc)
2192                         return rc;
2193
2194                 if (dev->num_tc)
2195                         netif_setup_tc(dev, txq);
2196
2197                 if (txq < dev->real_num_tx_queues) {
2198                         qdisc_reset_all_tx_gt(dev, txq);
2199 #ifdef CONFIG_XPS
2200                         netif_reset_xps_queues_gt(dev, txq);
2201 #endif
2202                 }
2203         }
2204
2205         dev->real_num_tx_queues = txq;
2206         return 0;
2207 }
2208 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2209
#ifdef CONFIG_SYSFS
/**
 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 *	@dev: Network device
 *	@rxq: Actual number of RX queues
 *
 *	Must be called either with the rtnl_lock held or before the net
 *	device is registered.  Returns 0 on success, or a negative error
 *	code: -EINVAL when @rxq is zero or exceeds dev->num_rx_queues,
 *	otherwise the error from the kobject update of a registered device.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
	int rc = 0;

	if (!rxq || rxq > dev->num_rx_queues)
		return -EINVAL;

	if (dev->reg_state == NETREG_REGISTERED) {
		ASSERT_RTNL();

		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
						  rxq);
	}

	/* the count is only committed when the kobject update succeeded */
	if (!rc)
		dev->real_num_rx_queues = rxq;

	return rc;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif
2242
2243 /**
2244  * netif_get_num_default_rss_queues - default number of RSS queues
2245  *
2246  * This routine should set an upper limit on the number of RSS queues
2247  * used by default by multiqueue devices.
2248  */
2249 int netif_get_num_default_rss_queues(void)
2250 {
2251         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2252 }
2253 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2254
2255 static inline void __netif_reschedule(struct Qdisc *q)
2256 {
2257         struct softnet_data *sd;
2258         unsigned long flags;
2259
2260         local_irq_save(flags);
2261         sd = this_cpu_ptr(&softnet_data);
2262         q->next_sched = NULL;
2263         *sd->output_queue_tailp = q;
2264         sd->output_queue_tailp = &q->next_sched;
2265         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2266         local_irq_restore(flags);
2267 }
2268
2269 void __netif_schedule(struct Qdisc *q)
2270 {
2271         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2272                 __netif_reschedule(q);
2273 }
2274 EXPORT_SYMBOL(__netif_schedule);
2275
2276 struct dev_kfree_skb_cb {
2277         enum skb_free_reason reason;
2278 };
2279
2280 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2281 {
2282         return (struct dev_kfree_skb_cb *)skb->cb;
2283 }
2284
2285 void netif_schedule_queue(struct netdev_queue *txq)
2286 {
2287         rcu_read_lock();
2288         if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2289                 struct Qdisc *q = rcu_dereference(txq->qdisc);
2290
2291                 __netif_schedule(q);
2292         }
2293         rcu_read_unlock();
2294 }
2295 EXPORT_SYMBOL(netif_schedule_queue);
2296
2297 /**
2298  *      netif_wake_subqueue - allow sending packets on subqueue
2299  *      @dev: network device
2300  *      @queue_index: sub queue index
2301  *
2302  * Resume individual transmit queue of a device with multiple transmit queues.
2303  */
2304 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2305 {
2306         struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2307
2308         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2309                 struct Qdisc *q;
2310
2311                 rcu_read_lock();
2312                 q = rcu_dereference(txq->qdisc);
2313                 __netif_schedule(q);
2314                 rcu_read_unlock();
2315         }
2316 }
2317 EXPORT_SYMBOL(netif_wake_subqueue);
2318
2319 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2320 {
2321         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2322                 struct Qdisc *q;
2323
2324                 rcu_read_lock();
2325                 q = rcu_dereference(dev_queue->qdisc);
2326                 __netif_schedule(q);
2327                 rcu_read_unlock();
2328         }
2329 }
2330 EXPORT_SYMBOL(netif_tx_wake_queue);
2331
2332 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2333 {
2334         unsigned long flags;
2335
2336         if (likely(atomic_read(&skb->users) == 1)) {
2337                 smp_rmb();
2338                 atomic_set(&skb->users, 0);
2339         } else if (likely(!atomic_dec_and_test(&skb->users))) {
2340                 return;
2341         }
2342         get_kfree_skb_cb(skb)->reason = reason;
2343         local_irq_save(flags);
2344         skb->next = __this_cpu_read(softnet_data.completion_queue);
2345         __this_cpu_write(softnet_data.completion_queue, skb);
2346         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2347         local_irq_restore(flags);
2348 }
2349 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2350
2351 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2352 {
2353         if (in_irq() || irqs_disabled())
2354                 __dev_kfree_skb_irq(skb, reason);
2355         else
2356                 dev_kfree_skb(skb);
2357 }
2358 EXPORT_SYMBOL(__dev_kfree_skb_any);
2359
2360
2361 /**
2362  * netif_device_detach - mark device as removed
2363  * @dev: network device
2364  *
2365  * Mark device as removed from system and therefore no longer available.
2366  */
2367 void netif_device_detach(struct net_device *dev)
2368 {
2369         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2370             netif_running(dev)) {
2371                 netif_tx_stop_all_queues(dev);
2372         }
2373 }
2374 EXPORT_SYMBOL(netif_device_detach);
2375
2376 /**
2377  * netif_device_attach - mark device as attached
2378  * @dev: network device
2379  *
2380  * Mark device as attached from system and restart if needed.
2381  */
2382 void netif_device_attach(struct net_device *dev)
2383 {
2384         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2385             netif_running(dev)) {
2386                 netif_tx_wake_all_queues(dev);
2387                 __netdev_watchdog_up(dev);
2388         }
2389 }
2390 EXPORT_SYMBOL(netif_device_attach);
2391
2392 /*
2393  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2394  * to be used as a distribution range.
2395  */
2396 u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2397                   unsigned int num_tx_queues)
2398 {
2399         u32 hash;
2400         u16 qoffset = 0;
2401         u16 qcount = num_tx_queues;
2402
2403         if (skb_rx_queue_recorded(skb)) {
2404                 hash = skb_get_rx_queue(skb);
2405                 while (unlikely(hash >= num_tx_queues))
2406                         hash -= num_tx_queues;
2407                 return hash;
2408         }
2409
2410         if (dev->num_tc) {
2411                 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2412                 qoffset = dev->tc_to_txq[tc].offset;
2413                 qcount = dev->tc_to_txq[tc].count;
2414         }
2415
2416         return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2417 }
2418 EXPORT_SYMBOL(__skb_tx_hash);
2419
2420 static void skb_warn_bad_offload(const struct sk_buff *skb)
2421 {
2422         static const netdev_features_t null_features = 0;
2423         struct net_device *dev = skb->dev;
2424         const char *name = "";
2425
2426         if (!net_ratelimit())
2427                 return;
2428
2429         if (dev) {
2430                 if (dev->dev.parent)
2431                         name = dev_driver_string(dev->dev.parent);
2432                 else
2433                         name = netdev_name(dev);
2434         }
2435         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2436              "gso_type=%d ip_summed=%d\n",
2437              name, dev ? &dev->features : &null_features,
2438              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2439              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2440              skb_shinfo(skb)->gso_type, skb->ip_summed);
2441 }
2442
2443 /*
2444  * Invalidate hardware checksum when packet is to be mangled, and
2445  * complete checksum manually on outgoing path.
2446  */
2447 int skb_checksum_help(struct sk_buff *skb)
2448 {
2449         __wsum csum;
2450         int ret = 0, offset;
2451
2452         if (skb->ip_summed == CHECKSUM_COMPLETE)
2453                 goto out_set_summed;
2454
2455         if (unlikely(skb_shinfo(skb)->gso_size)) {
2456                 skb_warn_bad_offload(skb);
2457                 return -EINVAL;
2458         }
2459
2460         /* Before computing a checksum, we should make sure no frag could
2461          * be modified by an external entity : checksum could be wrong.
2462          */
2463         if (skb_has_shared_frag(skb)) {
2464                 ret = __skb_linearize(skb);
2465                 if (ret)
2466                         goto out;
2467         }
2468
2469         offset = skb_checksum_start_offset(skb);
2470         BUG_ON(offset >= skb_headlen(skb));
2471         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2472
2473         offset += skb->csum_offset;
2474         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2475
2476         if (skb_cloned(skb) &&
2477             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2478                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2479                 if (ret)
2480                         goto out;
2481         }
2482
2483         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2484 out_set_summed:
2485         skb->ip_summed = CHECKSUM_NONE;
2486 out:
2487         return ret;
2488 }
2489 EXPORT_SYMBOL(skb_checksum_help);
2490
2491 /* skb_csum_offload_check - Driver helper function to determine if a device
2492  * with limited checksum offload capabilities is able to offload the checksum
2493  * for a given packet.
2494  *
2495  * Arguments:
2496  *   skb - sk_buff for the packet in question
2497  *   spec - contains the description of what device can offload
2498  *   csum_encapped - returns true if the checksum being offloaded is
2499  *            encpasulated. That is it is checksum for the transport header
2500  *            in the inner headers.
2501  *   checksum_help - when set indicates that helper function should
2502  *            call skb_checksum_help if offload checks fail
2503  *
2504  * Returns:
2505  *   true: Packet has passed the checksum checks and should be offloadable to
2506  *         the device (a driver may still need to check for additional
2507  *         restrictions of its device)
2508  *   false: Checksum is not offloadable. If checksum_help was set then
2509  *         skb_checksum_help was called to resolve checksum for non-GSO
2510  *         packets and when IP protocol is not SCTP
2511  */
2512 bool __skb_csum_offload_chk(struct sk_buff *skb,
2513                             const struct skb_csum_offl_spec *spec,
2514                             bool *csum_encapped,
2515                             bool csum_help)
2516 {
2517         struct iphdr *iph;
2518         struct ipv6hdr *ipv6;
2519         void *nhdr;
2520         int protocol;
2521         u8 ip_proto;
2522
2523         if (skb->protocol == htons(ETH_P_8021Q) ||
2524             skb->protocol == htons(ETH_P_8021AD)) {
2525                 if (!spec->vlan_okay)
2526                         goto need_help;
2527         }
2528
2529         /* We check whether the checksum refers to a transport layer checksum in
2530          * the outermost header or an encapsulated transport layer checksum that
2531          * corresponds to the inner headers of the skb. If the checksum is for
2532          * something else in the packet we need help.
2533          */
2534         if (skb_checksum_start_offset(skb) == skb_transport_offset(skb)) {
2535                 /* Non-encapsulated checksum */
2536                 protocol = eproto_to_ipproto(vlan_get_protocol(skb));
2537                 nhdr = skb_network_header(skb);
2538                 *csum_encapped = false;
2539                 if (spec->no_not_encapped)
2540                         goto need_help;
2541         } else if (skb->encapsulation && spec->encap_okay &&
2542                    skb_checksum_start_offset(skb) ==
2543                    skb_inner_transport_offset(skb)) {
2544                 /* Encapsulated checksum */
2545                 *csum_encapped = true;
2546                 switch (skb->inner_protocol_type) {
2547                 case ENCAP_TYPE_ETHER:
2548                         protocol = eproto_to_ipproto(skb->inner_protocol);
2549                         break;
2550                 case ENCAP_TYPE_IPPROTO:
2551                         protocol = skb->inner_protocol;
2552                         break;
2553                 }
2554                 nhdr = skb_inner_network_header(skb);
2555         } else {
2556                 goto need_help;
2557         }
2558
2559         switch (protocol) {
2560         case IPPROTO_IP:
2561                 if (!spec->ipv4_okay)
2562                         goto need_help;
2563                 iph = nhdr;
2564                 ip_proto = iph->protocol;
2565                 if (iph->ihl != 5 && !spec->ip_options_okay)
2566                         goto need_help;
2567                 break;
2568         case IPPROTO_IPV6:
2569                 if (!spec->ipv6_okay)
2570                         goto need_help;
2571                 if (spec->no_encapped_ipv6 && *csum_encapped)
2572                         goto need_help;
2573                 ipv6 = nhdr;
2574                 nhdr += sizeof(*ipv6);
2575                 ip_proto = ipv6->nexthdr;
2576                 break;
2577         default:
2578                 goto need_help;
2579         }
2580
2581 ip_proto_again:
2582         switch (ip_proto) {
2583         case IPPROTO_TCP:
2584                 if (!spec->tcp_okay ||
2585                     skb->csum_offset != offsetof(struct tcphdr, check))
2586                         goto need_help;
2587                 break;
2588         case IPPROTO_UDP:
2589                 if (!spec->udp_okay ||
2590                     skb->csum_offset != offsetof(struct udphdr, check))
2591                         goto need_help;
2592                 break;
2593         case IPPROTO_SCTP:
2594                 if (!spec->sctp_okay ||
2595                     skb->csum_offset != offsetof(struct sctphdr, checksum))
2596                         goto cant_help;
2597                 break;
2598         case NEXTHDR_HOP:
2599         case NEXTHDR_ROUTING:
2600         case NEXTHDR_DEST: {
2601                 u8 *opthdr = nhdr;
2602
2603                 if (protocol != IPPROTO_IPV6 || !spec->ext_hdrs_okay)
2604                         goto need_help;
2605
2606                 ip_proto = opthdr[0];
2607                 nhdr += (opthdr[1] + 1) << 3;
2608
2609                 goto ip_proto_again;
2610         }
2611         default:
2612                 goto need_help;
2613         }
2614
2615         /* Passed the tests for offloading checksum */
2616         return true;
2617
2618 need_help:
2619         if (csum_help && !skb_shinfo(skb)->gso_size)
2620                 skb_checksum_help(skb);
2621 cant_help:
2622         return false;
2623 }
2624 EXPORT_SYMBOL(__skb_csum_offload_chk);
2625
2626 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2627 {
2628         __be16 type = skb->protocol;
2629
2630         /* Tunnel gso handlers can set protocol to ethernet. */
2631         if (type == htons(ETH_P_TEB)) {
2632                 struct ethhdr *eth;
2633
2634                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2635                         return 0;
2636
2637                 eth = (struct ethhdr *)skb_mac_header(skb);
2638                 type = eth->h_proto;
2639         }
2640
2641         return __vlan_get_protocol(skb, type, depth);
2642 }
2643
2644 /**
2645  *      skb_mac_gso_segment - mac layer segmentation handler.
2646  *      @skb: buffer to segment
2647  *      @features: features for the output path (see dev->features)
2648  */
2649 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2650                                     netdev_features_t features)
2651 {
2652         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2653         struct packet_offload *ptype;
2654         int vlan_depth = skb->mac_len;
2655         __be16 type = skb_network_protocol(skb, &vlan_depth);
2656
2657         if (unlikely(!type))
2658                 return ERR_PTR(-EINVAL);
2659
2660         __skb_pull(skb, vlan_depth);
2661
2662         rcu_read_lock();
2663         list_for_each_entry_rcu(ptype, &offload_base, list) {
2664                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2665                         segs = ptype->callbacks.gso_segment(skb, features);
2666                         break;
2667                 }
2668         }
2669         rcu_read_unlock();
2670
2671         __skb_push(skb, skb->data - skb_mac_header(skb));
2672
2673         return segs;
2674 }
2675 EXPORT_SYMBOL(skb_mac_gso_segment);
2676
2677
2678 /* openvswitch calls this on rx path, so we need a different check.
2679  */
2680 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2681 {
2682         if (tx_path)
2683                 return skb->ip_summed != CHECKSUM_PARTIAL;
2684         else
2685                 return skb->ip_summed == CHECKSUM_NONE;
2686 }
2687
2688 /**
2689  *      __skb_gso_segment - Perform segmentation on skb.
2690  *      @skb: buffer to segment
2691  *      @features: features for the output path (see dev->features)
2692  *      @tx_path: whether it is called in TX path
2693  *
2694  *      This function segments the given skb and returns a list of segments.
2695  *
2696  *      It may return NULL if the skb requires no segmentation.  This is
2697  *      only possible when GSO is used for verifying header integrity.
2698  *
2699  *      Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2700  */
2701 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2702                                   netdev_features_t features, bool tx_path)
2703 {
2704         if (unlikely(skb_needs_check(skb, tx_path))) {
2705                 int err;
2706
2707                 skb_warn_bad_offload(skb);
2708
2709                 err = skb_cow_head(skb, 0);
2710                 if (err < 0)
2711                         return ERR_PTR(err);
2712         }
2713
2714         /* Only report GSO partial support if it will enable us to
2715          * support segmentation on this frame without needing additional
2716          * work.
2717          */
2718         if (features & NETIF_F_GSO_PARTIAL) {
2719                 netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
2720                 struct net_device *dev = skb->dev;
2721
2722                 partial_features |= dev->features & dev->gso_partial_features;
2723                 if (!skb_gso_ok(skb, features | partial_features))
2724                         features &= ~NETIF_F_GSO_PARTIAL;
2725         }
2726
2727         BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2728                      sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2729
2730         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2731         SKB_GSO_CB(skb)->encap_level = 0;
2732
2733         skb_reset_mac_header(skb);
2734         skb_reset_mac_len(skb);
2735
2736         return skb_mac_gso_segment(skb, features);
2737 }
2738 EXPORT_SYMBOL(__skb_gso_segment);
2739
/* Take action when hardware reception checksum errors are detected:
 * log a rate-limited error naming the device (or "<unknown>" when @dev
 * is NULL) and dump the stack.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (!net_ratelimit())
		return;

	pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
2751
/* Report whether @skb carries page fragments that @dev cannot DMA from.
 *
 * Only compiled in with CONFIG_HIGHMEM; otherwise always returns 0.
 * Returns 1 when
 *  - the device lacks NETIF_F_HIGHDMA and a fragment lives in high memory, or
 *  - PCI_DMA_BUS_IS_PHYS and a fragment's last byte lies beyond the parent
 *    device's dma_mask (or the parent has no dma_mask at all).
 *
 * Actually, we should eliminate this check as soon as we know that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 */
static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int idx;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (idx = 0; idx < shinfo->nr_frags; idx++) {
			if (PageHighMem(skb_frag_page(&shinfo->frags[idx])))
				return 1;
		}
	}

	if (PCI_DMA_BUS_IS_PHYS) {
		struct device *parent = dev->dev.parent;

		if (!parent)
			return 0;

		for (idx = 0; idx < shinfo->nr_frags; idx++) {
			dma_addr_t phys;

			phys = page_to_phys(skb_frag_page(&shinfo->frags[idx]));
			if (!parent->dma_mask ||
			    phys + PAGE_SIZE - 1 > *parent->dma_mask)
				return 1;
		}
	}
#endif
	return 0;
}
2784
2785 /* If MPLS offload request, verify we are testing hardware MPLS features
2786  * instead of standard features for the netdev.
2787  */
2788 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2789 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2790                                            netdev_features_t features,
2791                                            __be16 type)
2792 {
2793         if (eth_p_mpls(type))
2794                 features &= skb->dev->mpls_features;
2795
2796         return features;
2797 }
2798 #else
2799 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2800                                            netdev_features_t features,
2801                                            __be16 type)
2802 {
2803         return features;
2804 }
2805 #endif
2806
2807 static netdev_features_t harmonize_features(struct sk_buff *skb,
2808         netdev_features_t features)
2809 {
2810         int tmp;
2811         __be16 type;
2812
2813         type = skb_network_protocol(skb, &tmp);
2814         features = net_mpls_features(skb, features, type);
2815
2816         if (skb->ip_summed != CHECKSUM_NONE &&
2817             !can_checksum_protocol(features, type)) {
2818                 features &= ~NETIF_F_CSUM_MASK;
2819         } else if (illegal_highdma(skb->dev, skb)) {
2820                 features &= ~NETIF_F_SG;
2821         }
2822
2823         return features;
2824 }
2825
2826 netdev_features_t passthru_features_check(struct sk_buff *skb,
2827                                           struct net_device *dev,
2828                                           netdev_features_t features)
2829 {
2830         return features;
2831 }
2832 EXPORT_SYMBOL(passthru_features_check);
2833
2834 static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2835                                              struct net_device *dev,
2836                                              netdev_features_t features)
2837 {
2838         return vlan_features_check(skb, features);
2839 }
2840
2841 static netdev_features_t gso_features_check(const struct sk_buff *skb,
2842                                             struct net_device *dev,
2843                                             netdev_features_t features)
2844 {
2845         u16 gso_segs = skb_shinfo(skb)->gso_segs;
2846
2847         if (gso_segs > dev->gso_max_segs)
2848                 return features & ~NETIF_F_GSO_MASK;
2849
2850         /* Support for GSO partial features requires software
2851          * intervention before we can actually process the packets
2852          * so we need to strip support for any partial features now
2853          * and we can pull them back in after we have partially
2854          * segmented the frame.
2855          */
2856         if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
2857                 features &= ~dev->gso_partial_features;
2858
2859         /* Make sure to clear the IPv4 ID mangling feature if the
2860          * IPv4 header has the potential to be fragmented.
2861          */
2862         if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
2863                 struct iphdr *iph = skb->encapsulation ?
2864                                     inner_ip_hdr(skb) : ip_hdr(skb);
2865
2866                 if (!(iph->frag_off & htons(IP_DF)))
2867                         features &= ~NETIF_F_TSO_MANGLEID;
2868         }
2869
2870         return features;
2871 }
2872
2873 netdev_features_t netif_skb_features(struct sk_buff *skb)
2874 {
2875         struct net_device *dev = skb->dev;
2876         netdev_features_t features = dev->features;
2877
2878         if (skb_is_gso(skb))
2879                 features = gso_features_check(skb, dev, features);
2880
2881         /* If encapsulation offload request, verify we are testing
2882          * hardware encapsulation features instead of standard
2883          * features for the netdev
2884          */
2885         if (skb->encapsulation)
2886                 features &= dev->hw_enc_features;
2887
2888         if (skb_vlan_tagged(skb))
2889                 features = netdev_intersect_features(features,
2890                                                      dev->vlan_features |
2891                                                      NETIF_F_HW_VLAN_CTAG_TX |
2892                                                      NETIF_F_HW_VLAN_STAG_TX);
2893
2894         if (dev->netdev_ops->ndo_features_check)
2895                 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2896                                                                 features);
2897         else
2898                 features &= dflt_features_check(skb, dev, features);
2899
2900         return harmonize_features(skb, features);
2901 }
2902 EXPORT_SYMBOL(netif_skb_features);
2903
2904 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2905                     struct netdev_queue *txq, bool more)
2906 {
2907         unsigned int len;
2908         int rc;
2909
2910         if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2911                 dev_queue_xmit_nit(skb, dev);
2912
2913         len = skb->len;
2914         trace_net_dev_start_xmit(skb, dev);
2915         rc = netdev_start_xmit(skb, dev, txq, more);
2916         trace_net_dev_xmit(skb, rc, dev, len);
2917
2918         return rc;
2919 }
2920
2921 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2922                                     struct netdev_queue *txq, int *ret)
2923 {
2924         struct sk_buff *skb = first;
2925         int rc = NETDEV_TX_OK;
2926
2927         while (skb) {
2928                 struct sk_buff *next = skb->next;
2929
2930                 skb->next = NULL;
2931                 rc = xmit_one(skb, dev, txq, next != NULL);
2932                 if (unlikely(!dev_xmit_complete(rc))) {
2933                         skb->next = next;
2934                         goto out;
2935                 }
2936
2937                 skb = next;
2938                 if (netif_xmit_stopped(txq) && skb) {
2939                         rc = NETDEV_TX_BUSY;
2940                         break;
2941                 }
2942         }
2943
2944 out:
2945         *ret = rc;
2946         return skb;
2947 }
2948
2949 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2950                                           netdev_features_t features)
2951 {
2952         if (skb_vlan_tag_present(skb) &&
2953             !vlan_hw_offload_capable(features, skb->vlan_proto))
2954                 skb = __vlan_hwaccel_push_inside(skb);
2955         return skb;
2956 }
2957
2958 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2959 {
2960         netdev_features_t features;
2961
2962         features = netif_skb_features(skb);
2963         skb = validate_xmit_vlan(skb, features);
2964         if (unlikely(!skb))
2965                 goto out_null;
2966
2967         if (netif_needs_gso(skb, features)) {
2968                 struct sk_buff *segs;
2969
2970                 segs = skb_gso_segment(skb, features);
2971                 if (IS_ERR(segs)) {
2972                         goto out_kfree_skb;
2973                 } else if (segs) {
2974                         consume_skb(skb);
2975                         skb = segs;
2976                 }
2977         } else {
2978                 if (skb_needs_linearize(skb, features) &&
2979                     __skb_linearize(skb))
2980                         goto out_kfree_skb;
2981
2982                 /* If packet is not checksummed and device does not
2983                  * support checksumming for this protocol, complete
2984                  * checksumming here.
2985                  */
2986                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2987                         if (skb->encapsulation)
2988                                 skb_set_inner_transport_header(skb,
2989                                                                skb_checksum_start_offset(skb));
2990                         else
2991                                 skb_set_transport_header(skb,
2992                                                          skb_checksum_start_offset(skb));
2993                         if (!(features & NETIF_F_CSUM_MASK) &&
2994                             skb_checksum_help(skb))
2995                                 goto out_kfree_skb;
2996                 }
2997         }
2998
2999         return skb;
3000
3001 out_kfree_skb:
3002         kfree_skb(skb);
3003 out_null:
3004         atomic_long_inc(&dev->tx_dropped);
3005         return NULL;
3006 }
3007
3008 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
3009 {
3010         struct sk_buff *next, *head = NULL, *tail;
3011
3012         for (; skb != NULL; skb = next) {
3013                 next = skb->next;
3014                 skb->next = NULL;
3015
3016                 /* in case skb wont be segmented, point to itself */
3017                 skb->prev = skb;
3018
3019                 skb = validate_xmit_skb(skb, dev);
3020                 if (!skb)
3021                         continue;
3022
3023                 if (!head)
3024                         head = skb;
3025                 else
3026                         tail->next = skb;
3027                 /* If skb was segmented, skb->prev points to
3028                  * the last segment. If not, it still contains skb.
3029                  */
3030                 tail = skb->prev;
3031         }
3032         return head;
3033 }
3034
3035 static void qdisc_pkt_len_init(struct sk_buff *skb)
3036 {
3037         const struct skb_shared_info *shinfo = skb_shinfo(skb);
3038
3039         qdisc_skb_cb(skb)->pkt_len = skb->len;
3040
3041         /* To get more precise estimation of bytes sent on wire,
3042          * we add to pkt_len the headers size of all segments
3043          */
3044         if (shinfo->gso_size)  {
3045                 unsigned int hdr_len;
3046                 u16 gso_segs = shinfo->gso_segs;
3047
3048                 /* mac layer + network layer */
3049                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3050
3051                 /* + transport layer */
3052                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
3053                         hdr_len += tcp_hdrlen(skb);
3054                 else
3055                         hdr_len += sizeof(struct udphdr);
3056
3057                 if (shinfo->gso_type & SKB_GSO_DODGY)
3058                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3059                                                 shinfo->gso_size);
3060
3061                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3062         }
3063 }
3064
3065 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3066                                  struct net_device *dev,
3067                                  struct netdev_queue *txq)
3068 {
3069         spinlock_t *root_lock = qdisc_lock(q);
3070         bool contended;
3071         int rc;
3072
3073         qdisc_calculate_pkt_len(skb, q);
3074         /*
3075          * Heuristic to force contended enqueues to serialize on a
3076          * separate lock before trying to get qdisc main lock.
3077          * This permits __QDISC___STATE_RUNNING owner to get the lock more
3078          * often and dequeue packets faster.
3079          */
3080         contended = qdisc_is_running(q);
3081         if (unlikely(contended))
3082                 spin_lock(&q->busylock);
3083
3084         spin_lock(root_lock);
3085         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3086                 kfree_skb(skb);
3087                 rc = NET_XMIT_DROP;
3088         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3089                    qdisc_run_begin(q)) {
3090                 /*
3091                  * This is a work-conserving queue; there are no old skbs
3092                  * waiting to be sent out; and the qdisc is not running -
3093                  * xmit the skb directly.
3094                  */
3095
3096                 qdisc_bstats_update(q, skb);
3097
3098                 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3099                         if (unlikely(contended)) {
3100                                 spin_unlock(&q->busylock);
3101                                 contended = false;
3102                         }
3103                         __qdisc_run(q);
3104                 } else
3105                         qdisc_run_end(q);
3106
3107                 rc = NET_XMIT_SUCCESS;
3108         } else {
3109                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
3110                 if (qdisc_run_begin(q)) {
3111                         if (unlikely(contended)) {
3112                                 spin_unlock(&q->busylock);
3113                                 contended = false;
3114                         }
3115                         __qdisc_run(q);
3116                 }
3117         }
3118         spin_unlock(root_lock);
3119         if (unlikely(contended))
3120                 spin_unlock(&q->busylock);
3121         return rc;
3122 }
3123
3124 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3125 static void skb_update_prio(struct sk_buff *skb)
3126 {
3127         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
3128
3129         if (!skb->priority && skb->sk && map) {
3130                 unsigned int prioidx =
3131                         sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
3132
3133                 if (prioidx < map->priomap_len)
3134                         skb->priority = map->priomap[prioidx];
3135         }
3136 }
3137 #else
3138 #define skb_update_prio(skb)
3139 #endif
3140
3141 DEFINE_PER_CPU(int, xmit_recursion);
3142 EXPORT_SYMBOL(xmit_recursion);
3143
3144 #define RECURSION_LIMIT 10
3145
3146 /**
3147  *      dev_loopback_xmit - loop back @skb
3148  *      @net: network namespace this loopback is happening in
3149  *      @sk:  sk needed to be a netfilter okfn
3150  *      @skb: buffer to transmit
3151  */
3152 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3153 {
3154         skb_reset_mac_header(skb);
3155         __skb_pull(skb, skb_network_offset(skb));
3156         skb->pkt_type = PACKET_LOOPBACK;
3157         skb->ip_summed = CHECKSUM_UNNECESSARY;
3158         WARN_ON(!skb_dst(skb));
3159         skb_dst_force(skb);
3160         netif_rx_ni(skb);
3161         return 0;
3162 }
3163 EXPORT_SYMBOL(dev_loopback_xmit);
3164
#ifdef CONFIG_NET_EGRESS
/* Run the egress classifier list of @dev over @skb.
 *
 * Returns @skb when transmission should continue (no classifier attached,
 * or the verdict was OK / RECLASSIFY / anything unrecognised).  Returns
 * NULL when the skb was consumed here, in which case *@ret holds
 * NET_XMIT_DROP (shot) or NET_XMIT_SUCCESS (stolen, queued, redirected).
 */
static struct sk_buff *
sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
{
	struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list);
	struct tcf_result cl_res;

	if (!cl)
		return skb;

	/* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set
	 * earlier by the caller.
	 */
	qdisc_bstats_cpu_update(cl->q, skb);

	switch (tc_classify(skb, cl, &cl_res, false)) {
	case TC_ACT_OK:
	case TC_ACT_RECLASSIFY:
		skb->tc_index = TC_H_MIN(cl_res.classid);
		break;
	case TC_ACT_SHOT:
		qdisc_qstats_cpu_drop(cl->q);
		*ret = NET_XMIT_DROP;
		kfree_skb(skb);
		return NULL;
	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
		*ret = NET_XMIT_SUCCESS;
		kfree_skb(skb);
		return NULL;
	case TC_ACT_REDIRECT:
		/* No need to push/pop skb's mac_header here on egress! */
		skb_do_redirect(skb);
		*ret = NET_XMIT_SUCCESS;
		return NULL;
	default:
		break;
	}

	return skb;
}
#endif /* CONFIG_NET_EGRESS */
3207
/* Look up a tx queue for @skb in the XPS map of @dev.
 *
 * The map is indexed by skb->sender_cpu - 1.  A single-entry map yields
 * that entry; larger maps are indexed by the skb hash scaled to the map
 * length.  Returns the queue index, or -1 when XPS is compiled out, no
 * map applies, or the chosen queue is not below real_num_tx_queues.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *maps;
	struct xps_map *cpu_map;
	int queue = -1;

	rcu_read_lock();

	maps = rcu_dereference(dev->xps_maps);
	if (!maps)
		goto unlock;

	cpu_map = rcu_dereference(maps->cpu_map[skb->sender_cpu - 1]);
	if (!cpu_map)
		goto unlock;

	/* The hash is only computed when there is a real choice to make. */
	if (cpu_map->len == 1)
		queue = cpu_map->queues[0];
	else
		queue = cpu_map->queues[reciprocal_scale(skb_get_hash(skb),
							 cpu_map->len)];

	if (unlikely(queue >= dev->real_num_tx_queues))
		queue = -1;

unlock:
	rcu_read_unlock();

	return queue;
#else
	return -1;
#endif
}
3237
3238 static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3239 {
3240         struct sock *sk = skb->sk;
3241         int queue_index = sk_tx_queue_get(sk);
3242
3243         if (queue_index < 0 || skb->ooo_okay ||
3244             queue_index >= dev->real_num_tx_queues) {
3245                 int new_index = get_xps_queue(dev, skb);
3246                 if (new_index < 0)
3247                         new_index = skb_tx_hash(dev, skb);
3248
3249                 if (queue_index != new_index && sk &&
3250                     sk_fullsock(sk) &&
3251                     rcu_access_pointer(sk->sk_dst_cache))
3252                         sk_tx_queue_set(sk, new_index);
3253
3254                 queue_index = new_index;
3255         }
3256
3257         return queue_index;
3258 }
3259
3260 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3261                                     struct sk_buff *skb,
3262                                     void *accel_priv)
3263 {
3264         int queue_index = 0;
3265
3266 #ifdef CONFIG_XPS
3267         u32 sender_cpu = skb->sender_cpu - 1;
3268
3269         if (sender_cpu >= (u32)NR_CPUS)
3270                 skb->sender_cpu = raw_smp_processor_id() + 1;
3271 #endif
3272
3273         if (dev->real_num_tx_queues != 1) {
3274                 const struct net_device_ops *ops = dev->netdev_ops;
3275                 if (ops->ndo_select_queue)
3276                         queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3277                                                             __netdev_pick_tx);
3278                 else
3279                         queue_index = __netdev_pick_tx(dev, skb);
3280
3281                 if (!accel_priv)
3282                         queue_index = netdev_cap_txqueue(dev, queue_index);
3283         }
3284
3285         skb_set_queue_mapping(skb, queue_index);
3286         return netdev_get_tx_queue(dev, queue_index);
3287 }
3288
3289 /**
3290  *      __dev_queue_xmit - transmit a buffer
3291  *      @skb: buffer to transmit
3292  *      @accel_priv: private data used for L2 forwarding offload
3293  *
3294  *      Queue a buffer for transmission to a network device. The caller must
3295  *      have set the device and priority and built the buffer before calling
3296  *      this function. The function can be called from an interrupt.
3297  *
3298  *      A negative errno code is returned on a failure. A success does not
3299  *      guarantee the frame will be transmitted as it may be dropped due
3300  *      to congestion or traffic shaping.
3301  *
3302  * -----------------------------------------------------------------------------------
3303  *      I notice this method can also return errors from the queue disciplines,
3304  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
3305  *      be positive.
3306  *
3307  *      Regardless of the return value, the skb is consumed, so it is currently
3308  *      difficult to retry a send to this method.  (You can bump the ref count
3309  *      before sending to hold a reference for retry if you are careful.)
3310  *
3311  *      When calling this method, interrupts MUST be enabled.  This is because
3312  *      the BH enable code must have IRQs enabled so that it will not deadlock.
3313  *          --BLG
3314  */
3315 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3316 {
3317         struct net_device *dev = skb->dev;
3318         struct netdev_queue *txq;
3319         struct Qdisc *q;
3320         int rc = -ENOMEM;
3321
3322         skb_reset_mac_header(skb);
3323
3324         if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3325                 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3326
3327         /* Disable soft irqs for various locks below. Also
3328          * stops preemption for RCU.
3329          */
3330         rcu_read_lock_bh();
3331
3332         skb_update_prio(skb);
3333
3334         qdisc_pkt_len_init(skb);
3335 #ifdef CONFIG_NET_CLS_ACT
3336         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
3337 # ifdef CONFIG_NET_EGRESS
3338         if (static_key_false(&egress_needed)) {
3339                 skb = sch_handle_egress(skb, &rc, dev);
3340                 if (!skb)
3341                         goto out;
3342         }
3343 # endif
3344 #endif
3345         /* If device/qdisc don't need skb->dst, release it right now while
3346          * its hot in this cpu cache.
3347          */
3348         if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3349                 skb_dst_drop(skb);
3350         else
3351                 skb_dst_force(skb);
3352
3353 #ifdef CONFIG_NET_SWITCHDEV
3354         /* Don't forward if offload device already forwarded */
3355         if (skb->offload_fwd_mark &&
3356             skb->offload_fwd_mark == dev->offload_fwd_mark) {
3357                 consume_skb(skb);
3358                 rc = NET_XMIT_SUCCESS;
3359                 goto out;
3360         }
3361 #endif
3362
3363         txq = netdev_pick_tx(dev, skb, accel_priv);
3364         q = rcu_dereference_bh(txq->qdisc);
3365
3366         trace_net_dev_queue(skb);
3367         if (q->enqueue) {
3368                 rc = __dev_xmit_skb(skb, q, dev, txq);
3369                 goto out;
3370         }
3371
3372         /* The device has no queue. Common case for software devices:
3373            loopback, all the sorts of tunnels...
3374
3375            Really, it is unlikely that netif_tx_lock protection is necessary
3376            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
3377            counters.)
3378            However, it is possible, that they rely on protection
3379            made by us here.
3380
3381            Check this and shot the lock. It is not prone from deadlocks.
3382            Either shot noqueue qdisc, it is even simpler 8)
3383          */
3384         if (dev->flags & IFF_UP) {
3385                 int cpu = smp_processor_id(); /* ok because BHs are off */
3386
3387                 if (txq->xmit_lock_owner != cpu) {
3388
3389                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
3390                                 goto recursion_alert;
3391
3392                         skb = validate_xmit_skb(skb, dev);
3393                         if (!skb)
3394                                 goto out;
3395
3396                         HARD_TX_LOCK(dev, txq, cpu);
3397
3398                         if (!netif_xmit_stopped(txq)) {
3399                                 __this_cpu_inc(xmit_recursion);
3400                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3401                                 __this_cpu_dec(xmit_recursion);
3402                                 if (dev_xmit_complete(rc)) {
3403                                         HARD_TX_UNLOCK(dev, txq);
3404                                         goto out;
3405                                 }
3406                         }
3407                         HARD_TX_UNLOCK(dev, txq);
3408                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3409                                              dev->name);
3410                 } else {
3411                         /* Recursion is detected! It is possible,
3412                          * unfortunately
3413                          */
3414 recursion_alert:
3415                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3416                                              dev->name);
3417                 }
3418         }
3419
3420         rc = -ENETDOWN;
3421         rcu_read_unlock_bh();
3422
3423         atomic_long_inc(&dev->tx_dropped);
3424         kfree_skb_list(skb);
3425         return rc;
3426 out:
3427         rcu_read_unlock_bh();
3428         return rc;
3429 }
3430
3431 int dev_queue_xmit(struct sk_buff *skb)
3432 {
3433         return __dev_queue_xmit(skb, NULL);
3434 }
3435 EXPORT_SYMBOL(dev_queue_xmit);
3436
/**
 *	dev_queue_xmit_accel - hand a buffer to the transmit path
 *	@skb: buffer to transmit
 *	@accel_priv: opaque pointer forwarded unchanged to __dev_queue_xmit()
 *
 *	Same as dev_queue_xmit() except that the caller supplies the
 *	accel_priv argument.  Returns the result of __dev_queue_xmit().
 */
int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
{
	int rc;

	rc = __dev_queue_xmit(skb, accel_priv);

	return rc;
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
3442
3443
3444 /*=======================================================================
3445                         Receiver routines
3446   =======================================================================*/
3447
/* Upper bound on the per-CPU input_pkt_queue length: enqueue_to_backlog()
 * drops a packet once the queue is longer than this, and skb_flow_limit()
 * starts its per-flow accounting at half of this value.
 */
int netdev_max_backlog __read_mostly = 1000;
EXPORT_SYMBOL(netdev_max_backlog);

/* Passed as the condition to net_timestamp_check() on the queueing side
 * (netif_rx_internal(), netif_receive_skb_internal()); its negation is
 * passed in __netif_receive_skb_core().
 */
int netdev_tstamp_prequeue __read_mostly = 1;
/* NOTE(review): not referenced in this part of the file; presumably the
 * packet budget of one NET_RX softirq run - confirm against net_rx_action().
 */
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;            /* old backlog weight */
3454
3455 /* Called with irq disabled */
3456 static inline void ____napi_schedule(struct softnet_data *sd,
3457                                      struct napi_struct *napi)
3458 {
3459         list_add_tail(&napi->poll_list, &sd->poll_list);
3460         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3461 }
3462
3463 #ifdef CONFIG_RPS
3464
/* One global table that all flow-based protocols share. */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);
/* Mask splitting an rps_sock_flow_table entry: get_rps_cpu() takes the
 * bits inside the mask as the desired CPU and compares the bits outside
 * the mask against the packet hash.
 */
u32 rps_cpu_mask __read_mostly;
EXPORT_SYMBOL(rps_cpu_mask);

/* Static key tested by netif_rx_internal() and netif_receive_skb_internal()
 * before they call get_rps_cpu().
 */
struct static_key rps_needed __read_mostly;
3472
/*
 * set_rps_cpu - point a flow table entry at a new CPU.
 * @dev: receiving device
 * @skb: packet belonging to the flow
 * @rflow: flow entry currently used for the flow
 * @next_cpu: CPU to record (may be >= nr_cpu_ids, meaning "unset")
 *
 * For a valid @next_cpu with CONFIG_RFS_ACCEL enabled, first ask the driver
 * (ndo_rx_flow_steer) to steer the flow to the RX queue that
 * cpu_rmap_lookup_index() gives for @next_cpu.  If that succeeds, the entry
 * used from then on is the one in that queue's flow table, not @rflow.
 *
 * Returns the flow entry that was updated.
 */
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
	    struct rps_dev_flow *rflow, u16 next_cpu)
{
	if (next_cpu < nr_cpu_ids) {
#ifdef CONFIG_RFS_ACCEL
		struct netdev_rx_queue *rxqueue;
		struct rps_dev_flow_table *flow_table;
		struct rps_dev_flow *old_rflow;
		u32 flow_id;
		u16 rxq_index;
		int rc;

		/* Should we steer this flow to a different hardware queue? */
		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
		    !(dev->features & NETIF_F_NTUPLE))
			goto out;
		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
		/* Already arriving on the wanted queue: nothing to steer */
		if (rxq_index == skb_get_rx_queue(skb))
			goto out;

		rxqueue = dev->_rx + rxq_index;
		flow_table = rcu_dereference(rxqueue->rps_flow_table);
		if (!flow_table)
			goto out;
		flow_id = skb_get_hash(skb) & flow_table->mask;
		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
							rxq_index, flow_id);
		/* A negative return leaves the original entry in use */
		if (rc < 0)
			goto out;
		/* Switch to the entry in the target queue's table and store
		 * the filter id returned by the driver in it.
		 */
		old_rflow = rflow;
		rflow = &flow_table->flows[flow_id];
		rflow->filter = rc;
		/* Presumably so that only one entry refers to a given
		 * filter id - TODO confirm.
		 */
		if (old_rflow->filter == rflow->filter)
			old_rflow->filter = RPS_NO_FILTER;
	out:
#endif
		/* Snapshot the target CPU's queue head; get_rps_cpu() and
		 * rps_may_expire_flow() compare against this value later.
		 */
		rflow->last_qtail =
			per_cpu(softnet_data, next_cpu).input_queue_head;
	}

	rflow->cpu = next_cpu;
	return rflow;
}
3517
/*
 * get_rps_cpu is called from the receive path (netif_rx_internal() and
 * netif_receive_skb_internal()) and returns the target CPU for a given skb,
 * taken either from the flow tables (RFS) or from the RPS map of the
 * receiving queue.
 * rcu_read_lock must be held on entry.
 *
 * Returns the chosen CPU, or -1 when no CPU was selected.  *rflowp is only
 * written when the CPU comes from the per-queue flow table.
 */
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
		       struct rps_dev_flow **rflowp)
{
	const struct rps_sock_flow_table *sock_flow_table;
	struct netdev_rx_queue *rxqueue = dev->_rx;
	struct rps_dev_flow_table *flow_table;
	struct rps_map *map;
	int cpu = -1;
	u32 tcpu;
	u32 hash;

	/* Pick the RX queue the packet was recorded on; without a recorded
	 * queue the first queue (dev->_rx) is used.
	 */
	if (skb_rx_queue_recorded(skb)) {
		u16 index = skb_get_rx_queue(skb);

		if (unlikely(index >= dev->real_num_rx_queues)) {
			WARN_ONCE(dev->real_num_rx_queues > 1,
				  "%s received packet on queue %u, but number "
				  "of RX queues is %u\n",
				  dev->name, index, dev->real_num_rx_queues);
			goto done;
		}
		rxqueue += index;
	}

	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */

	flow_table = rcu_dereference(rxqueue->rps_flow_table);
	map = rcu_dereference(rxqueue->rps_map);
	if (!flow_table && !map)
		goto done;

	skb_reset_network_header(skb);
	hash = skb_get_hash(skb);
	/* A zero hash is treated as "no hash": no steering */
	if (!hash)
		goto done;

	sock_flow_table = rcu_dereference(rps_sock_flow_table);
	if (flow_table && sock_flow_table) {
		struct rps_dev_flow *rflow;
		u32 next_cpu;
		u32 ident;

		/* First check into global flow table if there is a match */
		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
		/* Bits outside rps_cpu_mask must equal the hash's bits,
		 * otherwise the slot belongs to another flow.
		 */
		if ((ident ^ hash) & ~rps_cpu_mask)
			goto try_rps;

		next_cpu = ident & rps_cpu_mask;

		/* OK, now we know there is a match,
		 * we can look at the local (per receive queue) flow table
		 */
		rflow = &flow_table->flows[hash & flow_table->mask];
		tcpu = rflow->cpu;

		/*
		 * If the desired CPU (where last recvmsg was done) is
		 * different from current CPU (one in the rx-queue flow
		 * table entry), switch if one of the following holds:
		 *   - Current CPU is unset (>= nr_cpu_ids).
		 *   - Current CPU is offline.
		 *   - The current CPU's queue tail has advanced beyond the
		 *     last packet that was enqueued using this table entry.
		 *     This guarantees that all previous packets for the flow
		 *     have been dequeued, thus preserving in order delivery.
		 */
		if (unlikely(tcpu != next_cpu) &&
		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
		      rflow->last_qtail)) >= 0)) {
			tcpu = next_cpu;
			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
		}

		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
			*rflowp = rflow;
			cpu = tcpu;
			goto done;
		}
	}

try_rps:

	/* Fall back to the RPS map: spread by hash over the map's CPUs */
	if (map) {
		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
		if (cpu_online(tcpu)) {
			cpu = tcpu;
			goto done;
		}
	}

done:
	return cpu;
}
3617
3618 #ifdef CONFIG_RFS_ACCEL
3619
3620 /**
3621  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3622  * @dev: Device on which the filter was set
3623  * @rxq_index: RX queue index
3624  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3625  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3626  *
3627  * Drivers that implement ndo_rx_flow_steer() should periodically call
3628  * this function for each installed filter and remove the filters for
3629  * which it returns %true.
3630  */
3631 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3632                          u32 flow_id, u16 filter_id)
3633 {
3634         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3635         struct rps_dev_flow_table *flow_table;
3636         struct rps_dev_flow *rflow;
3637         bool expire = true;
3638         unsigned int cpu;
3639
3640         rcu_read_lock();
3641         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3642         if (flow_table && flow_id <= flow_table->mask) {
3643                 rflow = &flow_table->flows[flow_id];
3644                 cpu = ACCESS_ONCE(rflow->cpu);
3645                 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3646                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3647                            rflow->last_qtail) <
3648                      (int)(10 * flow_table->mask)))
3649                         expire = false;
3650         }
3651         rcu_read_unlock();
3652         return expire;
3653 }
3654 EXPORT_SYMBOL(rps_may_expire_flow);
3655
3656 #endif /* CONFIG_RFS_ACCEL */
3657
3658 /* Called from hardirq (IPI) context */
3659 static void rps_trigger_softirq(void *data)
3660 {
3661         struct softnet_data *sd = data;
3662
3663         ____napi_schedule(sd, &sd->backlog);
3664         sd->received_rps++;
3665 }
3666
3667 #endif /* CONFIG_RPS */
3668
/*
 * Check whether @sd belongs to another CPU.
 * If it does, push it onto this CPU's pending IPI list, raise
 * NET_RX_SOFTIRQ and return 1.
 * If it is this CPU's own softnet_data (or RPS is not built in), return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local_sd = this_cpu_ptr(&softnet_data);

	if (sd == local_sd)
		return 0;

	/* Link sd in at the head of the local rps_ipi_list */
	sd->rps_ipi_next = local_sd->rps_ipi_list;
	local_sd->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
3689
#ifdef CONFIG_NET_FLOW_LIMIT
/* NOTE(review): not referenced in this part of the file; presumably the
 * default bucket count for the per-CPU flow limit table (4096) - confirm
 * against the code that allocates sd->flow_limit.
 */
int netdev_flow_limit_table_len __read_mostly = (1 << 12);
#endif
3693
3694 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3695 {
3696 #ifdef CONFIG_NET_FLOW_LIMIT
3697         struct sd_flow_limit *fl;
3698         struct softnet_data *sd;
3699         unsigned int old_flow, new_flow;
3700
3701         if (qlen < (netdev_max_backlog >> 1))
3702                 return false;
3703
3704         sd = this_cpu_ptr(&softnet_data);
3705
3706         rcu_read_lock();
3707         fl = rcu_dereference(sd->flow_limit);
3708         if (fl) {
3709                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3710                 old_flow = fl->history[fl->history_head];
3711                 fl->history[fl->history_head] = new_flow;
3712
3713                 fl->history_head++;
3714                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3715
3716                 if (likely(fl->buckets[old_flow]))
3717                         fl->buckets[old_flow]--;
3718
3719                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3720                         fl->count++;
3721                         rcu_read_unlock();
3722                         return true;
3723                 }
3724         }
3725         rcu_read_unlock();
3726 #endif
3727         return false;
3728 }
3729
/*
 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
 * queue (may be a remote CPU queue).
 *
 * @skb: packet to queue
 * @cpu: CPU whose softnet_data backlog receives the packet
 * @qtail: passed to input_queue_tail_incr_save() on a successful enqueue
 *
 * Returns NET_RX_SUCCESS when the packet was queued.  Returns NET_RX_DROP,
 * after freeing the skb and bumping the drop counters, when the device is
 * not running, the queue is longer than netdev_max_backlog, or
 * skb_flow_limit() rejects the packet.
 */
static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
			      unsigned int *qtail)
{
	struct softnet_data *sd;
	unsigned long flags;
	unsigned int qlen;

	sd = &per_cpu(softnet_data, cpu);

	local_irq_save(flags);

	rps_lock(sd);
	if (!netif_running(skb->dev))
		goto drop;
	qlen = skb_queue_len(&sd->input_pkt_queue);
	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
		/* Non-empty queue: just append.  The empty-queue case
		 * jumps back here after scheduling the backlog NAPI below.
		 */
		if (qlen) {
enqueue:
			__skb_queue_tail(&sd->input_pkt_queue, skb);
			input_queue_tail_incr_save(sd, qtail);
			rps_unlock(sd);
			local_irq_restore(flags);
			return NET_RX_SUCCESS;
		}

		/* Schedule NAPI for backlog device
		 * We can use non atomic operation since we own the queue lock
		 */
		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
			/* A remote sd is put on the IPI list by
			 * rps_ipi_queued(); the local one is scheduled directly.
			 */
			if (!rps_ipi_queued(sd))
				____napi_schedule(sd, &sd->backlog);
		}
		goto enqueue;
	}

drop:
	sd->dropped++;
	rps_unlock(sd);

	local_irq_restore(flags);

	atomic_long_inc(&skb->dev->rx_dropped);
	kfree_skb(skb);
	return NET_RX_DROP;
}
3779
3780 static int netif_rx_internal(struct sk_buff *skb)
3781 {
3782         int ret;
3783
3784         net_timestamp_check(netdev_tstamp_prequeue, skb);
3785
3786         trace_netif_rx(skb);
3787 #ifdef CONFIG_RPS
3788         if (static_key_false(&rps_needed)) {
3789                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3790                 int cpu;
3791
3792                 preempt_disable();
3793                 rcu_read_lock();
3794
3795                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3796                 if (cpu < 0)
3797                         cpu = smp_processor_id();
3798
3799                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3800
3801                 rcu_read_unlock();
3802                 preempt_enable();
3803         } else
3804 #endif
3805         {
3806                 unsigned int qtail;
3807                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3808                 put_cpu();
3809         }
3810         return ret;
3811 }
3812
/**
 *	netif_rx	-	post buffer to the network code
 *	@skb: buffer to post
 *
 *	Takes a packet from a device driver and queues it on a per-CPU
 *	backlog for the upper (protocol) levels to process.  The buffer
 *	may still be dropped, either here for congestion control or later
 *	by the protocol layers.
 *
 *	return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped)
 *
 */

int netif_rx(struct sk_buff *skb)
{
	int ret;

	trace_netif_rx_entry(skb);

	ret = netif_rx_internal(skb);

	return ret;
}
EXPORT_SYMBOL(netif_rx);
3835
/*
 * netif_rx_ni - queue @skb like netif_rx(), then run pending softirqs
 *
 * With preemption disabled, the packet is queued through
 * netif_rx_internal() and, if any softirq is pending afterwards,
 * do_softirq() is invoked before returning.  Returns the result of
 * netif_rx_internal().
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	trace_netif_rx_ni_entry(skb);

	preempt_disable();

	ret = netif_rx_internal(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return ret;
}
EXPORT_SYMBOL(netif_rx_ni);
3851
/*
 * net_tx_action - softirq handler for the transmit side
 * @h: softirq action descriptor (unused here)
 *
 * Works on this CPU's softnet_data in two steps:
 *  1. frees every skb parked on the completion queue;
 *  2. runs (or reschedules) every Qdisc parked on the output queue.
 * Both lists are detached with interrupts disabled and then processed
 * with interrupts enabled.
 */
static void net_tx_action(struct softirq_action *h)
{
	struct softnet_data *sd = this_cpu_ptr(&softnet_data);

	if (sd->completion_queue) {
		struct sk_buff *clist;

		/* Take the whole list in one go */
		local_irq_disable();
		clist = sd->completion_queue;
		sd->completion_queue = NULL;
		local_irq_enable();

		while (clist) {
			struct sk_buff *skb = clist;
			clist = clist->next;

			/* skbs on this list are expected to have no users left */
			WARN_ON(atomic_read(&skb->users));
			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
				trace_consume_skb(skb);
			else
				trace_kfree_skb(skb, net_tx_action);

			/* Only skbs without an fclone go through the deferred
			 * free path that __kfree_skb_flush() completes below.
			 */
			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
				__kfree_skb(skb);
			else
				__kfree_skb_defer(skb);
		}

		__kfree_skb_flush();
	}

	if (sd->output_queue) {
		struct Qdisc *head;

		/* Detach the list and reset the tail pointer to the empty state */
		local_irq_disable();
		head = sd->output_queue;
		sd->output_queue = NULL;
		sd->output_queue_tailp = &sd->output_queue;
		local_irq_enable();

		while (head) {
			struct Qdisc *q = head;
			spinlock_t *root_lock;

			head = head->next_sched;

			root_lock = qdisc_lock(q);
			if (spin_trylock(root_lock)) {
				/* Got the lock: clear SCHED and run the qdisc */
				smp_mb__before_atomic();
				clear_bit(__QDISC_STATE_SCHED,
					  &q->state);
				qdisc_run(q);
				spin_unlock(root_lock);
			} else {
				/* Lock is busy: try again later unless the
				 * qdisc has been deactivated, in which case
				 * only the SCHED bit is cleared.
				 */
				if (!test_bit(__QDISC_STATE_DEACTIVATED,
					      &q->state)) {
					__netif_reschedule(q);
				} else {
					smp_mb__before_atomic();
					clear_bit(__QDISC_STATE_SCHED,
						  &q->state);
				}
			}
		}
	}
}
3918
3919 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3920     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
/* This hook is defined here for ATM LANE.
 * It is only built when both bridging and ATM LANE are configured
 * (built-in or modular) and is exported GPL-only; the code that sets and
 * calls it is outside this part of the file.
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3925 #endif
3926
/*
 * sch_handle_ingress - run the ingress classifier chain on a received skb
 * @skb: packet being received
 * @pt_prev: pending packet handler; delivered and cleared before classifying
 * @ret: receives the result of delivering *@pt_prev
 * @orig_dev: device the packet originally arrived on
 *
 * Returns @skb when processing should continue, or NULL when the packet
 * was freed or redirected by the classifier verdict.  Without
 * CONFIG_NET_CLS_ACT this is a no-op that returns @skb.
 */
static inline struct sk_buff *
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
		   struct net_device *orig_dev)
{
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
	struct tcf_result cl_res;

	/* If there's at least one ingress present somewhere (so
	 * we get here via enabled static key), remaining devices
	 * that are not configured with an ingress qdisc will bail
	 * out here.
	 */
	if (!cl)
		return skb;
	/* Flush the pending handler before the classifier sees the skb */
	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	qdisc_skb_cb(skb)->pkt_len = skb->len;
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
	qdisc_bstats_cpu_update(cl->q, skb);

	switch (tc_classify(skb, cl, &cl_res, false)) {
	case TC_ACT_OK:
	case TC_ACT_RECLASSIFY:
		skb->tc_index = TC_H_MIN(cl_res.classid);
		break;
	case TC_ACT_SHOT:
		qdisc_qstats_cpu_drop(cl->q);
		/* fall through - a shot packet is freed like the cases below */
	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
		kfree_skb(skb);
		return NULL;
	case TC_ACT_REDIRECT:
		/* skb_mac_header check was done by cls/act_bpf, so
		 * we can safely push the L2 header back before
		 * redirecting to another netdev
		 */
		__skb_push(skb, skb->mac_len);
		skb_do_redirect(skb);
		return NULL;
	default:
		break;
	}
#endif /* CONFIG_NET_CLS_ACT */
	return skb;
}
3976
3977 /**
3978  *      netdev_rx_handler_register - register receive handler
3979  *      @dev: device to register a handler for
3980  *      @rx_handler: receive handler to register
3981  *      @rx_handler_data: data pointer that is used by rx handler
3982  *
3983  *      Register a receive handler for a device. This handler will then be
3984  *      called from __netif_receive_skb. A negative errno code is returned
3985  *      on a failure.
3986  *
3987  *      The caller must hold the rtnl_mutex.
3988  *
3989  *      For a general description of rx_handler, see enum rx_handler_result.
3990  */
3991 int netdev_rx_handler_register(struct net_device *dev,
3992                                rx_handler_func_t *rx_handler,
3993                                void *rx_handler_data)
3994 {
3995         ASSERT_RTNL();
3996
3997         if (dev->rx_handler)
3998                 return -EBUSY;
3999
4000         /* Note: rx_handler_data must be set before rx_handler */
4001         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
4002         rcu_assign_pointer(dev->rx_handler, rx_handler);
4003
4004         return 0;
4005 }
4006 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
4007
/**
 *	netdev_rx_handler_unregister - unregister receive handler
 *	@dev: device to unregister a handler from
 *
 *	Unregister a receive handler from a device.  The handler pointer is
 *	cleared first and the handler data only after synchronize_net()
 *	has returned.
 *
 *	The caller must hold the rtnl_mutex.
 */
void netdev_rx_handler_unregister(struct net_device *dev)
{

	ASSERT_RTNL();
	RCU_INIT_POINTER(dev->rx_handler, NULL);
	/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
	 * section has a guarantee to see a non NULL rx_handler_data
	 * as well.
	 */
	synchronize_net();
	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
4029
4030 /*
4031  * Limit the use of PFMEMALLOC reserves to those protocols that implement
4032  * the special handling of PFMEMALLOC skbs.
4033  */
4034 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
4035 {
4036         switch (skb->protocol) {
4037         case htons(ETH_P_ARP):
4038         case htons(ETH_P_IP):
4039         case htons(ETH_P_IPV6):
4040         case htons(ETH_P_8021Q):
4041         case htons(ETH_P_8021AD):
4042                 return true;
4043         default:
4044                 return false;
4045         }
4046 }
4047
/*
 * nf_ingress - run the netfilter ingress hook on @skb, if one is active
 *
 * When the hook is active, a pending handler in *@pt_prev is delivered
 * first (its result stored in *@ret, *@pt_prev cleared) and the verdict of
 * nf_hook_ingress() is returned.  Returns 0 when the hook is inactive or
 * CONFIG_NETFILTER_INGRESS is off.
 */
static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
			     int *ret, struct net_device *orig_dev)
{
#ifdef CONFIG_NETFILTER_INGRESS
	if (!nf_hook_ingress_active(skb))
		return 0;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	return nf_hook_ingress(skb);
#else
	return 0;
#endif /* CONFIG_NETFILTER_INGRESS */
}
4063
/*
 * __netif_receive_skb_core - main receive demultiplexer
 * @skb: packet to process
 * @pfmemalloc: true when the skb was allocated from PFMEMALLOC reserves;
 *	the packet taps are then skipped and only protocols accepted by
 *	skb_pfmemalloc_protocol() are delivered
 *
 * Walks the packet through, in order: VLAN untagging, the ptype_all taps,
 * ingress classification/netfilter, VLAN receive, the device rx_handler
 * and finally the protocol handlers.
 *
 * Delivery is deferred by one step: a matching handler is remembered in
 * pt_prev and handed the skb via deliver_skb() only when the next stage
 * needs to run; the last remembered handler is called directly at the end.
 *
 * Returns the result of the last handler invoked, NET_RX_SUCCESS when an
 * rx_handler consumed the packet, or NET_RX_DROP.
 */
static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
{
	struct packet_type *ptype, *pt_prev;
	rx_handler_func_t *rx_handler;
	struct net_device *orig_dev;
	bool deliver_exact = false;
	int ret = NET_RX_DROP;
	__be16 type;

	net_timestamp_check(!netdev_tstamp_prequeue, skb);

	trace_netif_receive_skb(skb);

	orig_dev = skb->dev;

	skb_reset_network_header(skb);
	if (!skb_transport_header_was_set(skb))
		skb_reset_transport_header(skb);
	skb_reset_mac_len(skb);

	pt_prev = NULL;

	/* Re-entered when vlan_do_receive() or an rx_handler asks for the
	 * packet to be processed again (skb->dev may have changed).
	 */
another_round:
	skb->skb_iif = skb->dev->ifindex;

	__this_cpu_inc(softnet_data.processed);

	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
		skb = skb_vlan_untag(skb);
		if (unlikely(!skb))
			goto out;
	}

#ifdef CONFIG_NET_CLS_ACT
	/* TC_NCLS set: clear it and skip taps and ingress processing */
	if (skb->tc_verd & TC_NCLS) {
		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
		goto ncls;
	}
#endif

	if (pfmemalloc)
		goto skip_taps;

	/* Global taps first, then the taps bound to the receiving device */
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (pt_prev)
			ret = deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}

	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
		if (pt_prev)
			ret = deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}

skip_taps:
#ifdef CONFIG_NET_INGRESS
	if (static_key_false(&ingress_needed)) {
		/* A NULL return means the skb is gone (freed or redirected) */
		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
		if (!skb)
			goto out;

		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
			goto out;
	}
#endif
#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = 0;
ncls:
#endif
	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
		goto drop;

	if (skb_vlan_tag_present(skb)) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		if (vlan_do_receive(&skb))
			goto another_round;
		else if (unlikely(!skb))
			goto out;
	}

	rx_handler = rcu_dereference(skb->dev->rx_handler);
	if (rx_handler) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		switch (rx_handler(&skb)) {
		case RX_HANDLER_CONSUMED:
			ret = NET_RX_SUCCESS;
			goto out;
		case RX_HANDLER_ANOTHER:
			goto another_round;
		case RX_HANDLER_EXACT:
			deliver_exact = true;
			/* fall through - continue like RX_HANDLER_PASS */
		case RX_HANDLER_PASS:
			break;
		default:
			BUG();
		}
	}

	/* A VLAN tag still present here was not handled by vlan_do_receive() */
	if (unlikely(skb_vlan_tag_present(skb))) {
		if (skb_vlan_tag_get_id(skb))
			skb->pkt_type = PACKET_OTHERHOST;
		/* Note: we might in the future use prio bits
		 * and set skb->priority like in vlan_do_receive()
		 * For the time being, just ignore Priority Code Point
		 */
		skb->vlan_tci = 0;
	}

	type = skb->protocol;

	/* deliver only exact match when indicated */
	if (likely(!deliver_exact)) {
		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
				       &ptype_base[ntohs(type) &
						   PTYPE_HASH_MASK]);
	}

	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
			       &orig_dev->ptype_specific);

	/* The device changed along the way: also try its specific handlers */
	if (unlikely(skb->dev != orig_dev)) {
		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
				       &skb->dev->ptype_specific);
	}

	if (pt_prev) {
		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
			goto drop;
		else
			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
	} else {
drop:
		/* Account the drop: rx_nohandler when an exact match was
		 * required, rx_dropped otherwise.
		 */
		if (!deliver_exact)
			atomic_long_inc(&skb->dev->rx_dropped);
		else
			atomic_long_inc(&skb->dev->rx_nohandler);
		kfree_skb(skb);
		/* Jamal, now you will not able to escape explaining
		 * me how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

out:
	return ret;
}
4218
4219 static int __netif_receive_skb(struct sk_buff *skb)
4220 {
4221         int ret;
4222
4223         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4224                 unsigned long pflags = current->flags;
4225
4226                 /*
4227                  * PFMEMALLOC skbs are special, they should
4228                  * - be delivered to SOCK_MEMALLOC sockets only
4229                  * - stay away from userspace
4230                  * - have bounded memory usage
4231                  *
4232                  * Use PF_MEMALLOC as this saves us from propagating the allocation
4233                  * context down to all allocation sites.
4234                  */
4235                 current->flags |= PF_MEMALLOC;
4236                 ret = __netif_receive_skb_core(skb, true);
4237                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
4238         } else
4239                 ret = __netif_receive_skb_core(skb, false);
4240
4241         return ret;
4242 }
4243
4244 static int netif_receive_skb_internal(struct sk_buff *skb)
4245 {
4246         int ret;
4247
4248         net_timestamp_check(netdev_tstamp_prequeue, skb);
4249
4250         if (skb_defer_rx_timestamp(skb))
4251                 return NET_RX_SUCCESS;
4252
4253         rcu_read_lock();
4254
4255 #ifdef CONFIG_RPS
4256         if (static_key_false(&rps_needed)) {
4257                 struct rps_dev_flow voidflow, *rflow = &voidflow;
4258                 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4259
4260                 if (cpu >= 0) {
4261                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4262                         rcu_read_unlock();
4263                         return ret;
4264                 }
4265         }
4266 #endif
4267         ret = __netif_receive_skb(skb);
4268         rcu_read_unlock();
4269         return ret;
4270 }
4271
/**
 *	netif_receive_skb - process receive buffer from network
 *	@skb: buffer to process
 *
 *	Main entry point for receive data processing.  The buffer may be
 *	dropped during processing, either for congestion control or by
 *	the protocol layers.
 *
 *	This function may only be called from softirq context and interrupts
 *	should be enabled.
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	int ret;

	trace_netif_receive_skb_entry(skb);

	ret = netif_receive_skb_internal(skb);

	return ret;
}
EXPORT_SYMBOL(netif_receive_skb);
4294
4295 /* Network device is going away, flush any packets still pending
4296  * Called with irqs disabled.
4297  */
4298 static void flush_backlog(void *arg)
4299 {
4300         struct net_device *dev = arg;
4301         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4302         struct sk_buff *skb, *tmp;
4303
4304         rps_lock(sd);
4305         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4306                 if (skb->dev == dev) {
4307                         __skb_unlink(skb, &sd->input_pkt_queue);
4308                         kfree_skb(skb);
4309                         input_queue_head_incr(sd);
4310                 }
4311         }
4312         rps_unlock(sd);
4313
4314         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4315                 if (skb->dev == dev) {
4316                         __skb_unlink(skb, &sd->process_queue);
4317                         kfree_skb(skb);
4318                         input_queue_head_incr(sd);
4319                 }
4320         }
4321 }
4322
4323 static int napi_gro_complete(struct sk_buff *skb)
4324 {
4325         struct packet_offload *ptype;
4326         __be16 type = skb->protocol;
4327         struct list_head *head = &offload_base;
4328         int err = -ENOENT;
4329
4330         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4331
4332         if (NAPI_GRO_CB(skb)->count == 1) {
4333                 skb_shinfo(skb)->gso_size = 0;
4334                 goto out;
4335         }
4336
4337         rcu_read_lock();
4338         list_for_each_entry_rcu(ptype, head, list) {
4339                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4340                         continue;
4341
4342                 err = ptype->callbacks.gro_complete(skb, 0);
4343                 break;
4344         }
4345         rcu_read_unlock();
4346
4347         if (err) {
4348                 WARN_ON(&ptype->list == head);
4349                 kfree_skb(skb);
4350                 return NET_RX_SUCCESS;
4351         }
4352
4353 out:
4354         return netif_receive_skb_internal(skb);
4355 }
4356
4357 /* napi->gro_list contains packets ordered by age.
4358  * youngest packets at the head of it.
4359  * Complete skbs in reverse order to reduce latencies.
4360  */
4361 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4362 {
4363         struct sk_buff *skb, *prev = NULL;
4364
4365         /* scan list and build reverse chain */
4366         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4367                 skb->prev = prev;
4368                 prev = skb;
4369         }
4370
4371         for (skb = prev; skb; skb = prev) {
4372                 skb->next = NULL;
4373
4374                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4375                         return;
4376
4377                 prev = skb->prev;
4378                 napi_gro_complete(skb);
4379                 napi->gro_count--;
4380         }
4381
4382         napi->gro_list = NULL;
4383 }
4384 EXPORT_SYMBOL(napi_gro_flush);
4385
4386 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4387 {
4388         struct sk_buff *p;
4389         unsigned int maclen = skb->dev->hard_header_len;
4390         u32 hash = skb_get_hash_raw(skb);
4391
4392         for (p = napi->gro_list; p; p = p->next) {
4393                 unsigned long diffs;
4394
4395                 NAPI_GRO_CB(p)->flush = 0;
4396
4397                 if (hash != skb_get_hash_raw(p)) {
4398                         NAPI_GRO_CB(p)->same_flow = 0;
4399                         continue;
4400                 }
4401
4402                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4403                 diffs |= p->vlan_tci ^ skb->vlan_tci;
4404                 diffs |= skb_metadata_dst_cmp(p, skb);
4405                 if (maclen == ETH_HLEN)
4406                         diffs |= compare_ether_header(skb_mac_header(p),
4407                                                       skb_mac_header(skb));
4408                 else if (!diffs)
4409                         diffs = memcmp(skb_mac_header(p),
4410                                        skb_mac_header(skb),
4411                                        maclen);
4412                 NAPI_GRO_CB(p)->same_flow = !diffs;
4413         }
4414 }
4415
4416 static void skb_gro_reset_offset(struct sk_buff *skb)
4417 {
4418         const struct skb_shared_info *pinfo = skb_shinfo(skb);
4419         const skb_frag_t *frag0 = &pinfo->frags[0];
4420
4421         NAPI_GRO_CB(skb)->data_offset = 0;
4422         NAPI_GRO_CB(skb)->frag0 = NULL;
4423         NAPI_GRO_CB(skb)->frag0_len = 0;
4424
4425         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4426             pinfo->nr_frags &&
4427             !PageHighMem(skb_frag_page(frag0))) {
4428                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4429                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
4430         }
4431 }
4432
4433 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4434 {
4435         struct skb_shared_info *pinfo = skb_shinfo(skb);
4436
4437         BUG_ON(skb->end - skb->tail < grow);
4438
4439         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4440
4441         skb->data_len -= grow;
4442         skb->tail += grow;
4443
4444         pinfo->frags[0].page_offset += grow;
4445         skb_frag_size_sub(&pinfo->frags[0], grow);
4446
4447         if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4448                 skb_frag_unref(skb, 0);
4449                 memmove(pinfo->frags, pinfo->frags + 1,
4450                         --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4451         }
4452 }
4453
4454 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4455 {
4456         struct sk_buff **pp = NULL;
4457         struct packet_offload *ptype;
4458         __be16 type = skb->protocol;
4459         struct list_head *head = &offload_base;
4460         int same_flow;
4461         enum gro_result ret;
4462         int grow;
4463
4464         if (!(skb->dev->features & NETIF_F_GRO))
4465                 goto normal;
4466
4467         if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4468                 goto normal;
4469
4470         gro_list_prepare(napi, skb);
4471
4472         rcu_read_lock();
4473         list_for_each_entry_rcu(ptype, head, list) {
4474                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4475                         continue;
4476
4477                 skb_set_network_header(skb, skb_gro_offset(skb));
4478                 skb_reset_mac_len(skb);
4479                 NAPI_GRO_CB(skb)->same_flow = 0;
4480                 NAPI_GRO_CB(skb)->flush = 0;
4481                 NAPI_GRO_CB(skb)->free = 0;
4482                 NAPI_GRO_CB(skb)->encap_mark = 0;
4483                 NAPI_GRO_CB(skb)->is_fou = 0;
4484                 NAPI_GRO_CB(skb)->is_atomic = 1;
4485                 NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4486
4487                 /* Setup for GRO checksum validation */
4488                 switch (skb->ip_summed) {
4489                 case CHECKSUM_COMPLETE:
4490                         NAPI_GRO_CB(skb)->csum = skb->csum;
4491                         NAPI_GRO_CB(skb)->csum_valid = 1;
4492                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4493                         break;
4494                 case CHECKSUM_UNNECESSARY:
4495                         NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4496                         NAPI_GRO_CB(skb)->csum_valid = 0;
4497                         break;
4498                 default:
4499                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4500                         NAPI_GRO_CB(skb)->csum_valid = 0;
4501                 }
4502
4503                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4504                 break;
4505         }
4506         rcu_read_unlock();
4507
4508         if (&ptype->list == head)
4509                 goto normal;
4510
4511         same_flow = NAPI_GRO_CB(skb)->same_flow;
4512         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4513
4514         if (pp) {
4515                 struct sk_buff *nskb = *pp;
4516
4517                 *pp = nskb->next;
4518                 nskb->next = NULL;
4519                 napi_gro_complete(nskb);
4520                 napi->gro_count--;
4521         }
4522
4523         if (same_flow)
4524                 goto ok;
4525
4526         if (NAPI_GRO_CB(skb)->flush)
4527                 goto normal;
4528
4529         if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4530                 struct sk_buff *nskb = napi->gro_list;
4531
4532                 /* locate the end of the list to select the 'oldest' flow */
4533                 while (nskb->next) {
4534                         pp = &nskb->next;
4535                         nskb = *pp;
4536                 }
4537                 *pp = NULL;
4538                 nskb->next = NULL;
4539                 napi_gro_complete(nskb);
4540         } else {
4541                 napi->gro_count++;
4542         }
4543         NAPI_GRO_CB(skb)->count = 1;
4544         NAPI_GRO_CB(skb)->age = jiffies;
4545         NAPI_GRO_CB(skb)->last = skb;
4546         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4547         skb->next = napi->gro_list;
4548         napi->gro_list = skb;
4549         ret = GRO_HELD;
4550
4551 pull:
4552         grow = skb_gro_offset(skb) - skb_headlen(skb);
4553         if (grow > 0)
4554                 gro_pull_from_frag0(skb, grow);
4555 ok:
4556         return ret;
4557
4558 normal:
4559         ret = GRO_NORMAL;
4560         goto pull;
4561 }
4562
4563 struct packet_offload *gro_find_receive_by_type(__be16 type)
4564 {
4565         struct list_head *offload_head = &offload_base;
4566         struct packet_offload *ptype;
4567
4568         list_for_each_entry_rcu(ptype, offload_head, list) {
4569                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4570                         continue;
4571                 return ptype;
4572         }
4573         return NULL;
4574 }
4575 EXPORT_SYMBOL(gro_find_receive_by_type);
4576
4577 struct packet_offload *gro_find_complete_by_type(__be16 type)
4578 {
4579         struct list_head *offload_head = &offload_base;
4580         struct packet_offload *ptype;
4581
4582         list_for_each_entry_rcu(ptype, offload_head, list) {
4583                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4584                         continue;
4585                 return ptype;
4586         }
4587         return NULL;
4588 }
4589 EXPORT_SYMBOL(gro_find_complete_by_type);
4590
4591 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4592 {
4593         switch (ret) {
4594         case GRO_NORMAL:
4595                 if (netif_receive_skb_internal(skb))
4596                         ret = GRO_DROP;
4597                 break;
4598
4599         case GRO_DROP:
4600                 kfree_skb(skb);
4601                 break;
4602
4603         case GRO_MERGED_FREE:
4604                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
4605                         skb_dst_drop(skb);
4606                         kmem_cache_free(skbuff_head_cache, skb);
4607                 } else {
4608                         __kfree_skb(skb);
4609                 }
4610                 break;
4611
4612         case GRO_HELD:
4613         case GRO_MERGED:
4614                 break;
4615         }
4616
4617         return ret;
4618 }
4619
4620 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4621 {
4622         skb_mark_napi_id(skb, napi);
4623         trace_napi_gro_receive_entry(skb);
4624
4625         skb_gro_reset_offset(skb);
4626
4627         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4628 }
4629 EXPORT_SYMBOL(napi_gro_receive);
4630
4631 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4632 {
4633         if (unlikely(skb->pfmemalloc)) {
4634                 consume_skb(skb);
4635                 return;
4636         }
4637         __skb_pull(skb, skb_headlen(skb));
4638         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4639         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4640         skb->vlan_tci = 0;
4641         skb->dev = napi->dev;
4642         skb->skb_iif = 0;
4643         skb->encapsulation = 0;
4644         skb_shinfo(skb)->gso_type = 0;
4645         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4646
4647         napi->skb = skb;
4648 }
4649
4650 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4651 {
4652         struct sk_buff *skb = napi->skb;
4653
4654         if (!skb) {
4655                 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4656                 if (skb) {
4657                         napi->skb = skb;
4658                         skb_mark_napi_id(skb, napi);
4659                 }
4660         }
4661         return skb;
4662 }
4663 EXPORT_SYMBOL(napi_get_frags);
4664
4665 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4666                                       struct sk_buff *skb,
4667                                       gro_result_t ret)
4668 {
4669         switch (ret) {
4670         case GRO_NORMAL:
4671         case GRO_HELD:
4672                 __skb_push(skb, ETH_HLEN);
4673                 skb->protocol = eth_type_trans(skb, skb->dev);
4674                 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4675                         ret = GRO_DROP;
4676                 break;
4677
4678         case GRO_DROP:
4679         case GRO_MERGED_FREE:
4680                 napi_reuse_skb(napi, skb);
4681                 break;
4682
4683         case GRO_MERGED:
4684                 break;
4685         }
4686
4687         return ret;
4688 }
4689
4690 /* Upper GRO stack assumes network header starts at gro_offset=0
4691  * Drivers could call both napi_gro_frags() and napi_gro_receive()
4692  * We copy ethernet header into skb->data to have a common layout.
4693  */
4694 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4695 {
4696         struct sk_buff *skb = napi->skb;
4697         const struct ethhdr *eth;
4698         unsigned int hlen = sizeof(*eth);
4699
4700         napi->skb = NULL;
4701
4702         skb_reset_mac_header(skb);
4703         skb_gro_reset_offset(skb);
4704
4705         eth = skb_gro_header_fast(skb, 0);
4706         if (unlikely(skb_gro_header_hard(skb, hlen))) {
4707                 eth = skb_gro_header_slow(skb, hlen, 0);
4708                 if (unlikely(!eth)) {
4709                         net_warn_ratelimited("%s: dropping impossible skb from %s\n",
4710                                              __func__, napi->dev->name);
4711                         napi_reuse_skb(napi, skb);
4712                         return NULL;
4713                 }
4714         } else {
4715                 gro_pull_from_frag0(skb, hlen);
4716                 NAPI_GRO_CB(skb)->frag0 += hlen;
4717                 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4718         }
4719         __skb_pull(skb, hlen);
4720
4721         /*
4722          * This works because the only protocols we care about don't require
4723          * special handling.
4724          * We'll fix it up properly in napi_frags_finish()
4725          */
4726         skb->protocol = eth->h_proto;
4727
4728         return skb;
4729 }
4730
4731 gro_result_t napi_gro_frags(struct napi_struct *napi)
4732 {
4733         struct sk_buff *skb = napi_frags_skb(napi);
4734
4735         if (!skb)
4736                 return GRO_DROP;
4737
4738         trace_napi_gro_frags_entry(skb);
4739
4740         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4741 }
4742 EXPORT_SYMBOL(napi_gro_frags);
4743
4744 /* Compute the checksum from gro_offset and return the folded value
4745  * after adding in any pseudo checksum.
4746  */
4747 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4748 {
4749         __wsum wsum;
4750         __sum16 sum;
4751
4752         wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4753
4754         /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4755         sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4756         if (likely(!sum)) {
4757                 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4758                     !skb->csum_complete_sw)
4759                         netdev_rx_csum_fault(skb->dev);
4760         }
4761
4762         NAPI_GRO_CB(skb)->csum = wsum;
4763         NAPI_GRO_CB(skb)->csum_valid = 1;
4764
4765         return sum;
4766 }
4767 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4768
/*
 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
 * Note: must be entered with local irqs disabled; local irqs are enabled
 * again by the time it returns.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *target = sd->rps_ipi_list;

	if (!target) {
		local_irq_enable();
		return;
	}

	/* Detach the whole list while irqs are still off. */
	sd->rps_ipi_list = NULL;

	local_irq_enable();

	/* Send pending IPI's to kick RPS processing on remote cpus. */
	do {
		struct softnet_data *following = target->rps_ipi_next;

		if (cpu_online(target->cpu))
			smp_call_function_single_async(target->cpu,
						       &target->csd);
		target = following;
	} while (target);
#else
	local_irq_enable();
#endif
}
4796
4797 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4798 {
4799 #ifdef CONFIG_RPS
4800         return sd->rps_ipi_list != NULL;
4801 #else
4802         return false;
4803 #endif
4804 }
4805
4806 static int process_backlog(struct napi_struct *napi, int quota)
4807 {
4808         int work = 0;
4809         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4810
4811         /* Check if we have pending ipi, its better to send them now,
4812          * not waiting net_rx_action() end.
4813          */
4814         if (sd_has_rps_ipi_waiting(sd)) {
4815                 local_irq_disable();
4816                 net_rps_action_and_irq_enable(sd);
4817         }
4818
4819         napi->weight = weight_p;
4820         local_irq_disable();
4821         while (1) {
4822                 struct sk_buff *skb;
4823
4824                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4825                         rcu_read_lock();
4826                         local_irq_enable();
4827                         __netif_receive_skb(skb);
4828                         rcu_read_unlock();
4829                         local_irq_disable();
4830                         input_queue_head_incr(sd);
4831                         if (++work >= quota) {
4832                                 local_irq_enable();
4833                                 return work;
4834                         }
4835                 }
4836
4837                 rps_lock(sd);
4838                 if (skb_queue_empty(&sd->input_pkt_queue)) {
4839                         /*
4840                          * Inline a custom version of __napi_complete().
4841                          * only current cpu owns and manipulates this napi,
4842                          * and NAPI_STATE_SCHED is the only possible flag set
4843                          * on backlog.
4844                          * We can use a plain write instead of clear_bit(),
4845                          * and we dont need an smp_mb() memory barrier.
4846                          */
4847                         napi->state = 0;
4848                         rps_unlock(sd);
4849
4850                         break;
4851                 }
4852
4853                 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4854                                            &sd->process_queue);
4855                 rps_unlock(sd);
4856         }
4857         local_irq_enable();
4858
4859         return work;
4860 }
4861
4862 /**
4863  * __napi_schedule - schedule for receive
4864  * @n: entry to schedule
4865  *
4866  * The entry's receive function will be scheduled to run.
4867  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4868  */
4869 void __napi_schedule(struct napi_struct *n)
4870 {
4871         unsigned long flags;
4872
4873         local_irq_save(flags);
4874         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4875         local_irq_restore(flags);
4876 }
4877 EXPORT_SYMBOL(__napi_schedule);
4878
4879 /**
4880  * __napi_schedule_irqoff - schedule for receive
4881  * @n: entry to schedule
4882  *
4883  * Variant of __napi_schedule() assuming hard irqs are masked
4884  */
4885 void __napi_schedule_irqoff(struct napi_struct *n)
4886 {
4887         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4888 }
4889 EXPORT_SYMBOL(__napi_schedule_irqoff);
4890
4891 void __napi_complete(struct napi_struct *n)
4892 {
4893         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4894
4895         list_del_init(&n->poll_list);
4896         smp_mb__before_atomic();
4897         clear_bit(NAPI_STATE_SCHED, &n->state);
4898 }
4899 EXPORT_SYMBOL(__napi_complete);
4900
4901 void napi_complete_done(struct napi_struct *n, int work_done)
4902 {
4903         unsigned long flags;
4904
4905         /*
4906          * don't let napi dequeue from the cpu poll list
4907          * just in case its running on a different cpu
4908          */
4909         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4910                 return;
4911
4912         if (n->gro_list) {
4913                 unsigned long timeout = 0;
4914
4915                 if (work_done)
4916                         timeout = n->dev->gro_flush_timeout;
4917
4918                 if (timeout)
4919                         hrtimer_start(&n->timer, ns_to_ktime(timeout),
4920                                       HRTIMER_MODE_REL_PINNED);
4921                 else
4922                         napi_gro_flush(n, false);
4923         }
4924         if (likely(list_empty(&n->poll_list))) {
4925                 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4926         } else {
4927                 /* If n->poll_list is not empty, we need to mask irqs */
4928                 local_irq_save(flags);
4929                 __napi_complete(n);
4930                 local_irq_restore(flags);
4931         }
4932 }
4933 EXPORT_SYMBOL(napi_complete_done);
4934
4935 /* must be called under rcu_read_lock(), as we dont take a reference */
4936 static struct napi_struct *napi_by_id(unsigned int napi_id)
4937 {
4938         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4939         struct napi_struct *napi;
4940
4941         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4942                 if (napi->napi_id == napi_id)
4943                         return napi;
4944
4945         return NULL;
4946 }
4947
#if defined(CONFIG_NET_RX_BUSY_POLL)
#define BUSY_POLL_BUDGET 8
/*
 * Busy-poll the napi instance recorded in sk->sk_napi_id on behalf of
 * socket @sk.
 *
 * Each round polls once, either through the driver's ndo_busy_poll or,
 * if the driver has none, through napi->poll with BUSY_POLL_BUDGET.
 * With @nonblock set only one round is made; otherwise rounds continue
 * until the socket receive queue is non-empty, a reschedule is needed,
 * the busy-loop timeout expires or the poll reports LL_FLUSH_FAILED.
 *
 * Returns true when the receive queue is non-empty afterwards, false
 * otherwise (including when no napi instance is found).
 */
bool sk_busy_loop(struct sock *sk, int nonblock)
{
	unsigned long end_time = nonblock ? 0 : sk_busy_loop_end_time(sk);
	int (*busy_poll)(struct napi_struct *dev);
	struct napi_struct *napi;
	int rc = 0;

	rcu_read_lock();

	napi = napi_by_id(sk->sk_napi_id);
	if (!napi)
		goto out;

	/* Note: ndo_busy_poll method is optional in linux-4.5 */
	busy_poll = napi->dev->netdev_ops->ndo_busy_poll;

	for (;;) {
		rc = 0;
		local_bh_disable();
		if (busy_poll) {
			rc = busy_poll(napi);
		} else if (napi_schedule_prep(napi)) {
			void *have = netpoll_poll_lock(napi);

			if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
				rc = napi->poll(napi, BUSY_POLL_BUDGET);
				trace_napi_poll(napi);
				/* Budget used up: complete and reschedule. */
				if (rc == BUSY_POLL_BUDGET) {
					napi_complete_done(napi, rc);
					napi_schedule(napi);
				}
			}
			netpoll_poll_unlock(have);
		}
		if (rc > 0)
			NET_ADD_STATS_BH(sock_net(sk),
					 LINUX_MIB_BUSYPOLLRXPACKETS, rc);
		local_bh_enable();

		if (rc == LL_FLUSH_FAILED)
			break; /* permanent failure */

		cpu_relax();

		if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
		    need_resched() || busy_loop_timeout(end_time))
			break;
	}

	rc = !skb_queue_empty(&sk->sk_receive_queue);
out:
	rcu_read_unlock();
	return rc;
}
EXPORT_SYMBOL(sk_busy_loop);

#endif /* CONFIG_NET_RX_BUSY_POLL */
5004
5005 void napi_hash_add(struct napi_struct *napi)
5006 {
5007         if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
5008             test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
5009                 return;
5010
5011         spin_lock(&napi_hash_lock);
5012
5013         /* 0..NR_CPUS+1 range is reserved for sender_cpu use */
5014         do {
5015                 if (unlikely(++napi_gen_id < NR_CPUS + 1))
5016                         napi_gen_id = NR_CPUS + 1;
5017         } while (napi_by_id(napi_gen_id));
5018         napi->napi_id = napi_gen_id;
5019
5020         hlist_add_head_rcu(&napi->napi_hash_node,
5021                            &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
5022
5023         spin_unlock(&napi_hash_lock);
5024 }
5025 EXPORT_SYMBOL_GPL(napi_hash_add);
5026
5027 /* Warning : caller is responsible to make sure rcu grace period
5028  * is respected before freeing memory containing @napi
5029  */
5030 bool napi_hash_del(struct napi_struct *napi)
5031 {
5032         bool rcu_sync_needed = false;
5033
5034         spin_lock(&napi_hash_lock);
5035
5036         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
5037                 rcu_sync_needed = true;
5038                 hlist_del_rcu(&napi->napi_hash_node);
5039         }
5040         spin_unlock(&napi_hash_lock);
5041         return rcu_sync_needed;
5042 }
5043 EXPORT_SYMBOL_GPL(napi_hash_del);
5044
5045 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
5046 {
5047         struct napi_struct *napi;
5048
5049         napi = container_of(timer, struct napi_struct, timer);
5050         if (napi->gro_list)
5051                 napi_schedule(napi);
5052
5053         return HRTIMER_NORESTART;
5054 }
5055
5056 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
5057                     int (*poll)(struct napi_struct *, int), int weight)
5058 {
5059         INIT_LIST_HEAD(&napi->poll_list);
5060         hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
5061         napi->timer.function = napi_watchdog;
5062         napi->gro_count = 0;
5063         napi->gro_list = NULL;
5064         napi->skb = NULL;
5065         napi->poll = poll;
5066         if (weight > NAPI_POLL_WEIGHT)
5067                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
5068                             weight, dev->name);
5069         napi->weight = weight;
5070         list_add(&napi->dev_list, &dev->napi_list);
5071         napi->dev = dev;
5072 #ifdef CONFIG_NETPOLL
5073         spin_lock_init(&napi->poll_lock);
5074         napi->poll_owner = -1;
5075 #endif
5076         set_bit(NAPI_STATE_SCHED, &napi->state);
5077         napi_hash_add(napi);
5078 }
5079 EXPORT_SYMBOL(netif_napi_add);
5080
5081 void napi_disable(struct napi_struct *n)
5082 {
5083         might_sleep();
5084         set_bit(NAPI_STATE_DISABLE, &n->state);
5085
5086         while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
5087                 msleep(1);
5088         while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
5089                 msleep(1);
5090
5091         hrtimer_cancel(&n->timer);
5092
5093         clear_bit(NAPI_STATE_DISABLE, &n->state);
5094 }
5095 EXPORT_SYMBOL(napi_disable);
5096
5097 /* Must be called in process context */
5098 void netif_napi_del(struct napi_struct *napi)
5099 {
5100         might_sleep();
5101         if (napi_hash_del(napi))
5102                 synchronize_net();
5103         list_del_init(&napi->dev_list);
5104         napi_free_frags(napi);
5105
5106         kfree_skb_list(napi->gro_list);
5107         napi->gro_list = NULL;
5108         napi->gro_count = 0;
5109 }
5110 EXPORT_SYMBOL(netif_napi_del);
5111
5112 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
5113 {
5114         void *have;
5115         int work, weight;
5116
5117         list_del_init(&n->poll_list);
5118
5119         have = netpoll_poll_lock(n);
5120
5121         weight = n->weight;
5122
5123         /* This NAPI_STATE_SCHED test is for avoiding a race
5124          * with netpoll's poll_napi().  Only the entity which
5125          * obtains the lock and sees NAPI_STATE_SCHED set will
5126          * actually make the ->poll() call.  Therefore we avoid
5127          * accidentally calling ->poll() when NAPI is not scheduled.
5128          */
5129         work = 0;
5130         if (test_bit(NAPI_STATE_SCHED, &n->state)) {
5131                 work = n->poll(n, weight);
5132                 trace_napi_poll(n);
5133         }
5134
5135         WARN_ON_ONCE(work > weight);
5136
5137         if (likely(work < weight))
5138                 goto out_unlock;
5139
5140         /* Drivers must not modify the NAPI state if they
5141          * consume the entire weight.  In such cases this code
5142          * still "owns" the NAPI instance and therefore can
5143          * move the instance around on the list at-will.
5144          */
5145         if (unlikely(napi_disable_pending(n))) {
5146                 napi_complete(n);
5147                 goto out_unlock;
5148         }
5149
5150         if (n->gro_list) {
5151                 /* flush too old packets
5152                  * If HZ < 1000, flush all packets.
5153                  */
5154                 napi_gro_flush(n, HZ >= 1000);
5155         }
5156
5157         /* Some drivers may have called napi_schedule
5158          * prior to exhausting their budget.
5159          */
5160         if (unlikely(!list_empty(&n->poll_list))) {
5161                 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
5162                              n->dev ? n->dev->name : "backlog");
5163                 goto out_unlock;
5164         }
5165
5166         list_add_tail(&n->poll_list, repoll);
5167
5168 out_unlock:
5169         netpoll_poll_unlock(have);
5170
5171         return work;
5172 }
5173
5174 static void net_rx_action(struct softirq_action *h)
5175 {
5176         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5177         unsigned long time_limit = jiffies + 2;
5178         int budget = netdev_budget;
5179         LIST_HEAD(list);
5180         LIST_HEAD(repoll);
5181
5182         local_irq_disable();
5183         list_splice_init(&sd->poll_list, &list);
5184         local_irq_enable();
5185
5186         for (;;) {
5187                 struct napi_struct *n;
5188
5189                 if (list_empty(&list)) {
5190                         if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
5191                                 return;
5192                         break;
5193                 }
5194
5195                 n = list_first_entry(&list, struct napi_struct, poll_list);
5196                 budget -= napi_poll(n, &repoll);
5197
5198                 /* If softirq window is exhausted then punt.
5199                  * Allow this to run for 2 jiffies since which will allow
5200                  * an average latency of 1.5/HZ.
5201                  */
5202                 if (unlikely(budget <= 0 ||
5203                              time_after_eq(jiffies, time_limit))) {
5204                         sd->time_squeeze++;
5205                         break;
5206                 }
5207         }
5208
5209         __kfree_skb_flush();
5210         local_irq_disable();
5211
5212         list_splice_tail_init(&sd->poll_list, &list);
5213         list_splice_tail(&repoll, &list);
5214         list_splice(&list, &sd->poll_list);
5215         if (!list_empty(&sd->poll_list))
5216                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
5217
5218         net_rps_action_and_irq_enable(sd);
5219 }
5220
5221 struct netdev_adjacent {
5222         struct net_device *dev;
5223
5224         /* upper master flag, there can only be one master device per list */
5225         bool master;
5226
5227         /* counter for the number of times this device was added to us */
5228         u16 ref_nr;
5229
5230         /* private field for the users */
5231         void *private;
5232
5233         struct list_head list;
5234         struct rcu_head rcu;
5235 };
5236
5237 static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5238                                                  struct list_head *adj_list)
5239 {
5240         struct netdev_adjacent *adj;
5241
5242         list_for_each_entry(adj, adj_list, list) {
5243                 if (adj->dev == adj_dev)
5244                         return adj;
5245         }
5246         return NULL;
5247 }
5248
5249 /**
5250  * netdev_has_upper_dev - Check if device is linked to an upper device
5251  * @dev: device
5252  * @upper_dev: upper device to check
5253  *
5254  * Find out if a device is linked to specified upper device and return true
5255  * in case it is. Note that this checks only immediate upper device,
5256  * not through a complete stack of devices. The caller must hold the RTNL lock.
5257  */
5258 bool netdev_has_upper_dev(struct net_device *dev,
5259                           struct net_device *upper_dev)
5260 {
5261         ASSERT_RTNL();
5262
5263         return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper);
5264 }
5265 EXPORT_SYMBOL(netdev_has_upper_dev);
5266
5267 /**
5268  * netdev_has_any_upper_dev - Check if device is linked to some device
5269  * @dev: device
5270  *
5271  * Find out if a device is linked to an upper device and return true in case
5272  * it is. The caller must hold the RTNL lock.
5273  */
5274 static bool netdev_has_any_upper_dev(struct net_device *dev)
5275 {
5276         ASSERT_RTNL();
5277
5278         return !list_empty(&dev->all_adj_list.upper);
5279 }
5280
5281 /**
5282  * netdev_master_upper_dev_get - Get master upper device
5283  * @dev: device
5284  *
5285  * Find a master upper device and return pointer to it or NULL in case
5286  * it's not there. The caller must hold the RTNL lock.
5287  */
5288 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5289 {
5290         struct netdev_adjacent *upper;
5291
5292         ASSERT_RTNL();
5293
5294         if (list_empty(&dev->adj_list.upper))
5295                 return NULL;
5296
5297         upper = list_first_entry(&dev->adj_list.upper,
5298                                  struct netdev_adjacent, list);
5299         if (likely(upper->master))
5300                 return upper->dev;
5301         return NULL;
5302 }
5303 EXPORT_SYMBOL(netdev_master_upper_dev_get);
5304
5305 void *netdev_adjacent_get_private(struct list_head *adj_list)
5306 {
5307         struct netdev_adjacent *adj;
5308
5309         adj = list_entry(adj_list, struct netdev_adjacent, list);
5310
5311         return adj->private;
5312 }
5313 EXPORT_SYMBOL(netdev_adjacent_get_private);
5314
5315 /**
5316  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5317  * @dev: device
5318  * @iter: list_head ** of the current position
5319  *
5320  * Gets the next device from the dev's upper list, starting from iter
5321  * position. The caller must hold RCU read lock.
5322  */
5323 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5324                                                  struct list_head **iter)
5325 {
5326         struct netdev_adjacent *upper;
5327
5328         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5329
5330         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5331
5332         if (&upper->list == &dev->adj_list.upper)
5333                 return NULL;
5334
5335         *iter = &upper->list;
5336
5337         return upper->dev;
5338 }
5339 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
5340
5341 /**
5342  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
5343  * @dev: device
5344  * @iter: list_head ** of the current position
5345  *
5346  * Gets the next device from the dev's upper list, starting from iter
5347  * position. The caller must hold RCU read lock.
5348  */
5349 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
5350                                                      struct list_head **iter)
5351 {
5352         struct netdev_adjacent *upper;
5353
5354         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5355
5356         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5357
5358         if (&upper->list == &dev->all_adj_list.upper)
5359                 return NULL;
5360
5361         *iter = &upper->list;
5362
5363         return upper->dev;
5364 }
5365 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
5366
5367 /**
5368  * netdev_lower_get_next_private - Get the next ->private from the
5369  *                                 lower neighbour list
5370  * @dev: device
5371  * @iter: list_head ** of the current position
5372  *
5373  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5374  * list, starting from iter position. The caller must hold either hold the
5375  * RTNL lock or its own locking that guarantees that the neighbour lower
5376  * list will remain unchanged.
5377  */
5378 void *netdev_lower_get_next_private(struct net_device *dev,
5379                                     struct list_head **iter)
5380 {
5381         struct netdev_adjacent *lower;
5382
5383         lower = list_entry(*iter, struct netdev_adjacent, list);
5384
5385         if (&lower->list == &dev->adj_list.lower)
5386                 return NULL;
5387
5388         *iter = lower->list.next;
5389
5390         return lower->private;
5391 }
5392 EXPORT_SYMBOL(netdev_lower_get_next_private);
5393
5394 /**
5395  * netdev_lower_get_next_private_rcu - Get the next ->private from the
5396  *                                     lower neighbour list, RCU
5397  *                                     variant
5398  * @dev: device
5399  * @iter: list_head ** of the current position
5400  *
5401  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5402  * list, starting from iter position. The caller must hold RCU read lock.
5403  */
5404 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5405                                         struct list_head **iter)
5406 {
5407         struct netdev_adjacent *lower;
5408
5409         WARN_ON_ONCE(!rcu_read_lock_held());
5410
5411         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5412
5413         if (&lower->list == &dev->adj_list.lower)
5414                 return NULL;
5415
5416         *iter = &lower->list;
5417
5418         return lower->private;
5419 }
5420 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5421
5422 /**
5423  * netdev_lower_get_next - Get the next device from the lower neighbour
5424  *                         list
5425  * @dev: device
5426  * @iter: list_head ** of the current position
5427  *
5428  * Gets the next netdev_adjacent from the dev's lower neighbour
5429  * list, starting from iter position. The caller must hold RTNL lock or
5430  * its own locking that guarantees that the neighbour lower
5431  * list will remain unchanged.
5432  */
5433 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5434 {
5435         struct netdev_adjacent *lower;
5436
5437         lower = list_entry(*iter, struct netdev_adjacent, list);
5438
5439         if (&lower->list == &dev->adj_list.lower)
5440                 return NULL;
5441
5442         *iter = lower->list.next;
5443
5444         return lower->dev;
5445 }
5446 EXPORT_SYMBOL(netdev_lower_get_next);
5447
5448 /**
5449  * netdev_lower_get_first_private_rcu - Get the first ->private from the
5450  *                                     lower neighbour list, RCU
5451  *                                     variant
5452  * @dev: device
5453  *
5454  * Gets the first netdev_adjacent->private from the dev's lower neighbour
5455  * list. The caller must hold RCU read lock.
5456  */
5457 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5458 {
5459         struct netdev_adjacent *lower;
5460
5461         lower = list_first_or_null_rcu(&dev->adj_list.lower,
5462                         struct netdev_adjacent, list);
5463         if (lower)
5464                 return lower->private;
5465         return NULL;
5466 }
5467 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5468
5469 /**
5470  * netdev_master_upper_dev_get_rcu - Get master upper device
5471  * @dev: device
5472  *
5473  * Find a master upper device and return pointer to it or NULL in case
5474  * it's not there. The caller must hold the RCU read lock.
5475  */
5476 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5477 {
5478         struct netdev_adjacent *upper;
5479
5480         upper = list_first_or_null_rcu(&dev->adj_list.upper,
5481                                        struct netdev_adjacent, list);
5482         if (upper && likely(upper->master))
5483                 return upper->dev;
5484         return NULL;
5485 }
5486 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5487
5488 static int netdev_adjacent_sysfs_add(struct net_device *dev,
5489                               struct net_device *adj_dev,
5490                               struct list_head *dev_list)
5491 {
5492         char linkname[IFNAMSIZ+7];
5493         sprintf(linkname, dev_list == &dev->adj_list.upper ?
5494                 "upper_%s" : "lower_%s", adj_dev->name);
5495         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5496                                  linkname);
5497 }
5498 static void netdev_adjacent_sysfs_del(struct net_device *dev,
5499                                char *name,
5500                                struct list_head *dev_list)
5501 {
5502         char linkname[IFNAMSIZ+7];
5503         sprintf(linkname, dev_list == &dev->adj_list.upper ?
5504                 "upper_%s" : "lower_%s", name);
5505         sysfs_remove_link(&(dev->dev.kobj), linkname);
5506 }
5507
5508 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5509                                                  struct net_device *adj_dev,
5510                                                  struct list_head *dev_list)
5511 {
5512         return (dev_list == &dev->adj_list.upper ||
5513                 dev_list == &dev->adj_list.lower) &&
5514                 net_eq(dev_net(dev), dev_net(adj_dev));
5515 }
5516
5517 static int __netdev_adjacent_dev_insert(struct net_device *dev,
5518                                         struct net_device *adj_dev,
5519                                         struct list_head *dev_list,
5520                                         void *private, bool master)
5521 {
5522         struct netdev_adjacent *adj;
5523         int ret;
5524
5525         adj = __netdev_find_adj(adj_dev, dev_list);
5526
5527         if (adj) {
5528                 adj->ref_nr++;
5529                 return 0;
5530         }
5531
5532         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5533         if (!adj)
5534                 return -ENOMEM;
5535
5536         adj->dev = adj_dev;
5537         adj->master = master;
5538         adj->ref_nr = 1;
5539         adj->private = private;
5540         dev_hold(adj_dev);
5541
5542         pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5543                  adj_dev->name, dev->name, adj_dev->name);
5544
5545         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5546                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5547                 if (ret)
5548                         goto free_adj;
5549         }
5550
5551         /* Ensure that master link is always the first item in list. */
5552         if (master) {
5553                 ret = sysfs_create_link(&(dev->dev.kobj),
5554                                         &(adj_dev->dev.kobj), "master");
5555                 if (ret)
5556                         goto remove_symlinks;
5557
5558                 list_add_rcu(&adj->list, dev_list);
5559         } else {
5560                 list_add_tail_rcu(&adj->list, dev_list);
5561         }
5562
5563         return 0;
5564
5565 remove_symlinks:
5566         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5567                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5568 free_adj:
5569         kfree(adj);
5570         dev_put(adj_dev);
5571
5572         return ret;
5573 }
5574
5575 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5576                                          struct net_device *adj_dev,
5577                                          struct list_head *dev_list)
5578 {
5579         struct netdev_adjacent *adj;
5580
5581         adj = __netdev_find_adj(adj_dev, dev_list);
5582
5583         if (!adj) {
5584                 pr_err("tried to remove device %s from %s\n",
5585                        dev->name, adj_dev->name);
5586                 BUG();
5587         }
5588
5589         if (adj->ref_nr > 1) {
5590                 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
5591                          adj->ref_nr-1);
5592                 adj->ref_nr--;
5593                 return;
5594         }
5595
5596         if (adj->master)
5597                 sysfs_remove_link(&(dev->dev.kobj), "master");
5598
5599         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5600                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5601
5602         list_del_rcu(&adj->list);
5603         pr_debug("dev_put for %s, because link removed from %s to %s\n",
5604                  adj_dev->name, dev->name, adj_dev->name);
5605         dev_put(adj_dev);
5606         kfree_rcu(adj, rcu);
5607 }
5608
/* Insert both halves of an adjacency: @upper_dev on @up_list (a list of
 * @dev) and @dev on @down_list (a list of @upper_dev).  Only the upward
 * half carries the @master flag.  If the second insertion fails the first
 * one is undone.  Returns 0 or the error from the failing insertion.
 */
static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
					    struct net_device *upper_dev,
					    struct list_head *up_list,
					    struct list_head *down_list,
					    void *private, bool master)
{
	int err;

	err = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
					   master);
	if (err)
		return err;

	err = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
					   false);
	if (err)
		__netdev_adjacent_dev_remove(dev, upper_dev, up_list);

	return err;
}
5631
5632 static int __netdev_adjacent_dev_link(struct net_device *dev,
5633                                       struct net_device *upper_dev)
5634 {
5635         return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5636                                                 &dev->all_adj_list.upper,
5637                                                 &upper_dev->all_adj_list.lower,
5638                                                 NULL, false);
5639 }
5640
/* Remove both halves of an adjacency: first @upper_dev from @up_list (a
 * list of @dev), then @dev from @down_list (a list of @upper_dev).
 */
static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
					       struct net_device *upper_dev,
					       struct list_head *up_list,
					       struct list_head *down_list)
{
	/* upward half */
	__netdev_adjacent_dev_remove(dev, upper_dev, up_list);

	/* downward half */
	__netdev_adjacent_dev_remove(upper_dev, dev, down_list);
}
5649
5650 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5651                                          struct net_device *upper_dev)
5652 {
5653         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5654                                            &dev->all_adj_list.upper,
5655                                            &upper_dev->all_adj_list.lower);
5656 }
5657
5658 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5659                                                 struct net_device *upper_dev,
5660                                                 void *private, bool master)
5661 {
5662         int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5663
5664         if (ret)
5665                 return ret;
5666
5667         ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5668                                                &dev->adj_list.upper,
5669                                                &upper_dev->adj_list.lower,
5670                                                private, master);
5671         if (ret) {
5672                 __netdev_adjacent_dev_unlink(dev, upper_dev);
5673                 return ret;
5674         }
5675
5676         return 0;
5677 }
5678
5679 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5680                                                    struct net_device *upper_dev)
5681 {
5682         __netdev_adjacent_dev_unlink(dev, upper_dev);
5683         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5684                                            &dev->adj_list.upper,
5685                                            &upper_dev->adj_list.lower);
5686 }
5687
5688 static int __netdev_upper_dev_link(struct net_device *dev,
5689                                    struct net_device *upper_dev, bool master,
5690                                    void *upper_priv, void *upper_info)
5691 {
5692         struct netdev_notifier_changeupper_info changeupper_info;
5693         struct netdev_adjacent *i, *j, *to_i, *to_j;
5694         int ret = 0;
5695
5696         ASSERT_RTNL();
5697
5698         if (dev == upper_dev)
5699                 return -EBUSY;
5700
5701         /* To prevent loops, check if dev is not upper device to upper_dev. */
5702         if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper))
5703                 return -EBUSY;
5704
5705         if (__netdev_find_adj(upper_dev, &dev->adj_list.upper))
5706                 return -EEXIST;
5707
5708         if (master && netdev_master_upper_dev_get(dev))
5709                 return -EBUSY;
5710
5711         changeupper_info.upper_dev = upper_dev;
5712         changeupper_info.master = master;
5713         changeupper_info.linking = true;
5714         changeupper_info.upper_info = upper_info;
5715
5716         ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5717                                             &changeupper_info.info);
5718         ret = notifier_to_errno(ret);
5719         if (ret)
5720                 return ret;
5721
5722         ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
5723                                                    master);
5724         if (ret)
5725                 return ret;
5726
5727         /* Now that we linked these devs, make all the upper_dev's
5728          * all_adj_list.upper visible to every dev's all_adj_list.lower an
5729          * versa, and don't forget the devices itself. All of these
5730          * links are non-neighbours.
5731          */
5732         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5733                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5734                         pr_debug("Interlinking %s with %s, non-neighbour\n",
5735                                  i->dev->name, j->dev->name);
5736                         ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5737                         if (ret)
5738                                 goto rollback_mesh;
5739                 }
5740         }
5741
5742         /* add dev to every upper_dev's upper device */
5743         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5744                 pr_debug("linking %s's upper device %s with %s\n",
5745                          upper_dev->name, i->dev->name, dev->name);
5746                 ret = __netdev_adjacent_dev_link(dev, i->dev);
5747                 if (ret)
5748                         goto rollback_upper_mesh;
5749         }
5750
5751         /* add upper_dev to every dev's lower device */
5752         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5753                 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5754                          i->dev->name, upper_dev->name);
5755                 ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5756                 if (ret)
5757                         goto rollback_lower_mesh;
5758         }
5759
5760         ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5761                                             &changeupper_info.info);
5762         ret = notifier_to_errno(ret);
5763         if (ret)
5764                 goto rollback_lower_mesh;
5765
5766         return 0;
5767
5768 rollback_lower_mesh:
5769         to_i = i;
5770         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5771                 if (i == to_i)
5772                         break;
5773                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5774         }
5775
5776         i = NULL;
5777
5778 rollback_upper_mesh:
5779         to_i = i;
5780         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5781                 if (i == to_i)
5782                         break;
5783                 __netdev_adjacent_dev_unlink(dev, i->dev);
5784         }
5785
5786         i = j = NULL;
5787
5788 rollback_mesh:
5789         to_i = i;
5790         to_j = j;
5791         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5792                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5793                         if (i == to_i && j == to_j)
5794                                 break;
5795                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5796                 }
5797                 if (i == to_i)
5798                         break;
5799         }
5800
5801         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5802
5803         return ret;
5804 }
5805
/**
 * netdev_upper_dev_link - Add a link to the upper device
 * @dev: device
 * @upper_dev: new upper device
 *
 * Links @upper_dev above @dev as a non-master upper device, with no
 * private data and no notifier info. The caller must hold the RTNL lock.
 * Returns zero on success, in which case the reference counts have been
 * adjusted, or a negative errno code on failure.
 */
int netdev_upper_dev_link(struct net_device *dev,
			  struct net_device *upper_dev)
{
	const bool master = false;

	return __netdev_upper_dev_link(dev, upper_dev, master, NULL, NULL);
}
EXPORT_SYMBOL(netdev_upper_dev_link);
5822
/**
 * netdev_master_upper_dev_link - Add a master link to the upper device
 * @dev: device
 * @upper_dev: new upper device
 * @upper_priv: upper device private
 * @upper_info: upper info to be passed down via notifier
 *
 * Links @upper_dev above @dev as its master. A device can have only one
 * master upper device, although other non-master upper devices may be
 * linked as well. The caller must hold the RTNL lock.
 * Returns zero on success, in which case the reference counts have been
 * adjusted, or a negative errno code on failure.
 */
int netdev_master_upper_dev_link(struct net_device *dev,
				 struct net_device *upper_dev,
				 void *upper_priv, void *upper_info)
{
	const bool master = true;

	return __netdev_upper_dev_link(dev, upper_dev, master,
				       upper_priv, upper_info);
}
EXPORT_SYMBOL(netdev_master_upper_dev_link);
5844
5845 /**
5846  * netdev_upper_dev_unlink - Removes a link to upper device
5847  * @dev: device
5848  * @upper_dev: new upper device
5849  *
5850  * Removes a link to device which is upper to this one. The caller must hold
5851  * the RTNL lock.
5852  */
5853 void netdev_upper_dev_unlink(struct net_device *dev,
5854                              struct net_device *upper_dev)
5855 {
5856         struct netdev_notifier_changeupper_info changeupper_info;
5857         struct netdev_adjacent *i, *j;
5858         ASSERT_RTNL();
5859
5860         changeupper_info.upper_dev = upper_dev;
5861         changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
5862         changeupper_info.linking = false;
5863
5864         call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5865                                       &changeupper_info.info);
5866
5867         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5868
5869         /* Here is the tricky part. We must remove all dev's lower
5870          * devices from all upper_dev's upper devices and vice
5871          * versa, to maintain the graph relationship.
5872          */
5873         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5874                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5875                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5876
5877         /* remove also the devices itself from lower/upper device
5878          * list
5879          */
5880         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5881                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5882
5883         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5884                 __netdev_adjacent_dev_unlink(dev, i->dev);
5885
5886         call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5887                                       &changeupper_info.info);
5888 }
5889 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5890
5891 /**
5892  * netdev_bonding_info_change - Dispatch event about slave change
5893  * @dev: device
5894  * @bonding_info: info to dispatch
5895  *
5896  * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5897  * The caller must hold the RTNL lock.
5898  */
5899 void netdev_bonding_info_change(struct net_device *dev,
5900                                 struct netdev_bonding_info *bonding_info)
5901 {
5902         struct netdev_notifier_bonding_info     info;
5903
5904         memcpy(&info.bonding_info, bonding_info,
5905                sizeof(struct netdev_bonding_info));
5906         call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5907                                       &info.info);
5908 }
5909 EXPORT_SYMBOL(netdev_bonding_info_change);
5910
5911 static void netdev_adjacent_add_links(struct net_device *dev)
5912 {
5913         struct netdev_adjacent *iter;
5914
5915         struct net *net = dev_net(dev);
5916
5917         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5918                 if (!net_eq(net,dev_net(iter->dev)))
5919                         continue;
5920                 netdev_adjacent_sysfs_add(iter->dev, dev,
5921                                           &iter->dev->adj_list.lower);
5922                 netdev_adjacent_sysfs_add(dev, iter->dev,
5923                                           &dev->adj_list.upper);
5924         }
5925
5926         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5927                 if (!net_eq(net,dev_net(iter->dev)))
5928                         continue;
5929                 netdev_adjacent_sysfs_add(iter->dev, dev,
5930                                           &iter->dev->adj_list.upper);
5931                 netdev_adjacent_sysfs_add(dev, iter->dev,
5932                                           &dev->adj_list.lower);
5933         }
5934 }
5935
5936 static void netdev_adjacent_del_links(struct net_device *dev)
5937 {
5938         struct netdev_adjacent *iter;
5939
5940         struct net *net = dev_net(dev);
5941
5942         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5943                 if (!net_eq(net,dev_net(iter->dev)))
5944                         continue;
5945                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5946                                           &iter->dev->adj_list.lower);
5947                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5948                                           &dev->adj_list.upper);
5949         }
5950
5951         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5952                 if (!net_eq(net,dev_net(iter->dev)))
5953                         continue;
5954                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5955                                           &iter->dev->adj_list.upper);
5956                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5957                                           &dev->adj_list.lower);
5958         }
5959 }
5960
5961 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5962 {
5963         struct netdev_adjacent *iter;
5964
5965         struct net *net = dev_net(dev);
5966
5967         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5968                 if (!net_eq(net,dev_net(iter->dev)))
5969                         continue;
5970                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5971                                           &iter->dev->adj_list.lower);
5972                 netdev_adjacent_sysfs_add(iter->dev, dev,
5973                                           &iter->dev->adj_list.lower);
5974         }
5975
5976         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5977                 if (!net_eq(net,dev_net(iter->dev)))
5978                         continue;
5979                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5980                                           &iter->dev->adj_list.upper);
5981                 netdev_adjacent_sysfs_add(iter->dev, dev,
5982                                           &iter->dev->adj_list.upper);
5983         }
5984 }
5985
5986 void *netdev_lower_dev_get_private(struct net_device *dev,
5987                                    struct net_device *lower_dev)
5988 {
5989         struct netdev_adjacent *lower;
5990
5991         if (!lower_dev)
5992                 return NULL;
5993         lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
5994         if (!lower)
5995                 return NULL;
5996
5997         return lower->private;
5998 }
5999 EXPORT_SYMBOL(netdev_lower_dev_get_private);
6000
6001
6002 int dev_get_nest_level(struct net_device *dev,
6003                        bool (*type_check)(const struct net_device *dev))
6004 {
6005         struct net_device *lower = NULL;
6006         struct list_head *iter;
6007         int max_nest = -1;
6008         int nest;
6009
6010         ASSERT_RTNL();
6011
6012         netdev_for_each_lower_dev(dev, lower, iter) {
6013                 nest = dev_get_nest_level(lower, type_check);
6014                 if (max_nest < nest)
6015                         max_nest = nest;
6016         }
6017
6018         if (type_check(dev))
6019                 max_nest++;
6020
6021         return max_nest;
6022 }
6023 EXPORT_SYMBOL(dev_get_nest_level);
6024
6025 /**
6026  * netdev_lower_change - Dispatch event about lower device state change
6027  * @lower_dev: device
6028  * @lower_state_info: state to dispatch
6029  *
6030  * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
6031  * The caller must hold the RTNL lock.
6032  */
6033 void netdev_lower_state_changed(struct net_device *lower_dev,
6034                                 void *lower_state_info)
6035 {
6036         struct netdev_notifier_changelowerstate_info changelowerstate_info;
6037
6038         ASSERT_RTNL();
6039         changelowerstate_info.lower_state_info = lower_state_info;
6040         call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
6041                                       &changelowerstate_info.info);
6042 }
6043 EXPORT_SYMBOL(netdev_lower_state_changed);
6044
6045 static void dev_change_rx_flags(struct net_device *dev, int flags)
6046 {
6047         const struct net_device_ops *ops = dev->netdev_ops;
6048
6049         if (ops->ndo_change_rx_flags)
6050                 ops->ndo_change_rx_flags(dev, flags);
6051 }
6052
6053 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
6054 {
6055         unsigned int old_flags = dev->flags;
6056         kuid_t uid;
6057         kgid_t gid;
6058
6059         ASSERT_RTNL();
6060
6061         dev->flags |= IFF_PROMISC;
6062         dev->promiscuity += inc;
6063         if (dev->promiscuity == 0) {
6064                 /*
6065                  * Avoid overflow.
6066                  * If inc causes overflow, untouch promisc and return error.
6067                  */
6068                 if (inc < 0)
6069                         dev->flags &= ~IFF_PROMISC;
6070                 else {
6071                         dev->promiscuity -= inc;
6072                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
6073                                 dev->name);
6074                         return -EOVERFLOW;
6075                 }
6076         }
6077         if (dev->flags != old_flags) {
6078                 pr_info("device %s %s promiscuous mode\n",
6079                         dev->name,
6080                         dev->flags & IFF_PROMISC ? "entered" : "left");
6081                 if (audit_enabled) {
6082                         current_uid_gid(&uid, &gid);
6083                         audit_log(current->audit_context, GFP_ATOMIC,
6084                                 AUDIT_ANOM_PROMISCUOUS,
6085                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
6086                                 dev->name, (dev->flags & IFF_PROMISC),
6087                                 (old_flags & IFF_PROMISC),
6088                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
6089                                 from_kuid(&init_user_ns, uid),
6090                                 from_kgid(&init_user_ns, gid),
6091                                 audit_get_sessionid(current));
6092                 }
6093
6094                 dev_change_rx_flags(dev, IFF_PROMISC);
6095         }
6096         if (notify)
6097                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
6098         return 0;
6099 }
6100
6101 /**
6102  *      dev_set_promiscuity     - update promiscuity count on a device
6103  *      @dev: device
6104  *      @inc: modifier
6105  *
6106  *      Add or remove promiscuity from a device. While the count in the device
6107  *      remains above zero the interface remains promiscuous. Once it hits zero
6108  *      the device reverts back to normal filtering operation. A negative inc
6109  *      value is used to drop promiscuity on the device.
6110  *      Return 0 if successful or a negative errno code on error.
6111  */
6112 int dev_set_promiscuity(struct net_device *dev, int inc)
6113 {
6114         unsigned int old_flags = dev->flags;
6115         int err;
6116
6117         err = __dev_set_promiscuity(dev, inc, true);
6118         if (err < 0)
6119                 return err;
6120         if (dev->flags != old_flags)
6121                 dev_set_rx_mode(dev);
6122         return err;
6123 }
6124 EXPORT_SYMBOL(dev_set_promiscuity);
6125
6126 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
6127 {
6128         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
6129
6130         ASSERT_RTNL();
6131
6132         dev->flags |= IFF_ALLMULTI;
6133         dev->allmulti += inc;
6134         if (dev->allmulti == 0) {
6135                 /*
6136                  * Avoid overflow.
6137                  * If inc causes overflow, untouch allmulti and return error.
6138                  */
6139                 if (inc < 0)
6140                         dev->flags &= ~IFF_ALLMULTI;
6141                 else {
6142                         dev->allmulti -= inc;
6143                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
6144                                 dev->name);
6145                         return -EOVERFLOW;
6146                 }
6147         }
6148         if (dev->flags ^ old_flags) {
6149                 dev_change_rx_flags(dev, IFF_ALLMULTI);
6150                 dev_set_rx_mode(dev);
6151                 if (notify)
6152                         __dev_notify_flags(dev, old_flags,
6153                                            dev->gflags ^ old_gflags);
6154         }
6155         return 0;
6156 }
6157
/**
 *      dev_set_allmulti        - update allmulti count on a device
 *      @dev: device
 *      @inc: modifier
 *
 *      Add or remove reception of all multicast frames to a device. While the
 *      count in the device remains above zero the interface keeps
 *      IFF_ALLMULTI set. Once it hits zero the device reverts back to normal
 *      filtering operation. A negative @inc value is used to drop the counter
 *      when releasing a resource needing all multicasts.
 *
 *      This is __dev_set_allmulti() with notification enabled.
 *      Return 0 if successful or a negative errno code on error.
 */

int dev_set_allmulti(struct net_device *dev, int inc)
{
        return __dev_set_allmulti(dev, inc, true);
}
EXPORT_SYMBOL(dev_set_allmulti);
6176
6177 /*
6178  *      Upload unicast and multicast address lists to device and
6179  *      configure RX filtering. When the device doesn't support unicast
6180  *      filtering it is put in promiscuous mode while unicast addresses
6181  *      are present.
6182  */
6183 void __dev_set_rx_mode(struct net_device *dev)
6184 {
6185         const struct net_device_ops *ops = dev->netdev_ops;
6186
6187         /* dev_open will call this function so the list will stay sane. */
6188         if (!(dev->flags&IFF_UP))
6189                 return;
6190
6191         if (!netif_device_present(dev))
6192                 return;
6193
6194         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
6195                 /* Unicast addresses changes may only happen under the rtnl,
6196                  * therefore calling __dev_set_promiscuity here is safe.
6197                  */
6198                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
6199                         __dev_set_promiscuity(dev, 1, false);
6200                         dev->uc_promisc = true;
6201                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
6202                         __dev_set_promiscuity(dev, -1, false);
6203                         dev->uc_promisc = false;
6204                 }
6205         }
6206
6207         if (ops->ndo_set_rx_mode)
6208                 ops->ndo_set_rx_mode(dev);
6209 }
6210
/*
 * Locked variant of __dev_set_rx_mode(): the call is bracketed by
 * netif_addr_lock_bh() / netif_addr_unlock_bh() on @dev.
 */
void dev_set_rx_mode(struct net_device *dev)
{
        netif_addr_lock_bh(dev);
        __dev_set_rx_mode(dev);
        netif_addr_unlock_bh(dev);
}
6217
6218 /**
6219  *      dev_get_flags - get flags reported to userspace
6220  *      @dev: device
6221  *
6222  *      Get the combination of flag bits exported through APIs to userspace.
6223  */
6224 unsigned int dev_get_flags(const struct net_device *dev)
6225 {
6226         unsigned int flags;
6227
6228         flags = (dev->flags & ~(IFF_PROMISC |
6229                                 IFF_ALLMULTI |
6230                                 IFF_RUNNING |
6231                                 IFF_LOWER_UP |
6232                                 IFF_DORMANT)) |
6233                 (dev->gflags & (IFF_PROMISC |
6234                                 IFF_ALLMULTI));
6235
6236         if (netif_running(dev)) {
6237                 if (netif_oper_up(dev))
6238                         flags |= IFF_RUNNING;
6239                 if (netif_carrier_ok(dev))
6240                         flags |= IFF_LOWER_UP;
6241                 if (netif_dormant(dev))
6242                         flags |= IFF_DORMANT;
6243         }
6244
6245         return flags;
6246 }
6247 EXPORT_SYMBOL(dev_get_flags);
6248
6249 int __dev_change_flags(struct net_device *dev, unsigned int flags)
6250 {
6251         unsigned int old_flags = dev->flags;
6252         int ret;
6253
6254         ASSERT_RTNL();
6255
6256         /*
6257          *      Set the flags on our device.
6258          */
6259
6260         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6261                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6262                                IFF_AUTOMEDIA)) |
6263                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6264                                     IFF_ALLMULTI));
6265
6266         /*
6267          *      Load in the correct multicast list now the flags have changed.
6268          */
6269
6270         if ((old_flags ^ flags) & IFF_MULTICAST)
6271                 dev_change_rx_flags(dev, IFF_MULTICAST);
6272
6273         dev_set_rx_mode(dev);
6274
6275         /*
6276          *      Have we downed the interface. We handle IFF_UP ourselves
6277          *      according to user attempts to set it, rather than blindly
6278          *      setting it.
6279          */
6280
6281         ret = 0;
6282         if ((old_flags ^ flags) & IFF_UP)
6283                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
6284
6285         if ((flags ^ dev->gflags) & IFF_PROMISC) {
6286                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
6287                 unsigned int old_flags = dev->flags;
6288
6289                 dev->gflags ^= IFF_PROMISC;
6290
6291                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
6292                         if (dev->flags != old_flags)
6293                                 dev_set_rx_mode(dev);
6294         }
6295
6296         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6297            is important. Some (broken) drivers set IFF_PROMISC, when
6298            IFF_ALLMULTI is requested not asking us and not reporting.
6299          */
6300         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6301                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6302
6303                 dev->gflags ^= IFF_ALLMULTI;
6304                 __dev_set_allmulti(dev, inc, false);
6305         }
6306
6307         return ret;
6308 }
6309
6310 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6311                         unsigned int gchanges)
6312 {
6313         unsigned int changes = dev->flags ^ old_flags;
6314
6315         if (gchanges)
6316                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
6317
6318         if (changes & IFF_UP) {
6319                 if (dev->flags & IFF_UP)
6320                         call_netdevice_notifiers(NETDEV_UP, dev);
6321                 else
6322                         call_netdevice_notifiers(NETDEV_DOWN, dev);
6323         }
6324
6325         if (dev->flags & IFF_UP &&
6326             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
6327                 struct netdev_notifier_change_info change_info;
6328
6329                 change_info.flags_changed = changes;
6330                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
6331                                               &change_info.info);
6332         }
6333 }
6334
6335 /**
6336  *      dev_change_flags - change device settings
6337  *      @dev: device
6338  *      @flags: device state flags
6339  *
6340  *      Change settings on device based state flags. The flags are
6341  *      in the userspace exported format.
6342  */
6343 int dev_change_flags(struct net_device *dev, unsigned int flags)
6344 {
6345         int ret;
6346         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
6347
6348         ret = __dev_change_flags(dev, flags);
6349         if (ret < 0)
6350                 return ret;
6351
6352         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
6353         __dev_notify_flags(dev, old_flags, changes);
6354         return ret;
6355 }
6356 EXPORT_SYMBOL(dev_change_flags);
6357
6358 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
6359 {
6360         const struct net_device_ops *ops = dev->netdev_ops;
6361
6362         if (ops->ndo_change_mtu)
6363                 return ops->ndo_change_mtu(dev, new_mtu);
6364
6365         dev->mtu = new_mtu;
6366         return 0;
6367 }
6368
6369 /**
6370  *      dev_set_mtu - Change maximum transfer unit
6371  *      @dev: device
6372  *      @new_mtu: new transfer unit
6373  *
6374  *      Change the maximum transfer size of the network device.
6375  */
6376 int dev_set_mtu(struct net_device *dev, int new_mtu)
6377 {
6378         int err, orig_mtu;
6379
6380         if (new_mtu == dev->mtu)
6381                 return 0;
6382
6383         /*      MTU must be positive.    */
6384         if (new_mtu < 0)
6385                 return -EINVAL;
6386
6387         if (!netif_device_present(dev))
6388                 return -ENODEV;
6389
6390         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6391         err = notifier_to_errno(err);
6392         if (err)
6393                 return err;
6394
6395         orig_mtu = dev->mtu;
6396         err = __dev_set_mtu(dev, new_mtu);
6397
6398         if (!err) {
6399                 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6400                 err = notifier_to_errno(err);
6401                 if (err) {
6402                         /* setting mtu back and notifying everyone again,
6403                          * so that they have a chance to revert changes.
6404                          */
6405                         __dev_set_mtu(dev, orig_mtu);
6406                         call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6407                 }
6408         }
6409         return err;
6410 }
6411 EXPORT_SYMBOL(dev_set_mtu);
6412
/**
 *      dev_set_group - Change group this device belongs to
 *      @dev: device
 *      @new_group: group this device should belong to
 *
 *      Stores @new_group in dev->group. No notification is sent from
 *      this function.
 */
void dev_set_group(struct net_device *dev, int new_group)
{
        dev->group = new_group;
}
EXPORT_SYMBOL(dev_set_group);
6423
6424 /**
6425  *      dev_set_mac_address - Change Media Access Control Address
6426  *      @dev: device
6427  *      @sa: new address
6428  *
6429  *      Change the hardware (MAC) address of the device
6430  */
6431 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6432 {
6433         const struct net_device_ops *ops = dev->netdev_ops;
6434         int err;
6435
6436         if (!ops->ndo_set_mac_address)
6437                 return -EOPNOTSUPP;
6438         if (sa->sa_family != dev->type)
6439                 return -EINVAL;
6440         if (!netif_device_present(dev))
6441                 return -ENODEV;
6442         err = ops->ndo_set_mac_address(dev, sa);
6443         if (err)
6444                 return err;
6445         dev->addr_assign_type = NET_ADDR_SET;
6446         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6447         add_device_randomness(dev->dev_addr, dev->addr_len);
6448         return 0;
6449 }
6450 EXPORT_SYMBOL(dev_set_mac_address);
6451
6452 /**
6453  *      dev_change_carrier - Change device carrier
6454  *      @dev: device
6455  *      @new_carrier: new value
6456  *
6457  *      Change device carrier
6458  */
6459 int dev_change_carrier(struct net_device *dev, bool new_carrier)
6460 {
6461         const struct net_device_ops *ops = dev->netdev_ops;
6462
6463         if (!ops->ndo_change_carrier)
6464                 return -EOPNOTSUPP;
6465         if (!netif_device_present(dev))
6466                 return -ENODEV;
6467         return ops->ndo_change_carrier(dev, new_carrier);
6468 }
6469 EXPORT_SYMBOL(dev_change_carrier);
6470
6471 /**
6472  *      dev_get_phys_port_id - Get device physical port ID
6473  *      @dev: device
6474  *      @ppid: port ID
6475  *
6476  *      Get device physical port ID
6477  */
6478 int dev_get_phys_port_id(struct net_device *dev,
6479                          struct netdev_phys_item_id *ppid)
6480 {
6481         const struct net_device_ops *ops = dev->netdev_ops;
6482
6483         if (!ops->ndo_get_phys_port_id)
6484                 return -EOPNOTSUPP;
6485         return ops->ndo_get_phys_port_id(dev, ppid);
6486 }
6487 EXPORT_SYMBOL(dev_get_phys_port_id);
6488
6489 /**
6490  *      dev_get_phys_port_name - Get device physical port name
6491  *      @dev: device
6492  *      @name: port name
6493  *      @len: limit of bytes to copy to name
6494  *
6495  *      Get device physical port name
6496  */
6497 int dev_get_phys_port_name(struct net_device *dev,
6498                            char *name, size_t len)
6499 {
6500         const struct net_device_ops *ops = dev->netdev_ops;
6501
6502         if (!ops->ndo_get_phys_port_name)
6503                 return -EOPNOTSUPP;
6504         return ops->ndo_get_phys_port_name(dev, name, len);
6505 }
6506 EXPORT_SYMBOL(dev_get_phys_port_name);
6507
6508 /**
6509  *      dev_change_proto_down - update protocol port state information
6510  *      @dev: device
6511  *      @proto_down: new value
6512  *
6513  *      This info can be used by switch drivers to set the phys state of the
6514  *      port.
6515  */
6516 int dev_change_proto_down(struct net_device *dev, bool proto_down)
6517 {
6518         const struct net_device_ops *ops = dev->netdev_ops;
6519
6520         if (!ops->ndo_change_proto_down)
6521                 return -EOPNOTSUPP;
6522         if (!netif_device_present(dev))
6523                 return -ENODEV;
6524         return ops->ndo_change_proto_down(dev, proto_down);
6525 }
6526 EXPORT_SYMBOL(dev_change_proto_down);
6527
6528 /**
6529  *      dev_new_index   -       allocate an ifindex
6530  *      @net: the applicable net namespace
6531  *
6532  *      Returns a suitable unique value for a new device interface
6533  *      number.  The caller must hold the rtnl semaphore or the
6534  *      dev_base_lock to be sure it remains unique.
6535  */
6536 static int dev_new_index(struct net *net)
6537 {
6538         int ifindex = net->ifindex;
6539         for (;;) {
6540                 if (++ifindex <= 0)
6541                         ifindex = 1;
6542                 if (!__dev_get_by_index(net, ifindex))
6543                         return net->ifindex = ifindex;
6544         }
6545 }
6546
/* Delayed registration/unregistration */
/* Devices are appended to net_todo_list by net_set_todo(). */
static LIST_HEAD(net_todo_list);
/* Wait queue head; its waiters and wakers are outside this section. */
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
6550
/*
 * Queue @dev at the tail of net_todo_list and bump the dev_unreg_count of
 * the network namespace the device belongs to.
 */
static void net_set_todo(struct net_device *dev)
{
        list_add_tail(&dev->todo_list, &net_todo_list);
        dev_net(dev)->dev_unreg_count++;
}
6556
/*
 * Tear down every device linked on @head through its unreg_list member.
 *
 * The work is done in passes over the list:
 *   1. validate: drop never-registered devices from the list, mark the
 *      rest as being dismantled;
 *   2. close all of them via dev_close_many();
 *   3. unlink each device from the device chain, move it to
 *      NETREG_UNREGISTERING and flush the backlog on every CPU;
 *   4. after synchronize_net(): shut down queueing, raise
 *      NETDEV_UNREGISTER, flush address lists, call ndo_uninit(), send
 *      RTM_DELLINK where applicable and remove the kobject;
 *   5. after a second synchronize_net(): dev_put() each device.
 *
 * Must not run during the boot phase; the caller must hold the RTNL lock.
 */
static void rollback_registered_many(struct list_head *head)
{
        struct net_device *dev, *tmp;
        LIST_HEAD(close_head);

        BUG_ON(dev_boot_phase);
        ASSERT_RTNL();

        /* _safe iteration: entries may be deleted from @head below. */
        list_for_each_entry_safe(dev, tmp, head, unreg_list) {
                /* Some devices call without registering
                 * for initialization unwind. Remove those
                 * devices and proceed with the remaining.
                 */
                if (dev->reg_state == NETREG_UNINITIALIZED) {
                        pr_debug("unregister_netdevice: device %s/%p never was registered\n",
                                 dev->name, dev);

                        WARN_ON(1);
                        list_del(&dev->unreg_list);
                        continue;
                }
                dev->dismantle = true;
                /* Anything still on the list must be fully registered. */
                BUG_ON(dev->reg_state != NETREG_REGISTERED);
        }

        /* If device is running, close it first. */
        list_for_each_entry(dev, head, unreg_list)
                list_add_tail(&dev->close_list, &close_head);
        dev_close_many(&close_head, true);

        list_for_each_entry(dev, head, unreg_list) {
                /* And unlink it from device chain. */
                unlist_netdevice(dev);

                dev->reg_state = NETREG_UNREGISTERING;
                /* Run flush_backlog for this device on each CPU and wait. */
                on_each_cpu(flush_backlog, dev, 1);
        }

        synchronize_net();

        list_for_each_entry(dev, head, unreg_list) {
                struct sk_buff *skb = NULL;

                /* Shutdown queueing discipline. */
                dev_shutdown(dev);


                /* Notify protocols, that we are about to destroy
                   this device. They should clean all the things.
                */
                call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

                /*
                 * Build the RTM_DELLINK message now; it is only sent
                 * after ndo_uninit() further down.
                 */
                if (!dev->rtnl_link_ops ||
                    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
                        skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
                                                     GFP_KERNEL);

                /*
                 *      Flush the unicast and multicast chains
                 */
                dev_uc_flush(dev);
                dev_mc_flush(dev);

                if (dev->netdev_ops->ndo_uninit)
                        dev->netdev_ops->ndo_uninit(dev);

                if (skb)
                        rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);

                /* Notifier chain MUST detach us all upper devices. */
                WARN_ON(netdev_has_any_upper_dev(dev));

                /* Remove entries from kobject tree */
                netdev_unregister_kobject(dev);
#ifdef CONFIG_XPS
                /* Remove XPS queueing entries */
                netif_reset_xps_queues_gt(dev, 0);
#endif
        }

        synchronize_net();

        /* Drop one reference on each device. */
        list_for_each_entry(dev, head, unreg_list)
                dev_put(dev);
}
6642
/*
 * Single-device front end for rollback_registered_many(): @dev is placed
 * on a one-entry list that lives on this function's stack.
 */
static void rollback_registered(struct net_device *dev)
{
        LIST_HEAD(single);

        list_add(&dev->unreg_list, &single);
        rollback_registered_many(&single);
        /* Detach the on-stack list head before it goes out of scope. */
        list_del(&single);
}
6651
6652 static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6653         struct net_device *upper, netdev_features_t features)
6654 {
6655         netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6656         netdev_features_t feature;
6657         int feature_bit;
6658
6659         for_each_netdev_feature(&upper_disables, feature_bit) {
6660                 feature = __NETIF_F_BIT(feature_bit);
6661                 if (!(upper->wanted_features & feature)
6662                     && (features & feature)) {
6663                         netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6664                                    &feature, upper->name);
6665                         features &= ~feature;
6666                 }
6667         }
6668
6669         return features;
6670 }
6671
6672 static void netdev_sync_lower_features(struct net_device *upper,
6673         struct net_device *lower, netdev_features_t features)
6674 {
6675         netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6676         netdev_features_t feature;
6677         int feature_bit;
6678
6679         for_each_netdev_feature(&upper_disables, feature_bit) {
6680                 feature = __NETIF_F_BIT(feature_bit);
6681                 if (!(features & feature) && (lower->features & feature)) {
6682                         netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6683                                    &feature, lower->name);
6684                         lower->wanted_features &= ~feature;
6685                         netdev_update_features(lower);
6686
6687                         if (unlikely(lower->features & feature))
6688                                 netdev_WARN(upper, "failed to disable %pNF on %s!\n",
6689                                             &feature, lower->name);
6690                 }
6691         }
6692 }
6693
6694 static netdev_features_t netdev_fix_features(struct net_device *dev,
6695         netdev_features_t features)
6696 {
6697         /* Fix illegal checksum combinations */
6698         if ((features & NETIF_F_HW_CSUM) &&
6699             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6700                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6701                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6702         }
6703
6704         /* TSO requires that SG is present as well. */
6705         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6706                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6707                 features &= ~NETIF_F_ALL_TSO;
6708         }
6709
6710         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6711                                         !(features & NETIF_F_IP_CSUM)) {
6712                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6713                 features &= ~NETIF_F_TSO;
6714                 features &= ~NETIF_F_TSO_ECN;
6715         }
6716
6717         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6718                                          !(features & NETIF_F_IPV6_CSUM)) {
6719                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6720                 features &= ~NETIF_F_TSO6;
6721         }
6722
6723         /* TSO ECN requires that TSO is present as well. */
6724         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6725                 features &= ~NETIF_F_TSO_ECN;
6726
6727         /* Software GSO depends on SG. */
6728         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6729                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6730                 features &= ~NETIF_F_GSO;
6731         }
6732
6733         /* UFO needs SG and checksumming */
6734         if (features & NETIF_F_UFO) {
6735                 /* maybe split UFO into V4 and V6? */
6736                 if (!(features & NETIF_F_HW_CSUM) &&
6737                     ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
6738                      (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
6739                         netdev_dbg(dev,
6740                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
6741                         features &= ~NETIF_F_UFO;
6742                 }
6743
6744                 if (!(features & NETIF_F_SG)) {
6745                         netdev_dbg(dev,
6746                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6747                         features &= ~NETIF_F_UFO;
6748                 }
6749         }
6750
6751         /* GSO partial features require GSO partial be set */
6752         if ((features & dev->gso_partial_features) &&
6753             !(features & NETIF_F_GSO_PARTIAL)) {
6754                 netdev_dbg(dev,
6755                            "Dropping partially supported GSO features since no GSO partial.\n");
6756                 features &= ~dev->gso_partial_features;
6757         }
6758
6759 #ifdef CONFIG_NET_RX_BUSY_POLL
6760         if (dev->netdev_ops->ndo_busy_poll)
6761                 features |= NETIF_F_BUSY_POLL;
6762         else
6763 #endif
6764                 features &= ~NETIF_F_BUSY_POLL;
6765
6766         return features;
6767 }
6768
/*
 * Recompute the feature set of @dev and apply it.
 *
 * The wanted features are passed through the driver's ndo_fix_features()
 * hook (if any), through netdev_fix_features(), and are then restricted
 * by every upper device.  If the result differs from dev->features it is
 * handed to ndo_set_features() (if any).  Lower devices are synchronised
 * in either case.
 *
 * Returns 0 when the computed set equals dev->features, -1 when
 * ndo_set_features() failed, and 1 otherwise.  Callers in this file treat
 * any non-zero value as "send a features-changed notification".
 *
 * The caller must hold the RTNL lock.
 */
int __netdev_update_features(struct net_device *dev)
{
        struct net_device *upper, *lower;
        netdev_features_t features;
        struct list_head *iter;
        /* Stays -1 on the "nothing changed" path; see the return below. */
        int err = -1;

        ASSERT_RTNL();

        features = netdev_get_wanted_features(dev);

        if (dev->netdev_ops->ndo_fix_features)
                features = dev->netdev_ops->ndo_fix_features(dev, features);

        /* driver might be less strict about feature dependencies */
        features = netdev_fix_features(dev, features);

        /* some features can't be enabled if they're off on an upper device */
        netdev_for_each_upper_dev_rcu(dev, upper, iter)
                features = netdev_sync_upper_features(dev, upper, features);

        if (dev->features == features)
                goto sync_lower;

        netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
                &dev->features, &features);

        if (dev->netdev_ops->ndo_set_features)
                err = dev->netdev_ops->ndo_set_features(dev, features);
        else
                err = 0;

        if (unlikely(err < 0)) {
                netdev_err(dev,
                        "set_features() failed (%d); wanted %pNF, left %pNF\n",
                        err, &features, &dev->features);
                /* return non-0 since some features might have changed and
                 * it's better to fire a spurious notification than miss it
                 */
                return -1;
        }

sync_lower:
        /* some features must be disabled on lower devices when disabled
         * on an upper device (think: bonding master or bridge)
         */
        netdev_for_each_lower_dev(dev, lower, iter)
                netdev_sync_lower_features(dev, lower, features);

        /*
         * Only a zero result stores the new set here; for a positive
         * result dev->features is left as it is (presumably the driver
         * has updated it itself -- confirm against the driver API docs).
         */
        if (!err)
                dev->features = features;

        /* err < 0 can only be the initial -1 here, i.e. nothing changed. */
        return err < 0 ? 0 : 1;
}
6823
/**
 *      netdev_update_features - recalculate device features
 *      @dev: the device to check
 *
 *      Recalculate dev->features set and send notifications if it
 *      has changed. Should be called after driver or hardware dependent
 *      conditions might have changed that influence the features.
 *
 *      The notification is sent whenever __netdev_update_features()
 *      returns a non-zero value.
 */
void netdev_update_features(struct net_device *dev)
{
        if (__netdev_update_features(dev))
                netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
6838
/**
 *      netdev_change_features - recalculate device features
 *      @dev: the device to check
 *
 *      Recalculate dev->features set and send notifications even
 *      if they have not changed. Should be called instead of
 *      netdev_update_features() if also dev->vlan_features might
 *      have changed to allow the changes to be propagated to stacked
 *      VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
        /* Return value deliberately ignored: the notification is unconditional. */
        __netdev_update_features(dev);
        netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
6855
6856 /**
6857  *      netif_stacked_transfer_operstate -      transfer operstate
6858  *      @rootdev: the root or lower level device to transfer state from
6859  *      @dev: the device to transfer operstate to
6860  *
6861  *      Transfer operational state from root to device. This is normally
6862  *      called when a stacking relationship exists between the root
6863  *      device and the device(a leaf device).
6864  */
6865 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6866                                         struct net_device *dev)
6867 {
6868         if (rootdev->operstate == IF_OPER_DORMANT)
6869                 netif_dormant_on(dev);
6870         else
6871                 netif_dormant_off(dev);
6872
6873         if (netif_carrier_ok(rootdev)) {
6874                 if (!netif_carrier_ok(dev))
6875                         netif_carrier_on(dev);
6876         } else {
6877                 if (netif_carrier_ok(dev))
6878                         netif_carrier_off(dev);
6879         }
6880 }
6881 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6882
#ifdef CONFIG_SYSFS
/*
 * Allocate the zeroed array of dev->num_rx_queues RX queue structures,
 * store it in dev->_rx and point every entry back at @dev.
 * kzalloc() is tried first, with vzalloc() as the fallback.
 * Returns 0 on success or -ENOMEM when both allocators fail.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int nqueues = dev->num_rx_queues;
	struct netdev_rx_queue *queues;
	size_t bytes = nqueues * sizeof(*queues);
	unsigned int idx;

	BUG_ON(nqueues < 1);

	queues = kzalloc(bytes, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!queues)
		queues = vzalloc(bytes);
	if (!queues)
		return -ENOMEM;

	dev->_rx = queues;

	for (idx = 0; idx < nqueues; idx++)
		queues[idx].dev = dev;

	return 0;
}
#endif
6905
6906 static void netdev_init_one_queue(struct net_device *dev,
6907                                   struct netdev_queue *queue, void *_unused)
6908 {
6909         /* Initialize queue lock */
6910         spin_lock_init(&queue->_xmit_lock);
6911         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6912         queue->xmit_lock_owner = -1;
6913         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6914         queue->dev = dev;
6915 #ifdef CONFIG_BQL
6916         dql_init(&queue->dql, HZ);
6917 #endif
6918 }
6919
6920 static void netif_free_tx_queues(struct net_device *dev)
6921 {
6922         kvfree(dev->_tx);
6923 }
6924
6925 static int netif_alloc_netdev_queues(struct net_device *dev)
6926 {
6927         unsigned int count = dev->num_tx_queues;
6928         struct netdev_queue *tx;
6929         size_t sz = count * sizeof(*tx);
6930
6931         if (count < 1 || count > 0xffff)
6932                 return -EINVAL;
6933
6934         tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6935         if (!tx) {
6936                 tx = vzalloc(sz);
6937                 if (!tx)
6938                         return -ENOMEM;
6939         }
6940         dev->_tx = tx;
6941
6942         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6943         spin_lock_init(&dev->tx_global_lock);
6944
6945         return 0;
6946 }
6947
6948 void netif_tx_stop_all_queues(struct net_device *dev)
6949 {
6950         unsigned int i;
6951
6952         for (i = 0; i < dev->num_tx_queues; i++) {
6953                 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
6954                 netif_tx_stop_queue(txq);
6955         }
6956 }
6957 EXPORT_SYMBOL(netif_tx_stop_all_queues);
6958
6959 /**
6960  *      register_netdevice      - register a network device
6961  *      @dev: device to register
6962  *
6963  *      Take a completed network device structure and add it to the kernel
6964  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6965  *      chain. 0 is returned on success. A negative errno code is returned
6966  *      on a failure to set up the device, or if the name is a duplicate.
6967  *
6968  *      Callers must hold the rtnl semaphore. You may want
6969  *      register_netdev() instead of this.
6970  *
6971  *      BUGS:
6972  *      The locking appears insufficient to guarantee two parallel registers
6973  *      will not get the same name.
6974  */
6975
6976 int register_netdevice(struct net_device *dev)
6977 {
6978         int ret;
6979         struct net *net = dev_net(dev);
6980
6981         BUG_ON(dev_boot_phase);
6982         ASSERT_RTNL();
6983
6984         might_sleep();
6985
6986         /* When net_device's are persistent, this will be fatal. */
6987         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6988         BUG_ON(!net);
6989
6990         spin_lock_init(&dev->addr_list_lock);
6991         netdev_set_addr_lockdep_class(dev);
6992
6993         ret = dev_get_valid_name(net, dev, dev->name);
6994         if (ret < 0)
6995                 goto out;
6996
6997         /* Init, if this function is available */
6998         if (dev->netdev_ops->ndo_init) {
6999                 ret = dev->netdev_ops->ndo_init(dev);
7000                 if (ret) {
7001                         if (ret > 0)
7002                                 ret = -EIO;
7003                         goto out;
7004                 }
7005         }
7006
7007         if (((dev->hw_features | dev->features) &
7008              NETIF_F_HW_VLAN_CTAG_FILTER) &&
7009             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
7010              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
7011                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
7012                 ret = -EINVAL;
7013                 goto err_uninit;
7014         }
7015
7016         ret = -EBUSY;
7017         if (!dev->ifindex)
7018                 dev->ifindex = dev_new_index(net);
7019         else if (__dev_get_by_index(net, dev->ifindex))
7020                 goto err_uninit;
7021
7022         /* Transfer changeable features to wanted_features and enable
7023          * software offloads (GSO and GRO).
7024          */
7025         dev->hw_features |= NETIF_F_SOFT_FEATURES;
7026         dev->features |= NETIF_F_SOFT_FEATURES;
7027         dev->wanted_features = dev->features & dev->hw_features;
7028
7029         if (!(dev->flags & IFF_LOOPBACK))
7030                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
7031
7032         if (dev->hw_features & NETIF_F_TSO)
7033                 dev->hw_features |= NETIF_F_TSO_MANGLEID;
7034
7035         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
7036          */
7037         dev->vlan_features |= NETIF_F_HIGHDMA;
7038
7039         /* Make NETIF_F_SG inheritable to tunnel devices.
7040          */
7041         dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
7042
7043         /* Make NETIF_F_SG inheritable to MPLS.
7044          */
7045         dev->mpls_features |= NETIF_F_SG;
7046
7047         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
7048         ret = notifier_to_errno(ret);
7049         if (ret)
7050                 goto err_uninit;
7051
7052         ret = netdev_register_kobject(dev);
7053         if (ret)
7054                 goto err_uninit;
7055         dev->reg_state = NETREG_REGISTERED;
7056
7057         __netdev_update_features(dev);
7058
7059         /*
7060          *      Default initial state at registry is that the
7061          *      device is present.
7062          */
7063
7064         set_bit(__LINK_STATE_PRESENT, &dev->state);
7065
7066         linkwatch_init_dev(dev);
7067
7068         dev_init_scheduler(dev);
7069         dev_hold(dev);
7070         list_netdevice(dev);
7071         add_device_randomness(dev->dev_addr, dev->addr_len);
7072
7073         /* If the device has permanent device address, driver should
7074          * set dev_addr and also addr_assign_type should be set to
7075          * NET_ADDR_PERM (default value).
7076          */
7077         if (dev->addr_assign_type == NET_ADDR_PERM)
7078                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
7079
7080         /* Notify protocols, that a new device appeared. */
7081         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
7082         ret = notifier_to_errno(ret);
7083         if (ret) {
7084                 rollback_registered(dev);
7085                 dev->reg_state = NETREG_UNREGISTERED;
7086         }
7087         /*
7088          *      Prevent userspace races by waiting until the network
7089          *      device is fully setup before sending notifications.
7090          */
7091         if (!dev->rtnl_link_ops ||
7092             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7093                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7094
7095 out:
7096         return ret;
7097
7098 err_uninit:
7099         if (dev->netdev_ops->ndo_uninit)
7100                 dev->netdev_ops->ndo_uninit(dev);
7101         goto out;
7102 }
7103 EXPORT_SYMBOL(register_netdevice);
7104
7105 /**
7106  *      init_dummy_netdev       - init a dummy network device for NAPI
7107  *      @dev: device to init
7108  *
7109  *      This takes a network device structure and initialize the minimum
7110  *      amount of fields so it can be used to schedule NAPI polls without
7111  *      registering a full blown interface. This is to be used by drivers
7112  *      that need to tie several hardware interfaces to a single NAPI
7113  *      poll scheduler due to HW limitations.
7114  */
7115 int init_dummy_netdev(struct net_device *dev)
7116 {
7117         /* Clear everything. Note we don't initialize spinlocks
7118          * are they aren't supposed to be taken by any of the
7119          * NAPI code and this dummy netdev is supposed to be
7120          * only ever used for NAPI polls
7121          */
7122         memset(dev, 0, sizeof(struct net_device));
7123
7124         /* make sure we BUG if trying to hit standard
7125          * register/unregister code path
7126          */
7127         dev->reg_state = NETREG_DUMMY;
7128
7129         /* NAPI wants this */
7130         INIT_LIST_HEAD(&dev->napi_list);
7131
7132         /* a dummy interface is started by default */
7133         set_bit(__LINK_STATE_PRESENT, &dev->state);
7134         set_bit(__LINK_STATE_START, &dev->state);
7135
7136         /* Note : We dont allocate pcpu_refcnt for dummy devices,
7137          * because users of this 'device' dont need to change
7138          * its refcount.
7139          */
7140
7141         return 0;
7142 }
7143 EXPORT_SYMBOL_GPL(init_dummy_netdev);
7144
7145
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Locked convenience wrapper: takes the rtnl semaphore, calls
 *	register_netdevice() and drops the semaphore again. The return
 *	value is that of register_netdevice(): 0 on success, a negative
 *	errno code on a failure to set up the device or if the name is
 *	a duplicate. The device name is expanded if a format string was
 *	passed to alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int ret;

	rtnl_lock();
	ret = register_netdevice(dev);
	rtnl_unlock();

	return ret;
}
EXPORT_SYMBOL(register_netdev);
7169
7170 int netdev_refcnt_read(const struct net_device *dev)
7171 {
7172         int i, refcnt = 0;
7173
7174         for_each_possible_cpu(i)
7175                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
7176         return refcnt;
7177 }
7178 EXPORT_SYMBOL(netdev_refcnt_read);
7179
/**
 * netdev_wait_allrefs - wait until all references are gone.
 * @dev: target net_device
 *
 * This is called when unregistering network devices.
 *
 * Any protocol or device that holds a reference should register
 * for netdevice notification, and cleanup and put back the
 * reference if they receive an UNREGISTER event.
 * We can get stuck here if buggy protocols don't correctly
 * call dev_put.
 *
 * The reference count is polled every 250 ms. While it stays non-zero
 * the UNREGISTER / UNREGISTER_FINAL notifications are re-sent roughly
 * once per second, and a pr_emerg() message with the current count is
 * printed roughly every ten seconds.
 */
static void netdev_wait_allrefs(struct net_device *dev)
{
	unsigned long rebroadcast_time, warning_time;
	int refcnt;

	linkwatch_forget_dev(dev);

	rebroadcast_time = warning_time = jiffies;
	refcnt = netdev_refcnt_read(dev);

	while (refcnt != 0) {
		/* More than one second since the last (re)broadcast? */
		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
			rtnl_lock();

			/* Rebroadcast unregister notification */
			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

			/* NOTE(review): __rtnl_unlock() rather than
			 * rtnl_unlock() is presumably used so that todo
			 * processing is not re-entered from here - confirm.
			 */
			__rtnl_unlock();
			rcu_barrier();
			rtnl_lock();

			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
				     &dev->state)) {
				/* We must not have linkwatch events
				 * pending on unregister. If this
				 * happens, we simply run the queue
				 * unscheduled, resulting in a noop
				 * for this device.
				 */
				linkwatch_run_queue();
			}

			__rtnl_unlock();

			rebroadcast_time = jiffies;
		}

		/* Poll interval between reference count checks */
		msleep(250);

		refcnt = netdev_refcnt_read(dev);

		/* Complain loudly at most once every ten seconds */
		if (time_after(jiffies, warning_time + 10 * HZ)) {
			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
				 dev->name, refcnt);
			warning_time = jiffies;
		}
	}
}
7241
/* The sequence is:
 *
 *	rtnl_lock();
 *	...
 *	register_netdevice(x1);
 *	register_netdevice(x2);
 *	...
 *	unregister_netdevice(y1);
 *	unregister_netdevice(y2);
 *	...
 *	rtnl_unlock();
 *	free_netdev(y1);
 *	free_netdev(y2);
 *
 * We are invoked by rtnl_unlock().
 * This allows us to deal with problems:
 * 1) We can delete sysfs objects which invoke hotplug
 *    without deadlocking with linkwatch via keventd.
 * 2) Since we run with the RTNL semaphore not held, we can sleep
 *    safely in order to wait for the netdev refcnt to drop to zero.
 *
 * We must not return until all unregister events added during
 * the interval the lock was held have been completed.
 *
 * For every device on the snapshot of net_todo_list this sends
 * NETDEV_UNREGISTER_FINAL, moves the device to NETREG_UNREGISTERED,
 * waits for its references to drop, runs the optional destructor,
 * decrements the namespace's dev_unreg_count and finally drops the
 * kobject reference.
 */
void netdev_run_todo(void)
{
	struct list_head list;

	/* Snapshot list, allow later requests */
	list_replace_init(&net_todo_list, &list);

	/* The RTNL semaphore is released here; the rest runs unlocked
	 * except for the short sections that re-take it below.
	 */
	__rtnl_unlock();


	/* Wait for rcu callbacks to finish before next phase */
	if (!list_empty(&list))
		rcu_barrier();

	while (!list_empty(&list)) {
		struct net_device *dev
			= list_first_entry(&list, struct net_device, todo_list);
		list_del(&dev->todo_list);

		rtnl_lock();
		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
		__rtnl_unlock();

		/* Only devices in the middle of unregistering belong on
		 * the todo list; report anything else and skip it.
		 */
		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
			pr_err("network todo '%s' but state %d\n",
			       dev->name, dev->reg_state);
			dump_stack();
			continue;
		}

		dev->reg_state = NETREG_UNREGISTERED;

		/* May sleep until every reference has been dropped */
		netdev_wait_allrefs(dev);

		/* paranoia */
		BUG_ON(netdev_refcnt_read(dev));
		BUG_ON(!list_empty(&dev->ptype_all));
		BUG_ON(!list_empty(&dev->ptype_specific));
		WARN_ON(rcu_access_pointer(dev->ip_ptr));
		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
		WARN_ON(dev->dn_ptr);

		if (dev->destructor)
			dev->destructor(dev);

		/* Report a network device has been unregistered */
		rtnl_lock();
		dev_net(dev)->dev_unreg_count--;
		__rtnl_unlock();
		wake_up(&netdev_unregistering_wq);

		/* Free network device */
		kobject_put(&dev->dev.kobj);
	}
}
7321
7322 /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
7323  * all the same fields in the same order as net_device_stats, with only
7324  * the type differing, but rtnl_link_stats64 may have additional fields
7325  * at the end for newer counters.
7326  */
7327 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
7328                              const struct net_device_stats *netdev_stats)
7329 {
7330 #if BITS_PER_LONG == 64
7331         BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
7332         memcpy(stats64, netdev_stats, sizeof(*stats64));
7333         /* zero out counters that only exist in rtnl_link_stats64 */
7334         memset((char *)stats64 + sizeof(*netdev_stats), 0,
7335                sizeof(*stats64) - sizeof(*netdev_stats));
7336 #else
7337         size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
7338         const unsigned long *src = (const unsigned long *)netdev_stats;
7339         u64 *dst = (u64 *)stats64;
7340
7341         BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
7342         for (i = 0; i < n; i++)
7343                 dst[i] = src[i];
7344         /* zero out counters that only exist in rtnl_link_stats64 */
7345         memset((char *)stats64 + n * sizeof(u64), 0,
7346                sizeof(*stats64) - n * sizeof(u64));
7347 #endif
7348 }
7349 EXPORT_SYMBOL(netdev_stats_to_stats64);
7350
7351 /**
7352  *      dev_get_stats   - get network device statistics
7353  *      @dev: device to get statistics from
7354  *      @storage: place to store stats
7355  *
7356  *      Get network statistics from device. Return @storage.
7357  *      The device driver may provide its own method by setting
7358  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
7359  *      otherwise the internal statistics structure is used.
7360  */
7361 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
7362                                         struct rtnl_link_stats64 *storage)
7363 {
7364         const struct net_device_ops *ops = dev->netdev_ops;
7365
7366         if (ops->ndo_get_stats64) {
7367                 memset(storage, 0, sizeof(*storage));
7368                 ops->ndo_get_stats64(dev, storage);
7369         } else if (ops->ndo_get_stats) {
7370                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
7371         } else {
7372                 netdev_stats_to_stats64(storage, &dev->stats);
7373         }
7374         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
7375         storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
7376         storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler);
7377         return storage;
7378 }
7379 EXPORT_SYMBOL(dev_get_stats);
7380
/*
 * Return the ingress queue of @dev. With CONFIG_NET_CLS_ACT a missing
 * queue is allocated, initialised with noop_qdisc and published in
 * dev->ingress_queue; NULL is returned if that allocation fails.
 * Without CONFIG_NET_CLS_ACT the result of dev_ingress_queue() is
 * returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!ingress) {
		ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
		if (ingress) {
			netdev_init_one_queue(dev, ingress, NULL);
			RCU_INIT_POINTER(ingress->qdisc, &noop_qdisc);
			ingress->qdisc_sleeping = &noop_qdisc;
			/* Publish only once fully initialised */
			rcu_assign_pointer(dev->ingress_queue, ingress);
		}
	}
#endif
	return ingress;
}
7398
7399 static const struct ethtool_ops default_ethtool_ops;
7400
7401 void netdev_set_default_ethtool_ops(struct net_device *dev,
7402                                     const struct ethtool_ops *ops)
7403 {
7404         if (dev->ethtool_ops == &default_ethtool_ops)
7405                 dev->ethtool_ops = ops;
7406 }
7407 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
7408
7409 void netdev_freemem(struct net_device *dev)
7410 {
7411         char *addr = (char *)dev - dev->padded;
7412
7413         kvfree(addr);
7414 }
7415
7416 /**
7417  *      alloc_netdev_mqs - allocate network device
7418  *      @sizeof_priv:           size of private data to allocate space for
7419  *      @name:                  device name format string
7420  *      @name_assign_type:      origin of device name
7421  *      @setup:                 callback to initialize device
7422  *      @txqs:                  the number of TX subqueues to allocate
7423  *      @rxqs:                  the number of RX subqueues to allocate
7424  *
7425  *      Allocates a struct net_device with private data area for driver use
7426  *      and performs basic initialization.  Also allocates subqueue structs
7427  *      for each queue on the device.
7428  */
7429 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7430                 unsigned char name_assign_type,
7431                 void (*setup)(struct net_device *),
7432                 unsigned int txqs, unsigned int rxqs)
7433 {
7434         struct net_device *dev;
7435         size_t alloc_size;
7436         struct net_device *p;
7437
7438         BUG_ON(strlen(name) >= sizeof(dev->name));
7439
7440         if (txqs < 1) {
7441                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
7442                 return NULL;
7443         }
7444
7445 #ifdef CONFIG_SYSFS
7446         if (rxqs < 1) {
7447                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
7448                 return NULL;
7449         }
7450 #endif
7451
7452         alloc_size = sizeof(struct net_device);
7453         if (sizeof_priv) {
7454                 /* ensure 32-byte alignment of private area */
7455                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
7456                 alloc_size += sizeof_priv;
7457         }
7458         /* ensure 32-byte alignment of whole construct */
7459         alloc_size += NETDEV_ALIGN - 1;
7460
7461         p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7462         if (!p)
7463                 p = vzalloc(alloc_size);
7464         if (!p)
7465                 return NULL;
7466
7467         dev = PTR_ALIGN(p, NETDEV_ALIGN);
7468         dev->padded = (char *)dev - (char *)p;
7469
7470         dev->pcpu_refcnt = alloc_percpu(int);
7471         if (!dev->pcpu_refcnt)
7472                 goto free_dev;
7473
7474         if (dev_addr_init(dev))
7475                 goto free_pcpu;
7476
7477         dev_mc_init(dev);
7478         dev_uc_init(dev);
7479
7480         dev_net_set(dev, &init_net);
7481
7482         dev->gso_max_size = GSO_MAX_SIZE;
7483         dev->gso_max_segs = GSO_MAX_SEGS;
7484
7485         INIT_LIST_HEAD(&dev->napi_list);
7486         INIT_LIST_HEAD(&dev->unreg_list);
7487         INIT_LIST_HEAD(&dev->close_list);
7488         INIT_LIST_HEAD(&dev->link_watch_list);
7489         INIT_LIST_HEAD(&dev->adj_list.upper);
7490         INIT_LIST_HEAD(&dev->adj_list.lower);
7491         INIT_LIST_HEAD(&dev->all_adj_list.upper);
7492         INIT_LIST_HEAD(&dev->all_adj_list.lower);
7493         INIT_LIST_HEAD(&dev->ptype_all);
7494         INIT_LIST_HEAD(&dev->ptype_specific);
7495         dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
7496         setup(dev);
7497
7498         if (!dev->tx_queue_len) {
7499                 dev->priv_flags |= IFF_NO_QUEUE;
7500                 dev->tx_queue_len = 1;
7501         }
7502
7503         dev->num_tx_queues = txqs;
7504         dev->real_num_tx_queues = txqs;
7505         if (netif_alloc_netdev_queues(dev))
7506                 goto free_all;
7507
7508 #ifdef CONFIG_SYSFS
7509         dev->num_rx_queues = rxqs;
7510         dev->real_num_rx_queues = rxqs;
7511         if (netif_alloc_rx_queues(dev))
7512                 goto free_all;
7513 #endif
7514
7515         strcpy(dev->name, name);
7516         dev->name_assign_type = name_assign_type;
7517         dev->group = INIT_NETDEV_GROUP;
7518         if (!dev->ethtool_ops)
7519                 dev->ethtool_ops = &default_ethtool_ops;
7520
7521         nf_hook_ingress_init(dev);
7522
7523         return dev;
7524
7525 free_all:
7526         free_netdev(dev);
7527         return NULL;
7528
7529 free_pcpu:
7530         free_percpu(dev->pcpu_refcnt);
7531 free_dev:
7532         netdev_freemem(dev);
7533         return NULL;
7534 }
7535 EXPORT_SYMBOL(alloc_netdev_mqs);
7536
7537 /**
7538  *      free_netdev - free network device
7539  *      @dev: device
7540  *
7541  *      This function does the last stage of destroying an allocated device
7542  *      interface. The reference to the device object is released.
7543  *      If this is the last reference then it will be freed.
7544  *      Must be called in process context.
7545  */
7546 void free_netdev(struct net_device *dev)
7547 {
7548         struct napi_struct *p, *n;
7549
7550         might_sleep();
7551         netif_free_tx_queues(dev);
7552 #ifdef CONFIG_SYSFS
7553         kvfree(dev->_rx);
7554 #endif
7555
7556         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7557
7558         /* Flush device addresses */
7559         dev_addr_flush(dev);
7560
7561         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7562                 netif_napi_del(p);
7563
7564         free_percpu(dev->pcpu_refcnt);
7565         dev->pcpu_refcnt = NULL;
7566
7567         /*  Compatibility with error handling in drivers */
7568         if (dev->reg_state == NETREG_UNINITIALIZED) {
7569                 netdev_freemem(dev);
7570                 return;
7571         }
7572
7573         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7574         dev->reg_state = NETREG_RELEASED;
7575
7576         /* will free via device release */
7577         put_device(&dev->dev);
7578 }
7579 EXPORT_SYMBOL(free_netdev);
7580
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 *	The expedited RCU grace period is used when the rtnl semaphore
 *	is locked, the normal one otherwise.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
7596
7597 /**
7598  *      unregister_netdevice_queue - remove device from the kernel
7599  *      @dev: device
7600  *      @head: list
7601  *
7602  *      This function shuts down a device interface and removes it
7603  *      from the kernel tables.
7604  *      If head not NULL, device is queued to be unregistered later.
7605  *
7606  *      Callers must hold the rtnl semaphore.  You may want
7607  *      unregister_netdev() instead of this.
7608  */
7609
7610 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7611 {
7612         ASSERT_RTNL();
7613
7614         if (head) {
7615                 list_move_tail(&dev->unreg_list, head);
7616         } else {
7617                 rollback_registered(dev);
7618                 /* Finish processing unregister after unlock */
7619                 net_set_todo(dev);
7620         }
7621 }
7622 EXPORT_SYMBOL(unregister_netdevice_queue);
7623
7624 /**
7625  *      unregister_netdevice_many - unregister many devices
7626  *      @head: list of devices
7627  *
7628  *  Note: As most callers use a stack allocated list_head,
7629  *  we force a list_del() to make sure stack wont be corrupted later.
7630  */
7631 void unregister_netdevice_many(struct list_head *head)
7632 {
7633         struct net_device *dev;
7634
7635         if (!list_empty(head)) {
7636                 rollback_registered_many(head);
7637                 list_for_each_entry(dev, head, unreg_list)
7638                         net_set_todo(dev);
7639                 list_del(head);
7640         }
7641 }
7642 EXPORT_SYMBOL(unregister_netdevice_many);
7643
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shut down a device interface and remove it from the kernel
 *	tables.
 *
 *	Locked convenience wrapper: takes the rtnl semaphore around
 *	unregister_netdevice(). In general you want to use this and
 *	not unregister_netdevice().
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();

	unregister_netdevice(dev);

	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
7662
7663 /**
7664  *      dev_change_net_namespace - move device to different network namespace
7665  *      @dev: device
7666  *      @net: network namespace
7667  *      @pat: If not NULL name pattern to try if the current device name
7668  *            is already taken in the destination network namespace.
7669  *
7670  *      This function shuts down a device interface and moves it
7671  *      to a new network namespace. On success 0 is returned; on
7672  *      failure a negative errno code is returned.
7673  *
7674  *      Callers must hold the rtnl semaphore.
7675  */
7676
7677 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7678 {
7679         int err;
7680
7681         ASSERT_RTNL();
7682
7683         /* Don't allow namespace local devices to be moved. */
7684         err = -EINVAL;
7685         if (dev->features & NETIF_F_NETNS_LOCAL)
7686                 goto out;
7687
7688         /* Ensure the device has been registrered */
7689         if (dev->reg_state != NETREG_REGISTERED)
7690                 goto out;
7691
7692         /* Get out if there is nothing todo */
7693         err = 0;
7694         if (net_eq(dev_net(dev), net))
7695                 goto out;
7696
7697         /* Pick the destination device name, and ensure
7698          * we can use it in the destination network namespace.
7699          */
7700         err = -EEXIST;
7701         if (__dev_get_by_name(net, dev->name)) {
7702                 /* We get here if we can't use the current device name */
7703                 if (!pat)
7704                         goto out;
7705                 if (dev_get_valid_name(net, dev, pat) < 0)
7706                         goto out;
7707         }
7708
7709         /*
7710          * And now a mini version of register_netdevice unregister_netdevice.
7711          */
7712
7713         /* If device is running close it first. */
7714         dev_close(dev);
7715
7716         /* And unlink it from device chain */
7717         err = -ENODEV;
7718         unlist_netdevice(dev);
7719
7720         synchronize_net();
7721
7722         /* Shutdown queueing discipline. */
7723         dev_shutdown(dev);
7724
7725         /* Notify protocols, that we are about to destroy
7726            this device. They should clean all the things.
7727
7728            Note that dev->reg_state stays at NETREG_REGISTERED.
7729            This is wanted because this way 8021q and macvlan know
7730            the device is just moving and can keep their slaves up.
7731         */
7732         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7733         rcu_barrier();
7734         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7735         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7736
7737         /*
7738          *      Flush the unicast and multicast chains
7739          */
7740         dev_uc_flush(dev);
7741         dev_mc_flush(dev);
7742
7743         /* Send a netdev-removed uevent to the old namespace */
7744         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7745         netdev_adjacent_del_links(dev);
7746
7747         /* Actually switch the network namespace */
7748         dev_net_set(dev, net);
7749
7750         /* If there is an ifindex conflict assign a new one */
7751         if (__dev_get_by_index(net, dev->ifindex))
7752                 dev->ifindex = dev_new_index(net);
7753
7754         /* Send a netdev-add uevent to the new namespace */
7755         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7756         netdev_adjacent_add_links(dev);
7757
7758         /* Fixup kobjects */
7759         err = device_rename(&dev->dev, dev->name);
7760         WARN_ON(err);
7761
7762         /* Add the device back in the hashes */
7763         list_netdevice(dev);
7764
7765         /* Notify protocols, that a new device appeared. */
7766         call_netdevice_notifiers(NETDEV_REGISTER, dev);
7767
7768         /*
7769          *      Prevent userspace races by waiting until the network
7770          *      device is fully setup before sending notifications.
7771          */
7772         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7773
7774         synchronize_net();
7775         err = 0;
7776 out:
7777         return err;
7778 }
7779 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
7780
7781 static int dev_cpu_callback(struct notifier_block *nfb,
7782                             unsigned long action,
7783                             void *ocpu)
7784 {
7785         struct sk_buff **list_skb;
7786         struct sk_buff *skb;
7787         unsigned int cpu, oldcpu = (unsigned long)ocpu;
7788         struct softnet_data *sd, *oldsd;
7789
7790         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7791                 return NOTIFY_OK;
7792
7793         local_irq_disable();
7794         cpu = smp_processor_id();
7795         sd = &per_cpu(softnet_data, cpu);
7796         oldsd = &per_cpu(softnet_data, oldcpu);
7797
7798         /* Find end of our completion_queue. */
7799         list_skb = &sd->completion_queue;
7800         while (*list_skb)
7801                 list_skb = &(*list_skb)->next;
7802         /* Append completion queue from offline CPU. */
7803         *list_skb = oldsd->completion_queue;
7804         oldsd->completion_queue = NULL;
7805
7806         /* Append output queue from offline CPU. */
7807         if (oldsd->output_queue) {
7808                 *sd->output_queue_tailp = oldsd->output_queue;
7809                 sd->output_queue_tailp = oldsd->output_queue_tailp;
7810                 oldsd->output_queue = NULL;
7811                 oldsd->output_queue_tailp = &oldsd->output_queue;
7812         }
7813         /* Append NAPI poll list from offline CPU, with one exception :
7814          * process_backlog() must be called by cpu owning percpu backlog.
7815          * We properly handle process_queue & input_pkt_queue later.
7816          */
7817         while (!list_empty(&oldsd->poll_list)) {
7818                 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7819                                                             struct napi_struct,
7820                                                             poll_list);
7821
7822                 list_del_init(&napi->poll_list);
7823                 if (napi->poll == process_backlog)
7824                         napi->state = 0;
7825                 else
7826                         ____napi_schedule(sd, napi);
7827         }
7828
7829         raise_softirq_irqoff(NET_TX_SOFTIRQ);
7830         local_irq_enable();
7831
7832         /* Process offline CPU's input_pkt_queue */
7833         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7834                 netif_rx_ni(skb);
7835                 input_queue_head_incr(oldsd);
7836         }
7837         while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
7838                 netif_rx_ni(skb);
7839                 input_queue_head_incr(oldsd);
7840         }
7841
7842         return NOTIFY_OK;
7843 }
7844
7845
7846 /**
7847  *      netdev_increment_features - increment feature set by one
7848  *      @all: current feature set
7849  *      @one: new feature set
7850  *      @mask: mask feature set
7851  *
7852  *      Computes a new feature set after adding a device with feature set
7853  *      @one to the master device with current feature set @all.  Will not
7854  *      enable anything that is off in @mask. Returns the new feature set.
7855  */
7856 netdev_features_t netdev_increment_features(netdev_features_t all,
7857         netdev_features_t one, netdev_features_t mask)
7858 {
7859         if (mask & NETIF_F_HW_CSUM)
7860                 mask |= NETIF_F_CSUM_MASK;
7861         mask |= NETIF_F_VLAN_CHALLENGED;
7862
7863         all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
7864         all &= one | ~NETIF_F_ALL_FOR_ALL;
7865
7866         /* If one device supports hw checksumming, set for all. */
7867         if (all & NETIF_F_HW_CSUM)
7868                 all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
7869
7870         return all;
7871 }
7872 EXPORT_SYMBOL(netdev_increment_features);
7873
7874 static struct hlist_head * __net_init netdev_create_hash(void)
7875 {
7876         int i;
7877         struct hlist_head *hash;
7878
7879         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7880         if (hash != NULL)
7881                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
7882                         INIT_HLIST_HEAD(&hash[i]);
7883
7884         return hash;
7885 }
7886
7887 /* Initialize per network namespace state */
7888 static int __net_init netdev_init(struct net *net)
7889 {
7890         if (net != &init_net)
7891                 INIT_LIST_HEAD(&net->dev_base_head);
7892
7893         net->dev_name_head = netdev_create_hash();
7894         if (net->dev_name_head == NULL)
7895                 goto err_name;
7896
7897         net->dev_index_head = netdev_create_hash();
7898         if (net->dev_index_head == NULL)
7899                 goto err_idx;
7900
7901         return 0;
7902
7903 err_idx:
7904         kfree(net->dev_name_head);
7905 err_name:
7906         return -ENOMEM;
7907 }
7908
7909 /**
7910  *      netdev_drivername - network driver for the device
7911  *      @dev: network device
7912  *
7913  *      Determine network driver for device.
7914  */
7915 const char *netdev_drivername(const struct net_device *dev)
7916 {
7917         const struct device_driver *driver;
7918         const struct device *parent;
7919         const char *empty = "";
7920
7921         parent = dev->dev.parent;
7922         if (!parent)
7923                 return empty;
7924
7925         driver = parent->driver;
7926         if (driver && driver->name)
7927                 return driver->name;
7928         return empty;
7929 }
7930
7931 static void __netdev_printk(const char *level, const struct net_device *dev,
7932                             struct va_format *vaf)
7933 {
7934         if (dev && dev->dev.parent) {
7935                 dev_printk_emit(level[1] - '0',
7936                                 dev->dev.parent,
7937                                 "%s %s %s%s: %pV",
7938                                 dev_driver_string(dev->dev.parent),
7939                                 dev_name(dev->dev.parent),
7940                                 netdev_name(dev), netdev_reg_state(dev),
7941                                 vaf);
7942         } else if (dev) {
7943                 printk("%s%s%s: %pV",
7944                        level, netdev_name(dev), netdev_reg_state(dev), vaf);
7945         } else {
7946                 printk("%s(NULL net_device): %pV", level, vaf);
7947         }
7948 }
7949
7950 void netdev_printk(const char *level, const struct net_device *dev,
7951                    const char *format, ...)
7952 {
7953         struct va_format vaf;
7954         va_list args;
7955
7956         va_start(args, format);
7957
7958         vaf.fmt = format;
7959         vaf.va = &args;
7960
7961         __netdev_printk(level, dev, &vaf);
7962
7963         va_end(args);
7964 }
7965 EXPORT_SYMBOL(netdev_printk);
7966
7967 #define define_netdev_printk_level(func, level)                 \
7968 void func(const struct net_device *dev, const char *fmt, ...)   \
7969 {                                                               \
7970         struct va_format vaf;                                   \
7971         va_list args;                                           \
7972                                                                 \
7973         va_start(args, fmt);                                    \
7974                                                                 \
7975         vaf.fmt = fmt;                                          \
7976         vaf.va = &args;                                         \
7977                                                                 \
7978         __netdev_printk(level, dev, &vaf);                      \
7979                                                                 \
7980         va_end(args);                                           \
7981 }                                                               \
7982 EXPORT_SYMBOL(func);
7983
7984 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7985 define_netdev_printk_level(netdev_alert, KERN_ALERT);
7986 define_netdev_printk_level(netdev_crit, KERN_CRIT);
7987 define_netdev_printk_level(netdev_err, KERN_ERR);
7988 define_netdev_printk_level(netdev_warn, KERN_WARNING);
7989 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7990 define_netdev_printk_level(netdev_info, KERN_INFO);
7991
7992 static void __net_exit netdev_exit(struct net *net)
7993 {
7994         kfree(net->dev_name_head);
7995         kfree(net->dev_index_head);
7996 }
7997
7998 static struct pernet_operations __net_initdata netdev_net_ops = {
7999         .init = netdev_init,
8000         .exit = netdev_exit,
8001 };
8002
8003 static void __net_exit default_device_exit(struct net *net)
8004 {
8005         struct net_device *dev, *aux;
8006         /*
8007          * Push all migratable network devices back to the
8008          * initial network namespace
8009          */
8010         rtnl_lock();
8011         for_each_netdev_safe(net, dev, aux) {
8012                 int err;
8013                 char fb_name[IFNAMSIZ];
8014
8015                 /* Ignore unmoveable devices (i.e. loopback) */
8016                 if (dev->features & NETIF_F_NETNS_LOCAL)
8017                         continue;
8018
8019                 /* Leave virtual devices for the generic cleanup */
8020                 if (dev->rtnl_link_ops)
8021                         continue;
8022
8023                 /* Push remaining network devices to init_net */
8024                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
8025                 err = dev_change_net_namespace(dev, &init_net, fb_name);
8026                 if (err) {
8027                         pr_emerg("%s: failed to move %s to init_net: %d\n",
8028                                  __func__, dev->name, err);
8029                         BUG();
8030                 }
8031         }
8032         rtnl_unlock();
8033 }
8034
8035 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
8036 {
8037         /* Return with the rtnl_lock held when there are no network
8038          * devices unregistering in any network namespace in net_list.
8039          */
8040         struct net *net;
8041         bool unregistering;
8042         DEFINE_WAIT_FUNC(wait, woken_wake_function);
8043
8044         add_wait_queue(&netdev_unregistering_wq, &wait);
8045         for (;;) {
8046                 unregistering = false;
8047                 rtnl_lock();
8048                 list_for_each_entry(net, net_list, exit_list) {
8049                         if (net->dev_unreg_count > 0) {
8050                                 unregistering = true;
8051                                 break;
8052                         }
8053                 }
8054                 if (!unregistering)
8055                         break;
8056                 __rtnl_unlock();
8057
8058                 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
8059         }
8060         remove_wait_queue(&netdev_unregistering_wq, &wait);
8061 }
8062
8063 static void __net_exit default_device_exit_batch(struct list_head *net_list)
8064 {
8065         /* At exit all network devices most be removed from a network
8066          * namespace.  Do this in the reverse order of registration.
8067          * Do this across as many network namespaces as possible to
8068          * improve batching efficiency.
8069          */
8070         struct net_device *dev;
8071         struct net *net;
8072         LIST_HEAD(dev_kill_list);
8073
8074         /* To prevent network device cleanup code from dereferencing
8075          * loopback devices or network devices that have been freed
8076          * wait here for all pending unregistrations to complete,
8077          * before unregistring the loopback device and allowing the
8078          * network namespace be freed.
8079          *
8080          * The netdev todo list containing all network devices
8081          * unregistrations that happen in default_device_exit_batch
8082          * will run in the rtnl_unlock() at the end of
8083          * default_device_exit_batch.
8084          */
8085         rtnl_lock_unregistering(net_list);
8086         list_for_each_entry(net, net_list, exit_list) {
8087                 for_each_netdev_reverse(net, dev) {
8088                         if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
8089                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
8090                         else
8091                                 unregister_netdevice_queue(dev, &dev_kill_list);
8092                 }
8093         }
8094         unregister_netdevice_many(&dev_kill_list);
8095         rtnl_unlock();
8096 }
8097
8098 static struct pernet_operations __net_initdata default_device_ops = {
8099         .exit = default_device_exit,
8100         .exit_batch = default_device_exit_batch,
8101 };
8102
8103 /*
8104  *      Initialize the DEV module. At boot time this walks the device list and
8105  *      unhooks any devices that fail to initialise (normally hardware not
8106  *      present) and leaves us with a valid list of present and active devices.
8107  *
8108  */
8109
8110 /*
8111  *       This is called single threaded during boot, so no need
8112  *       to take the rtnl semaphore.
8113  */
8114 static int __init net_dev_init(void)
8115 {
8116         int i, rc = -ENOMEM;
8117
8118         BUG_ON(!dev_boot_phase);
8119
8120         if (dev_proc_init())
8121                 goto out;
8122
8123         if (netdev_kobject_init())
8124                 goto out;
8125
8126         INIT_LIST_HEAD(&ptype_all);
8127         for (i = 0; i < PTYPE_HASH_SIZE; i++)
8128                 INIT_LIST_HEAD(&ptype_base[i]);
8129
8130         INIT_LIST_HEAD(&offload_base);
8131
8132         if (register_pernet_subsys(&netdev_net_ops))
8133                 goto out;
8134
8135         /*
8136          *      Initialise the packet receive queues.
8137          */
8138
8139         for_each_possible_cpu(i) {
8140                 struct softnet_data *sd = &per_cpu(softnet_data, i);
8141
8142                 skb_queue_head_init(&sd->input_pkt_queue);
8143                 skb_queue_head_init(&sd->process_queue);
8144                 INIT_LIST_HEAD(&sd->poll_list);
8145                 sd->output_queue_tailp = &sd->output_queue;
8146 #ifdef CONFIG_RPS
8147                 sd->csd.func = rps_trigger_softirq;
8148                 sd->csd.info = sd;
8149                 sd->cpu = i;
8150 #endif
8151
8152                 sd->backlog.poll = process_backlog;
8153                 sd->backlog.weight = weight_p;
8154         }
8155
8156         dev_boot_phase = 0;
8157
8158         /* The loopback device is special if any other network devices
8159          * is present in a network namespace the loopback device must
8160          * be present. Since we now dynamically allocate and free the
8161          * loopback device ensure this invariant is maintained by
8162          * keeping the loopback device as the first device on the
8163          * list of network devices.  Ensuring the loopback devices
8164          * is the first device that appears and the last network device
8165          * that disappears.
8166          */
8167         if (register_pernet_device(&loopback_net_ops))
8168                 goto out;
8169
8170         if (register_pernet_device(&default_device_ops))
8171                 goto out;
8172
8173         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
8174         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
8175
8176         hotcpu_notifier(dev_cpu_callback, 0);
8177         dst_subsys_init();
8178         rc = 0;
8179 out:
8180         return rc;
8181 }
8182
8183 subsys_initcall(net_dev_init);