/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *		Alexey Kuznetsov:	Untied from IPv4 stack.
 *		Cyrus Durgin	:	Fixed kerneld for kmod.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Ulises Alonso	:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *		Johann Baudy	:	Added TX RING.
 *		Chetan Loke	:	Implemented TPACKET_V3 block abstraction
 *					layer.
 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 */
#include <linux/types.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/cacheflush.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <linux/reciprocal_div.h>
#include <net/inet_common.h>
/*
   Assumptions:
   - if device has no dev->hard_header routine, it adds and removes ll header
     inside itself. In this case ll header is invisible outside of device,
     but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate skb when the header
     will not fit into the reserved space (tunnel), others are silly
     (PPP).
   - packet socket receives packets with pulled ll header,
     so that SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header==NULL
   mac_header -> UNKNOWN position. It is very likely that it points to the ll
		 header. PPP does this, which is wrong, because it introduces
		 asymmetry between rx and tx paths.
   data       -> data

Outgoing, dev->hard_header==NULL
   mac_header -> data. ll header is still not built!
   data       -> data

Resume
  If dev->hard_header==NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position,
   the packet classifier depends on it.
 */
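/*
 * Illustrative sketch (assumed user-space code, not part of the original
 * file): the ll-header rules above are what make SOCK_RAW and SOCK_DGRAM
 * packet sockets differ as seen from user space, e.g.:
 *
 *	// SOCK_RAW: frames arrive/depart WITH the link-level header,
 *	// so the sender must build it and the receiver sees it.
 *	int raw_fd   = socket(AF_PACKET, SOCK_RAW,   htons(ETH_P_ALL));
 *	// SOCK_DGRAM: the kernel strips/builds the ll header itself.
 *	int dgram_fd = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL));
 */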
/* Private packet socket structures. */

/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	int		mr_ifindex;
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};

union tpacket_uhdr {
	struct tpacket_hdr  *h1;
	struct tpacket2_hdr *h2;
	struct tpacket3_hdr *h3;
	void *raw;
};
static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring);

#define V3_ALIGNMENT	(8)

#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

#define BLK_PLUS_PRIV(sz_of_priv) \
	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
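/*
 * Worked example (sizes assumed for illustration): with V3_ALIGNMENT == 8
 * and a block descriptor of, say, 48 bytes, BLK_HDR_LEN is ALIGN(48, 8)
 * == 48, so a ring created with tp_sizeof_priv == 13 reserves
 * BLK_PLUS_PRIV(13) == 48 + ALIGN(13, 8) == 48 + 16 == 64 bytes at the
 * head of every block before the first packet is stored.
 */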
#define PGV_FROM_VMALLOC 1

#define BLOCK_STATUS(x)		((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)		((x)->offset_to_priv)
#define BLOCK_PRIV(x)		((void *)((char *)(x) + BLOCK_O2PRIV(x)))
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev);

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status);
static void packet_increment_head(struct packet_ring_buffer *buff);
static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
			struct tpacket_block_desc *);
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
			struct packet_sock *);
static void prb_retire_current_block(struct tpacket_kbdq_core *,
		struct packet_sock *, unsigned int status);
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
		struct tpacket_block_desc *);
static void prb_retire_rx_blk_timer_expired(unsigned long);
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
static void prb_init_blk_timer(struct packet_sock *,
		struct tpacket_kbdq_core *,
		void (*func) (unsigned long));
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);
struct packet_skb_cb {
	unsigned int origlen;
	union {
		struct sockaddr_pkt pkt;
		struct sockaddr_ll ll;
	} sa;
};

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))
#define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
#define GET_PBLOCK_DESC(x, bid)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
#define GET_NEXT_PRB_BLK_NUM(x) \
	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
	((x)->kactive_blk_num+1) : 0)
static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);

/* register_prot_hook must be invoked with the po->bind_lock held,
 * or from a context in which asynchronous accesses to the packet
 * socket are not possible (packet_create()).
 */
static void register_prot_hook(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);

	if (!po->running) {
		if (po->fanout) {
			__fanout_link(sk, po);
		} else {
			dev_add_pack(&po->prot_hook);
			rcu_assign_pointer(po->cached_dev, po->prot_hook.dev);
		}
		sock_hold(sk);
		po->running = 1;
	}
}

/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
 * held.  If the sync parameter is true, we will temporarily drop
 * the po->bind_lock and do a synchronize_net() to make sure no
 * asynchronous packet processing paths still refer to the elements
 * of po->prot_hook.  If the sync parameter is false, it is the
 * caller's responsibility to take care of this.
 */
static void __unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	po->running = 0;
	if (po->fanout) {
		__fanout_unlink(sk, po);
	} else {
		__dev_remove_pack(&po->prot_hook);
		RCU_INIT_POINTER(po->cached_dev, NULL);
	}
	__sock_put(sk);

	if (sync) {
		spin_unlock(&po->bind_lock);
		synchronize_net();
		spin_lock(&po->bind_lock);
	}
}

static void unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	if (po->running)
		__unregister_prot_hook(sk, sync);
}
static inline __pure struct page *pgv_to_page(void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	return virt_to_page(addr);
}

static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
	union tpacket_uhdr h;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		break;
	case TPACKET_V2:
		h.h2->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		break;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	smp_wmb();
}
static int __packet_get_status(struct packet_sock *po, void *frame)
{
	union tpacket_uhdr h;

	smp_rmb();

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		return h.h1->tp_status;
	case TPACKET_V2:
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		return h.h2->tp_status;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return 0;
	}
}
static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
				   unsigned int flags)
{
	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);

	if (shhwtstamps) {
		if ((flags & SOF_TIMESTAMPING_SYS_HARDWARE) &&
		    ktime_to_timespec_cond(shhwtstamps->syststamp, ts))
			return TP_STATUS_TS_SYS_HARDWARE;
		if ((flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
		    ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
			return TP_STATUS_TS_RAW_HARDWARE;
	}

	if (ktime_to_timespec_cond(skb->tstamp, ts))
		return TP_STATUS_TS_SOFTWARE;

	return 0;
}
static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
				    struct sk_buff *skb)
{
	union tpacket_uhdr h;
	struct timespec ts;
	__u32 ts_status;

	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
		return 0;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_sec = ts.tv_sec;
		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
		break;
	case TPACKET_V2:
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		break;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	/* one flush is safe, as both fields always lie on the same cacheline */
	flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
	smp_wmb();

	return ts_status;
}
static void *packet_lookup_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		unsigned int position,
		int status)
{
	unsigned int pg_vec_pos, frame_offset;
	union tpacket_uhdr h;

	pg_vec_pos = position / rb->frames_per_block;
	frame_offset = position % rb->frames_per_block;

	h.raw = rb->pg_vec[pg_vec_pos].buffer +
		(frame_offset * rb->frame_size);

	if (status != __packet_get_status(po, h.raw))
		return NULL;

	return h.raw;
}

static void *packet_current_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	return packet_lookup_frame(po, rb, rb->head, status);
}
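/*
 * Illustrative sketch (assumed user-space code, not part of this file):
 * the status compared above is the kernel/user ownership handshake for
 * V1/V2 rings.  A consumer typically walks the mmap()ed frames like so,
 * handle_frame() being a hypothetical user routine:
 *
 *	struct tpacket_hdr *hdr = ring + i * frame_size;
 *	while (!(hdr->tp_status & TP_STATUS_USER))
 *		poll(&pfd, 1, -1);		// wait for the kernel
 *	handle_frame(hdr);			// frame now owned by user
 *	hdr->tp_status = TP_STATUS_KERNEL;	// hand it back
 */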
static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	del_timer_sync(&pkc->retire_blk_timer);
}

static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
		int tx_ring,
		struct sk_buff_head *rb_queue)
{
	struct tpacket_kbdq_core *pkc;

	pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;

	spin_lock_bh(&rb_queue->lock);
	pkc->delete_blk_timer = 1;
	spin_unlock_bh(&rb_queue->lock);

	prb_del_retire_blk_timer(pkc);
}

static void prb_init_blk_timer(struct packet_sock *po,
		struct tpacket_kbdq_core *pkc,
		void (*func) (unsigned long))
{
	init_timer(&pkc->retire_blk_timer);
	pkc->retire_blk_timer.data = (long)po;
	pkc->retire_blk_timer.function = func;
	pkc->retire_blk_timer.expires = jiffies;
}

static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
{
	struct tpacket_kbdq_core *pkc;

	if (tx_ring)
		BUG();

	pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
	prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
}
static int prb_calc_retire_blk_tmo(struct packet_sock *po,
				int blk_size_in_bytes)
{
	struct net_device *dev;
	unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
	struct ethtool_cmd ecmd;
	int err;
	u32 speed;

	rtnl_lock();
	dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
	if (unlikely(!dev)) {
		rtnl_unlock();
		return DEFAULT_PRB_RETIRE_TOV;
	}
	err = __ethtool_get_settings(dev, &ecmd);
	speed = ethtool_cmd_speed(&ecmd);
	rtnl_unlock();
	if (!err) {
		/*
		 * If the link speed is so slow you don't really
		 * need to worry about perf anyways
		 */
		if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) {
			return DEFAULT_PRB_RETIRE_TOV;
		} else {
			msec = 1;
			div = speed / 1000;
		}
	}

	mbits = (blk_size_in_bytes * 8) / (1024 * 1024);

	if (div)
		mbits /= div;

	tmo = mbits * msec;

	if (div)
		return tmo+1;
	return tmo;
}
static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
			union tpacket_req_u *req_u)
{
	p1->feature_req_word = req_u->req3.tp_feature_req_word;
}

static void init_prb_bdqc(struct packet_sock *po,
			struct packet_ring_buffer *rb,
			struct pgv *pg_vec,
			union tpacket_req_u *req_u, int tx_ring)
{
	struct tpacket_kbdq_core *p1 = &rb->prb_bdqc;
	struct tpacket_block_desc *pbd;

	memset(p1, 0x0, sizeof(*p1));

	p1->knxt_seq_num = 1;
	p1->pkbdq = pg_vec;
	pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
	p1->pkblk_start	= pg_vec[0].buffer;
	p1->kblk_size = req_u->req3.tp_block_size;
	p1->knum_blocks	= req_u->req3.tp_block_nr;
	p1->hdrlen = po->tp_hdrlen;
	p1->version = po->tp_version;
	p1->last_kactive_blk_num = 0;
	po->stats.stats3.tp_freeze_q_cnt = 0;
	if (req_u->req3.tp_retire_blk_tov)
		p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
	else
		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
						req_u->req3.tp_block_size);
	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;

	prb_init_ft_ops(p1, req_u);
	prb_setup_retire_blk_timer(po, tx_ring);
	prb_open_block(p1, pbd);
}
/* Do NOT update the last_blk_num first.
 * Assumes sk_buff_head lock is held.
 */
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	mod_timer(&pkc->retire_blk_timer,
			jiffies + pkc->tov_in_jiffies);
	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
}
/*
 * Timer logic:
 * 1) We refresh the timer only when we open a block.
 *    By doing this we don't waste cycles refreshing the timer
 *    on a packet-by-packet basis.
 *
 * With a 1MB block-size, on a 1Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * So, if the user sets the 'tmo' to 10ms then the timer
 * will never fire while the block is still getting filled
 * (which is what we want). However, the user could choose
 * to close a block early and that's fine.
 *
 * But when the timer does fire, we check whether or not to refresh it.
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, let's say every '8' msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 */
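/*
 * Worked example using the numbers above: a 1MB block carries
 * 8 * 2^20 bits ~= 8.4 Mbit, so at a 1Gbit/s line rate it fills in
 * roughly 8.4 Mbit / 1000 Mbit/s ~= 8 ms; any 'tmo' above that (e.g.
 * the 10 ms mentioned) lets a busy link retire blocks on its own
 * before the timer ever fires.
 */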
static void prb_retire_rx_blk_timer_expired(unsigned long data)
{
	struct packet_sock *po = (struct packet_sock *)data;
	struct tpacket_kbdq_core *pkc = &po->rx_ring.prb_bdqc;
	unsigned int frozen;
	struct tpacket_block_desc *pbd;

	spin_lock(&po->sk.sk_receive_queue.lock);

	frozen = prb_queue_frozen(pkc);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	if (unlikely(pkc->delete_blk_timer))
		goto out;

	/* We only need to plug the race when the block is partially filled.
	 * tpacket_rcv:
	 *		lock(); increment BLOCK_NUM_PKTS; unlock()
	 *		copy_bits() is in progress ...
	 *		timer fires on other cpu:
	 *		we can't retire the current block because copy_bits
	 *		is in progress.
	 */
	if (BLOCK_NUM_PKTS(pbd)) {
		while (atomic_read(&pkc->blk_fill_in_prog)) {
			/* Waiting for skb_copy_bits to finish... */
			cpu_relax();
		}
	}

	if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
		if (!frozen) {
			prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
			if (!prb_dispatch_next_block(pkc, po))
				goto refresh_timer;
			else
				goto out;
		} else {
			/* Case 1. Queue was frozen because user-space was
			 *	   lagging behind.
			 */
			if (prb_curr_blk_in_use(pkc, pbd)) {
				/*
				 * Ok, user-space is still behind.
				 * So just refresh the timer.
				 */
				goto refresh_timer;
			} else {
				/* Case 2. Queue was frozen, user-space caught
				 * up, now the link went idle && the timer
				 * fired. We don't have a block to close. So
				 * we open this block and restart the timer.
				 * Opening a block thaws the queue and
				 * restarts the timer.
				 * Thawing/timer-refresh is a side effect.
				 */
				prb_open_block(pkc, pbd);
				goto out;
			}
		}
	}

refresh_timer:
	_prb_refresh_rx_retire_blk_timer(pkc);

out:
	spin_unlock(&po->sk.sk_receive_queue.lock);
}
static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1, __u32 status)
{
	/* Flush everything minus the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	u8 *start, *end;

	start = (u8 *)pbd1;

	/* Skip the block header (we know the header WILL fit in 4K) */
	start += PAGE_SIZE;

	end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
	for (; start < end; start += PAGE_SIZE)
		flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif

	/* Now update the block status. */

	BLOCK_STATUS(pbd1) = status;

	/* Flush the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	start = (u8 *)pbd1;
	flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif
}
/*
 * Side effect of closing a block:
 *
 * 1) The block is marked TP_STATUS_USER (handed to user-space).
 * 2) Increment active_blk_num
 *
 * Note: We DONT refresh the timer on purpose.
 *	Because almost always the next block will be opened.
 */
static void prb_close_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1,
		struct packet_sock *po, unsigned int stat)
{
	__u32 status = TP_STATUS_USER | stat;

	struct tpacket3_hdr *last_pkt;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;

	if (po->stats.stats3.tp_drops)
		status |= TP_STATUS_LOSING;

	last_pkt = (struct tpacket3_hdr *)pkc1->prev;
	last_pkt->tp_next_offset = 0;

	/* Get the ts of the last pkt */
	if (BLOCK_NUM_PKTS(pbd1)) {
		h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
		h1->ts_last_pkt.ts_nsec	= last_pkt->tp_nsec;
	} else {
		/* Ok, we tmo'd - so get the current time */
		struct timespec ts;
		getnstimeofday(&ts);
		h1->ts_last_pkt.ts_sec = ts.tv_sec;
		h1->ts_last_pkt.ts_nsec	= ts.tv_nsec;
	}

	smp_wmb();

	/* Flush the block */
	prb_flush_block(pkc1, pbd1, status);

	pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
}
static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
{
	pkc->reset_pending_on_curr_blk = 0;
}

/*
 * Side effect of opening a block:
 *
 * 1) prb_queue is thawed.
 * 2) retire_blk_timer is refreshed.
 */
static void prb_open_block(struct tpacket_kbdq_core *pkc1,
	struct tpacket_block_desc *pbd1)
{
	struct timespec ts;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;

	smp_rmb();

	/* We could have just memset this but we will lose the
	 * flexibility of making the priv area sticky
	 */

	BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
	BLOCK_NUM_PKTS(pbd1) = 0;
	BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	getnstimeofday(&ts);

	h1->ts_first_pkt.ts_sec = ts.tv_sec;
	h1->ts_first_pkt.ts_nsec = ts.tv_nsec;

	pkc1->pkblk_start = (char *)pbd1;
	pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
	BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;

	pbd1->version = pkc1->version;
	pkc1->prev = pkc1->nxt_offset;
	pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;

	prb_thaw_queue(pkc1);
	_prb_refresh_rx_retire_blk_timer(pkc1);

	smp_wmb();
}
/*
 * Queue freeze logic:
 * 1) Assume tp_block_nr = 8 blocks.
 * 2) At time 't0', user opens Rx ring.
 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
 * 4) user-space is either sleeping or processing block '0'.
 * 5) tpacket_rcv is currently filling block '7'; since there is no space left,
 *    it will close block-7, loop around and try to fill block '0'.
 *    call-flow:
 *    __packet_lookup_frame_in_block
 *      prb_retire_current_block()
 *      prb_dispatch_next_block()
 *        |->(BLOCK_STATUS == USER) evaluates to true
 *    5.1) Since block-0 is currently in-use, we just freeze the queue.
 * 6) Now there are two cases:
 *    6.1) Link goes idle right after the queue is frozen.
 *         But remember, the last open_block() refreshed the timer.
 *         When this timer expires, it will refresh itself so that we can
 *         re-open block-0 in the near future.
 *    6.2) Link is busy and keeps on receiving packets. This is a simple
 *         case and __packet_lookup_frame_in_block will check if block-0
 *         is free and can now be re-used.
 */
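/*
 * Illustrative user-space counterpart (a sketch under assumptions, not
 * part of this file): the queue thaws only when the consumer gives the
 * blocking block back, which for TPACKET_V3 looks roughly like:
 *
 *	struct tpacket_block_desc *bd = ring + i * req.tp_block_size;
 *	while (!(bd->hdr.bh1.block_status & TP_STATUS_USER))
 *		poll(&pfd, 1, -1);	// wait for the kernel to close it
 *	walk_packets(bd);		// consume bh1.num_pkts packets
 *	bd->hdr.bh1.block_status = TP_STATUS_KERNEL;	// un-freezes queue
 *
 * (walk_packets() stands in for user code following tp_next_offset.)
 */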
static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
				  struct packet_sock *po)
{
	pkc->reset_pending_on_curr_blk = 1;
	po->stats.stats3.tp_freeze_q_cnt++;
}

#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
/*
 * If the next block is free then we will dispatch it
 * and return a good offset.
 * Else, we will freeze the queue.
 * So, the caller must check the return value.
 */
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po)
{
	struct tpacket_block_desc *pbd;

	smp_rmb();

	/* 1. Get current block num */
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* 2. If this block is currently in_use then freeze the queue */
	if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
		prb_freeze_queue(pkc, po);
		return NULL;
	}

	/*
	 * 3.
	 * open this block and return the offset where the first packet
	 * needs to get stored.
	 */
	prb_open_block(pkc, pbd);
	return (void *)pkc->nxt_offset;
}
static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po, unsigned int status)
{
	struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* retire/close the current block */
	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
		/*
		 * Plug the case where copy_bits() is in progress on
		 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
		 * have space to copy the pkt in the current block and
		 * called prb_retire_current_block()
		 *
		 * We don't need to worry about the TMO case because
		 * the timer-handler already handled this case.
		 */
		if (!(status & TP_STATUS_BLK_TMO)) {
			while (atomic_read(&pkc->blk_fill_in_prog)) {
				/* Waiting for skb_copy_bits to finish... */
				cpu_relax();
			}
		}
		prb_close_block(pkc, pbd, po, status);
		return;
	}
}

static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
				      struct tpacket_block_desc *pbd)
{
	return TP_STATUS_USER & BLOCK_STATUS(pbd);
}

static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
{
	return pkc->reset_pending_on_curr_blk;
}
static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
{
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
	atomic_dec(&pkc->blk_fill_in_prog);
}

static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = skb_get_rxhash(pkc->skb);
}

static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = 0;
}

static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	if (vlan_tx_tag_present(pkc->skb)) {
		ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb);
		ppd->tp_status = TP_STATUS_VLAN_VALID;
	} else {
		ppd->hv1.tp_vlan_tci = 0;
		ppd->tp_status = TP_STATUS_AVAILABLE;
	}
}

static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	prb_fill_vlan_info(pkc, ppd);

	if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
		prb_fill_rxhash(pkc, ppd);
	else
		prb_clear_rxhash(pkc, ppd);
}
static void prb_fill_curr_block(char *curr,
				struct tpacket_kbdq_core *pkc,
				struct tpacket_block_desc *pbd,
				unsigned int len)
{
	struct tpacket3_hdr *ppd;

	ppd = (struct tpacket3_hdr *)curr;
	ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
	pkc->prev = curr;
	pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_NUM_PKTS(pbd) += 1;
	atomic_inc(&pkc->blk_fill_in_prog);
	prb_run_all_ft_ops(pkc, ppd);
}
/* Assumes caller has the sk->rx_queue.lock */
static void *__packet_lookup_frame_in_block(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status,
					    unsigned int len)
{
	struct tpacket_kbdq_core *pkc;
	struct tpacket_block_desc *pbd;
	char *curr, *end;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* Queue is frozen when user space is lagging behind */
	if (prb_queue_frozen(pkc)) {
		/*
		 * Check if that last block which caused the queue to freeze
		 * is still in_use by user-space.
		 */
		if (prb_curr_blk_in_use(pkc, pbd)) {
			/* Can't record this packet */
			return NULL;
		} else {
			/*
			 * Ok, the block was released by user-space.
			 * Now let's open that block.
			 * Opening a block also thaws the queue.
			 * Thawing is a side effect.
			 */
			prb_open_block(pkc, pbd);
		}
	}

	smp_mb();
	curr = pkc->nxt_offset;
	pkc->skb = skb;
	end = (char *)pbd + pkc->kblk_size;

	/* first try the current block */
	if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/* Ok, close the current block */
	prb_retire_current_block(pkc, po, 0);

	/* Now, try to dispatch the next block */
	curr = (char *)prb_dispatch_next_block(pkc, po);
	if (curr) {
		pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/*
	 * No free blocks are available; user_space hasn't caught up yet.
	 * Queue was just frozen and now this packet will get dropped.
	 */
	return NULL;
}
static void *packet_current_rx_frame(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status, unsigned int len)
{
	char *curr = NULL;
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		curr = packet_lookup_frame(po, &po->rx_ring,
					po->rx_ring.head, status);
		return curr;
	case TPACKET_V3:
		return __packet_lookup_frame_in_block(po, skb, status, len);
	default:
		WARN(1, "TPACKET version not supported\n");
		BUG();
		return NULL;
	}
}
static void *prb_lookup_block(struct packet_sock *po,
				     struct packet_ring_buffer *rb,
				     unsigned int idx,
				     int status)
{
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);

	if (status != BLOCK_STATUS(pbd))
		return NULL;
	return pbd;
}

static int prb_previous_blk_num(struct packet_ring_buffer *rb)
{
	unsigned int prev;
	if (rb->prb_bdqc.kactive_blk_num)
		prev = rb->prb_bdqc.kactive_blk_num-1;
	else
		prev = rb->prb_bdqc.knum_blocks-1;
	return prev;
}

/* Assumes caller has held the rx_queue.lock */
static void *__prb_previous_block(struct packet_sock *po,
					 struct packet_ring_buffer *rb,
					 int status)
{
	unsigned int previous = prb_previous_blk_num(rb);
	return prb_lookup_block(po, rb, previous, status);
}

static void *packet_previous_rx_frame(struct packet_sock *po,
					     struct packet_ring_buffer *rb,
					     int status)
{
	if (po->tp_version <= TPACKET_V2)
		return packet_previous_frame(po, rb, status);

	return __prb_previous_block(po, rb, status);
}

static void packet_increment_rx_head(struct packet_sock *po,
					    struct packet_ring_buffer *rb)
{
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		return packet_increment_head(rb);
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return;
	}
}

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
	return packet_lookup_frame(po, rb, previous, status);
}

static void packet_increment_head(struct packet_ring_buffer *buff)
{
	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
}
static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
	struct sock *sk = &po->sk;
	bool has_room;

	if (po->prot_hook.func != tpacket_rcv)
		return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize)
			<= sk->sk_rcvbuf;

	spin_lock(&sk->sk_receive_queue.lock);
	if (po->tp_version == TPACKET_V3)
		has_room = prb_lookup_block(po, &po->rx_ring,
					po->rx_ring.prb_bdqc.kactive_blk_num,
					TP_STATUS_KERNEL);
	else
		has_room = packet_lookup_frame(po, &po->rx_ring,
					po->rx_ring.head,
					TP_STATUS_KERNEL);
	spin_unlock(&sk->sk_receive_queue.lock);

	return has_room;
}
static void packet_sock_destruct(struct sock *sk)
{
	skb_queue_purge(&sk->sk_error_queue);

	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
	WARN_ON(atomic_read(&sk->sk_wmem_alloc));

	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_err("Attempt to release alive packet socket: %p\n", sk);
		return;
	}

	sk_refcnt_debug_dec(sk);
}
static int fanout_rr_next(struct packet_fanout *f, unsigned int num)
{
	int x = atomic_read(&f->rr_cur) + 1;

	if (x >= num)
		x = 0;

	return x;
}

static unsigned int fanout_demux_hash(struct packet_fanout *f,
				      struct sk_buff *skb,
				      unsigned int num)
{
	return reciprocal_divide(skb->rxhash, num);
}

static unsigned int fanout_demux_lb(struct packet_fanout *f,
				    struct sk_buff *skb,
				    unsigned int num)
{
	int cur, old;

	cur = atomic_read(&f->rr_cur);
	while ((old = atomic_cmpxchg(&f->rr_cur, cur,
				     fanout_rr_next(f, num))) != cur)
		cur = old;
	return cur;
}

static unsigned int fanout_demux_cpu(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return smp_processor_id() % num;
}

static unsigned int fanout_demux_rnd(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return reciprocal_divide(prandom_u32(), num);
}

static unsigned int fanout_demux_rollover(struct packet_fanout *f,
					  struct sk_buff *skb,
					  unsigned int idx, unsigned int skip,
					  unsigned int num)
{
	unsigned int i, j;

	i = j = min_t(int, f->next[idx], num - 1);
	do {
		if (i != skip && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) {
			if (i != j)
				f->next[idx] = i;
			return i;
		}
		if (++i == num)
			i = 0;
	} while (i != j);

	return idx;
}

static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
{
	return f->flags & (flag >> 8);
}
static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
			     struct packet_type *pt, struct net_device *orig_dev)
{
	struct packet_fanout *f = pt->af_packet_priv;
	unsigned int num = f->num_members;
	struct packet_sock *po;
	unsigned int idx;

	if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
	    !num) {
		kfree_skb(skb);
		return 0;
	}

	switch (f->type) {
	case PACKET_FANOUT_HASH:
	default:
		if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
			skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET);
			if (!skb)
				return 0;
		}
		skb_get_rxhash(skb);
		idx = fanout_demux_hash(f, skb, num);
		break;
	case PACKET_FANOUT_LB:
		idx = fanout_demux_lb(f, skb, num);
		break;
	case PACKET_FANOUT_CPU:
		idx = fanout_demux_cpu(f, skb, num);
		break;
	case PACKET_FANOUT_RND:
		idx = fanout_demux_rnd(f, skb, num);
		break;
	case PACKET_FANOUT_ROLLOVER:
		idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num);
		break;
	}

	po = pkt_sk(f->arr[idx]);
	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER) &&
	    unlikely(!packet_rcv_has_room(po, skb))) {
		idx = fanout_demux_rollover(f, skb, idx, idx, num);
		po = pkt_sk(f->arr[idx]);
	}

	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
}
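/*
 * Illustrative sketch (assumed user-space code, not part of this file):
 * a socket joins a fanout group with a single setsockopt(); all members
 * sharing the 16-bit group id receive a load-balanced share of traffic:
 *
 *	int id  = 42;				// arbitrary group id
 *	int arg = id | (PACKET_FANOUT_HASH << 16);
 *	setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &arg, sizeof(arg));
 */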
DEFINE_MUTEX(fanout_mutex);
EXPORT_SYMBOL_GPL(fanout_mutex);
static LIST_HEAD(fanout_list);

static void __fanout_link(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;

	spin_lock(&f->lock);
	f->arr[f->num_members] = sk;
	smp_wmb();
	f->num_members++;
	spin_unlock(&f->lock);
}

static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;
	int i;

	spin_lock(&f->lock);
	for (i = 0; i < f->num_members; i++) {
		if (f->arr[i] == sk)
			break;
	}
	BUG_ON(i >= f->num_members);
	f->arr[i] = f->arr[f->num_members - 1];
	f->num_members--;
	spin_unlock(&f->lock);
}
static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
{
	if (ptype->af_packet_priv == (void *)((struct packet_sock *)sk)->fanout)
		return true;

	return false;
}
static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f, *match;
	u8 type = type_flags & 0xff;
	u8 flags = type_flags >> 8;
	int err;

	switch (type) {
	case PACKET_FANOUT_ROLLOVER:
		if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
			return -EINVAL;
	case PACKET_FANOUT_HASH:
	case PACKET_FANOUT_LB:
	case PACKET_FANOUT_CPU:
	case PACKET_FANOUT_RND:
		break;
	default:
		return -EINVAL;
	}

	if (!po->running)
		return -EINVAL;

	if (po->fanout)
		return -EALREADY;

	mutex_lock(&fanout_mutex);
	match = NULL;
	list_for_each_entry(f, &fanout_list, list) {
		if (f->id == id &&
		    read_pnet(&f->net) == sock_net(sk)) {
			match = f;
			break;
		}
	}
	err = -EINVAL;
	if (match && match->flags != flags)
		goto out;
	if (!match) {
		err = -ENOMEM;
		match = kzalloc(sizeof(*match), GFP_KERNEL);
		if (!match)
			goto out;
		write_pnet(&match->net, sock_net(sk));
		match->id = id;
		match->type = type;
		match->flags = flags;
		atomic_set(&match->rr_cur, 0);
		INIT_LIST_HEAD(&match->list);
		spin_lock_init(&match->lock);
		atomic_set(&match->sk_ref, 0);
		match->prot_hook.type = po->prot_hook.type;
		match->prot_hook.dev = po->prot_hook.dev;
		match->prot_hook.func = packet_rcv_fanout;
		match->prot_hook.af_packet_priv = match;
		match->prot_hook.id_match = match_fanout_group;
		dev_add_pack(&match->prot_hook);
		list_add(&match->list, &fanout_list);
	}
	err = -EINVAL;
	if (match->type == type &&
	    match->prot_hook.type == po->prot_hook.type &&
	    match->prot_hook.dev == po->prot_hook.dev) {
		err = -ENOSPC;
		if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
			__dev_remove_pack(&po->prot_hook);
			po->fanout = match;
			atomic_inc(&match->sk_ref);
			__fanout_link(sk, po);
			err = 0;
		}
	}
out:
	mutex_unlock(&fanout_mutex);
	return err;
}
static void fanout_release(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f;

	f = po->fanout;
	if (!f)
		return;

	mutex_lock(&fanout_mutex);
	po->fanout = NULL;

	if (atomic_dec_and_test(&f->sk_ref)) {
		list_del(&f->list);
		dev_remove_pack(&f->prot_hook);
		kfree(f);
	}
	mutex_unlock(&fanout_mutex);
}
static const struct proto_ops packet_ops;

static const struct proto_ops packet_ops_spkt;

static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_pkt *spkt;

	/*
	 *	When we registered the protocol we saved the socket in the data
	 *	field for just this event.
	 */

	sk = pt->af_packet_priv;

	/*
	 *	Yank back the headers [hope the device set this
	 *	right or kerboom...]
	 *
	 *	Incoming packets have the ll header pulled,
	 *	push it back.
	 *
	 *	For outgoing ones skb->data == skb_mac_header(skb)
	 *	so that this procedure is a noop.
	 */

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto out;

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto out;

	skb = skb_share_check(skb, GFP_ATOMIC);
	if (skb == NULL)
		goto oom;

	/* drop any routing info */
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset(skb);

	spkt = &PACKET_SKB_CB(skb)->sa.pkt;

	skb_push(skb, skb->data - skb_mac_header(skb));

	/*
	 *	The SOCK_PACKET socket receives _all_ frames.
	 */

	spkt->spkt_family = dev->type;
	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
	spkt->spkt_protocol = skb->protocol;

	/*
	 *	Charge the memory to the socket. This is done specifically
	 *	to prevent sockets using all the memory up.
	 */

	if (sock_queue_rcv_skb(sk, skb) == 0)
		return 0;

out:
	kfree_skb(skb);
oom:
	return 0;
}
/*
 *	Output a raw packet to a device layer. This bypasses all the other
 *	protocol layers and you must therefore supply it with a complete frame
 */

static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
			       struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
	struct sk_buff *skb = NULL;
	struct net_device *dev;
	__be16 proto = 0;
	int err;
	int extra_len = 0;

	/*
	 *	Get and verify the address.
	 */

	if (saddr) {
		if (msg->msg_namelen < sizeof(struct sockaddr))
			return -EINVAL;
		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
			proto = saddr->spkt_protocol;
	} else
		return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */

	/*
	 *	Find the device first to size check it
	 */

	saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
retry:
	rcu_read_lock();
	dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
	err = -ENODEV;
	if (dev == NULL)
		goto out_unlock;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_unlock;

	/*
	 * You may not queue a frame bigger than the mtu. This is the lowest level
	 * raw protocol and you must do your own fragmentation at this level.
	 */

	if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
		if (!netif_supports_nofcs(dev)) {
			err = -EPROTONOSUPPORT;
			goto out_unlock;
		}
		extra_len = 4; /* We're doing our own CRC */
	}

	err = -EMSGSIZE;
	if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
		goto out_unlock;

	if (!skb) {
		size_t reserved = LL_RESERVED_SPACE(dev);
		int tlen = dev->needed_tailroom;
		unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;

		rcu_read_unlock();
		skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
		if (skb == NULL)
			return -ENOBUFS;
		/* FIXME: Save some space for broken drivers that write a hard
		 * header at transmission time by themselves. PPP is the notable
		 * one here. This should really be fixed at the driver level.
		 */
		skb_reserve(skb, reserved);
		skb_reset_network_header(skb);

		/* Try to align data part correctly */
		if (hhlen) {
			skb->data -= hhlen;
			skb->tail -= hhlen;
			if (len < hhlen)
				skb_reset_network_header(skb);
		}
		err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
		if (err)
			goto out_free;
		goto retry;
	}

	if (len > (dev->mtu + dev->hard_header_len + extra_len)) {
		/* Earlier code assumed this would be a VLAN pkt,
		 * double-check this now that we have the actual
		 * packet in hand.
		 */
		struct ethhdr *ehdr;
		skb_reset_mac_header(skb);
		ehdr = eth_hdr(skb);
		if (ehdr->h_proto != htons(ETH_P_8021Q)) {
			err = -EMSGSIZE;
			goto out_unlock;
		}
	}

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);

	if (unlikely(extra_len == 4))
		skb->no_fcs = 1;

	skb_probe_transport_header(skb, 0);

	dev_queue_xmit(skb);
	rcu_read_unlock();
	return len;

out_unlock:
	rcu_read_unlock();
out_free:
	kfree_skb(skb);
	return err;
}
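/*
 * Illustrative sketch (assumed user-space code, not part of this file):
 * the legacy SOCK_PACKET transmit path above is driven by sendto() with
 * a struct sockaddr_pkt naming the device:
 *
 *	struct sockaddr_pkt sp = { .spkt_family = AF_PACKET };
 *	strncpy((char *)sp.spkt_device, "eth0", sizeof(sp.spkt_device));
 *	sp.spkt_protocol = htons(ETH_P_IP);
 *	sendto(fd, frame, frame_len, 0,
 *	       (struct sockaddr *)&sp, sizeof(sp));
 */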
static unsigned int run_filter(const struct sk_buff *skb,
				      const struct sock *sk,
				      unsigned int res)
{
	struct sk_filter *filter;

	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
	if (filter != NULL)
		res = SK_RUN_FILTER(filter, skb);
	rcu_read_unlock();

	return res;
}
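/*
 * Illustrative sketch (assumed user-space code, not part of this file):
 * the filter consulted above is installed with SO_ATTACH_FILTER; the
 * one-instruction classic BPF program below accepts every packet whole:
 *
 *	struct sock_filter code[] = {
 *		{ BPF_RET | BPF_K, 0, 0, 0xffffffff },	// accept all bytes
 *	};
 *	struct sock_fprog prog = { .len = 1, .filter = code };
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 */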
/*
 * This function makes lazy skb cloning in hope that most of packets
 * are discarded by BPF.
 *
 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
 * and skb->cb are mangled. It works because (and until) packets
 * falling here are owned by current CPU. Output packets are cloned
 * by dev_queue_xmit_nit(), input packets are processed by net_bh
 * sequentially, so that if we return skb to original state on exit,
 * we will not harm anyone.
 */
static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
		      struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_ll *sll;
	struct packet_sock *po;
	u8 *skb_head = skb->data;
	int skb_len = skb->len;
	unsigned int snaplen, res;

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto drop;

	skb->dev = dev;

	if (dev->header_ops) {
		/* The device has an explicit notion of ll header,
		 * exported to higher levels.
		 *
		 * Otherwise, the device hides details of its frame
		 * structure, so that corresponding packet head is
		 * never delivered to user.
		 */
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb_mac_header(skb));
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb_network_offset(skb));
		}
	}

	snaplen = skb->len;

	res = run_filter(skb, sk, snaplen);
	if (!res)
		goto drop_n_restore;
	if (snaplen > res)
		snaplen = res;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
		goto drop_n_acct;

	if (skb_shared(skb)) {
		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
		if (nskb == NULL)
			goto drop_n_acct;

		if (skb_head != skb->data) {
			skb->data = skb_head;
			skb->len = skb_len;
		}
		consume_skb(skb);
		skb = nskb;
	}

	BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
		     sizeof(skb->cb));

	sll = &PACKET_SKB_CB(skb)->sa.ll;
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	if (unlikely(po->origdev))
		sll->sll_ifindex = orig_dev->ifindex;
	else
		sll->sll_ifindex = dev->ifindex;

	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);

	PACKET_SKB_CB(skb)->origlen = skb->len;

	if (pskb_trim(skb, snaplen))
		goto drop_n_acct;

	skb_set_owner_r(skb, sk);
	skb->dev = NULL;
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset(skb);

	spin_lock(&sk->sk_receive_queue.lock);
	po->stats.stats1.tp_packets++;
	skb->dropcount = atomic_read(&sk->sk_drops);
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	spin_unlock(&sk->sk_receive_queue.lock);
	sk->sk_data_ready(sk, skb->len);
	return 0;

drop_n_acct:
	spin_lock(&sk->sk_receive_queue.lock);
	po->stats.stats1.tp_drops++;
	atomic_inc(&sk->sk_drops);
	spin_unlock(&sk->sk_receive_queue.lock);

drop_n_restore:
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	consume_skb(skb);
	return 0;
}
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct packet_sock *po;
	struct sockaddr_ll *sll;
	union tpacket_uhdr h;
	u8 *skb_head = skb->data;
	int skb_len = skb->len;
	unsigned int snaplen, res;
	unsigned long status = TP_STATUS_USER;
	unsigned short macoff, netoff, hdrlen;
	struct sk_buff *copy_skb = NULL;
	struct timespec ts;
	__u32 ts_status;

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto drop;

	if (dev->header_ops) {
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb_mac_header(skb));
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb_network_offset(skb));
		}
	}

	if (skb->ip_summed == CHECKSUM_PARTIAL)
		status |= TP_STATUS_CSUMNOTREADY;

	snaplen = skb->len;

	res = run_filter(skb, sk, snaplen);
	if (!res)
		goto drop_n_restore;
	if (snaplen > res)
		snaplen = res;

	if (sk->sk_type == SOCK_DGRAM) {
		macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
				  po->tp_reserve;
	} else {
		unsigned int maclen = skb_network_offset(skb);
		netoff = TPACKET_ALIGN(po->tp_hdrlen +
				       (maclen < 16 ? 16 : maclen)) +
			po->tp_reserve;
		macoff = netoff - maclen;
	}
	if (po->tp_version <= TPACKET_V2) {
		if (macoff + snaplen > po->rx_ring.frame_size) {
			if (po->copy_thresh &&
			    atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
				if (skb_shared(skb)) {
					copy_skb = skb_clone(skb, GFP_ATOMIC);
				} else {
					copy_skb = skb_get(skb);
					skb_head = skb->data;
				}
				if (copy_skb)
					skb_set_owner_r(copy_skb, sk);
			}
			snaplen = po->rx_ring.frame_size - macoff;
			if ((int)snaplen < 0)
				snaplen = 0;
		}
	}
	spin_lock(&sk->sk_receive_queue.lock);
	h.raw = packet_current_rx_frame(po, skb,
					TP_STATUS_KERNEL, (macoff+snaplen));
	if (!h.raw)
		goto ring_is_full;
	if (po->tp_version <= TPACKET_V2) {
		packet_increment_rx_head(po, &po->rx_ring);
	/*
	 * LOSING will be reported till you read the stats,
	 * because it's COR - Clear On Read.
	 * Anyways, moving it for V1/V2 only as V3 doesn't need this
	 * at packet level.
	 */
		if (po->stats.stats1.tp_drops)
			status |= TP_STATUS_LOSING;
	}
	po->stats.stats1.tp_packets++;
	if (copy_skb) {
		status |= TP_STATUS_COPY;
		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	skb_copy_bits(skb, 0, h.raw + macoff, snaplen);

	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
		getnstimeofday(&ts);

	status |= ts_status;

	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_len = skb->len;
		h.h1->tp_snaplen = snaplen;
		h.h1->tp_mac = macoff;
		h.h1->tp_net = netoff;
		h.h1->tp_sec = ts.tv_sec;
		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
		hdrlen = sizeof(*h.h1);
		break;
	case TPACKET_V2:
		h.h2->tp_len = skb->len;
		h.h2->tp_snaplen = snaplen;
		h.h2->tp_mac = macoff;
		h.h2->tp_net = netoff;
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		if (vlan_tx_tag_present(skb)) {
			h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
			status |= TP_STATUS_VLAN_VALID;
		} else {
			h.h2->tp_vlan_tci = 0;
		}
		h.h2->tp_padding = 0;
		hdrlen = sizeof(*h.h2);
		break;
	case TPACKET_V3:
		/* tp_nxt_offset,vlan are already populated above.
		 * So DONT clear those fields here
		 */
		h.h3->tp_status |= status;
		h.h3->tp_len = skb->len;
		h.h3->tp_snaplen = snaplen;
		h.h3->tp_mac = macoff;
		h.h3->tp_net = netoff;
		h.h3->tp_sec = ts.tv_sec;
		h.h3->tp_nsec = ts.tv_nsec;
		hdrlen = sizeof(*h.h3);
		break;
	default:
		BUG();
	}

	sll = h.raw + TPACKET_ALIGN(hdrlen);
	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	if (unlikely(po->origdev))
		sll->sll_ifindex = orig_dev->ifindex;
	else
		sll->sll_ifindex = dev->ifindex;

	smp_mb();

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	{
		u8 *start, *end;

		if (po->tp_version <= TPACKET_V2) {
			end = (u8 *)PAGE_ALIGN((unsigned long)h.raw
				+ macoff + snaplen);
			for (start = h.raw; start < end; start += PAGE_SIZE)
				flush_dcache_page(pgv_to_page(start));
		}
		smp_wmb();
	}
#endif

	if (po->tp_version <= TPACKET_V2)
		__packet_set_status(po, h.raw, status);
	else
		prb_clear_blk_fill_status(&po->rx_ring);

	sk->sk_data_ready(sk, 0);

drop_n_restore:
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	kfree_skb(skb);
	return 0;

ring_is_full:
	po->stats.stats1.tp_drops++;
	spin_unlock(&sk->sk_receive_queue.lock);

	sk->sk_data_ready(sk, 0);
	kfree_skb(copy_skb);
	goto drop_n_restore;
}
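/*
 * Illustrative sketch (assumed user-space code, not part of this file):
 * the ring tpacket_rcv() fills is created with PACKET_VERSION plus
 * PACKET_RX_RING and then mmap()ed; frame/block counts must stay
 * mutually consistent:
 *
 *	int ver = TPACKET_V3;
 *	struct tpacket_req3 req = {
 *		.tp_block_size	   = 1 << 20,	// 1MB blocks
 *		.tp_block_nr	   = 8,
 *		.tp_frame_size	   = 2048,
 *		.tp_frame_nr	   = ((1 << 20) / 2048) * 8,
 *		.tp_retire_blk_tov = 60,	// ms, see timer logic above
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	void *ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */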
static void tpacket_destruct_skb(struct sk_buff *skb)
{
	struct packet_sock *po = pkt_sk(skb->sk);
	void *ph;

	if (likely(po->tx_ring.pg_vec)) {
		__u32 ts;

		ph = skb_shinfo(skb)->destructor_arg;
		BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
		atomic_dec(&po->tx_ring.pending);

		ts = __packet_set_timestamp(po, ph, skb);
		__packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
	}

	sock_wfree(skb);
}
static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
		void *frame, struct net_device *dev, int size_max,
		__be16 proto, unsigned char *addr, int hlen)
{
	union tpacket_uhdr ph;
	int to_write, offset, len, tp_len, nr_frags, len_max;
	struct socket *sock = po->sk.sk_socket;
	struct page *page;
	void *data;
	int err;

	ph.raw = frame;

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = po->sk.sk_priority;
	skb->mark = po->sk.sk_mark;
	sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
	skb_shinfo(skb)->destructor_arg = ph.raw;

	switch (po->tp_version) {
	case TPACKET_V2:
		tp_len = ph.h2->tp_len;
		break;
	default:
		tp_len = ph.h1->tp_len;
		break;
	}
	if (unlikely(tp_len > size_max)) {
		pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
		return -EMSGSIZE;
	}

	skb_reserve(skb, hlen);
	skb_reset_network_header(skb);
	skb_probe_transport_header(skb, 0);

	if (po->tp_tx_has_off) {
		int off_min, off_max, off;
		off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
		off_max = po->tx_ring.frame_size - tp_len;
		if (sock->type == SOCK_DGRAM) {
			switch (po->tp_version) {
			case TPACKET_V2:
				off = ph.h2->tp_net;
				break;
			default:
				off = ph.h1->tp_net;
				break;
			}
		} else {
			switch (po->tp_version) {
			case TPACKET_V2:
				off = ph.h2->tp_mac;
				break;
			default:
				off = ph.h1->tp_mac;
				break;
			}
		}
		if (unlikely((off < off_min) || (off_max < off)))
			return -EINVAL;
		data = ph.raw + off;
	} else {
		data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
	}
	to_write = tp_len;

	if (sock->type == SOCK_DGRAM) {
		err = dev_hard_header(skb, dev, ntohs(proto), addr,
				NULL, tp_len);
		if (unlikely(err < 0))
			return -EINVAL;
	} else if (dev->hard_header_len) {
		/* net device doesn't like empty head */
		if (unlikely(tp_len <= dev->hard_header_len)) {
			pr_err("packet size is too short (%d < %d)\n",
			       tp_len, dev->hard_header_len);
			return -EINVAL;
		}

		skb_push(skb, dev->hard_header_len);
		err = skb_store_bits(skb, 0, data,
				dev->hard_header_len);
		if (unlikely(err))
			return err;

		data += dev->hard_header_len;
		to_write -= dev->hard_header_len;
	}

	offset = offset_in_page(data);
	len_max = PAGE_SIZE - offset;
	len = ((to_write > len_max) ? len_max : to_write);

	skb->data_len = to_write;
	skb->len += to_write;
	skb->truesize += to_write;
	atomic_add(to_write, &po->sk.sk_wmem_alloc);

	while (likely(to_write)) {
		nr_frags = skb_shinfo(skb)->nr_frags;

		if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
			pr_err("Packet exceeds the number of skb frags(%lu)\n",
			       MAX_SKB_FRAGS);
			return -EFAULT;
		}

		page = pgv_to_page(data);
		data += len;
		flush_dcache_page(page);
		get_page(page);
		skb_fill_page_desc(skb, nr_frags, page, offset, len);
		to_write -= len;
		offset = 0;
		len_max = PAGE_SIZE;
		len = ((to_write > len_max) ? len_max : to_write);
	}

	return tp_len;
}
static struct net_device *packet_cached_dev_get(struct packet_sock *po)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = rcu_dereference(po->cached_dev);
	if (likely(dev))
		dev_hold(dev);
	rcu_read_unlock();

	return dev;
}
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
{
	struct sk_buff *skb;
	struct net_device *dev;
	__be16 proto;
	int err, reserve = 0;
	void *ph;
	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
	int tp_len, size_max;
	unsigned char *addr;
	int len_sum = 0;
	int status = TP_STATUS_AVAILABLE;
	int hlen, tlen;

	mutex_lock(&po->pg_vec_lock);

	if (saddr == NULL) {
		dev	= packet_cached_dev_get(po);
		proto	= po->num;
		addr	= NULL;
	} else {
		err = -EINVAL;
		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
			goto out;
		if (msg->msg_namelen < (saddr->sll_halen
					+ offsetof(struct sockaddr_ll,
						sll_addr)))
			goto out;
		proto	= saddr->sll_protocol;
		addr	= saddr->sll_addr;
		dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
	}

	err = -ENXIO;
	if (unlikely(dev == NULL))
		goto out;
	err = -ENETDOWN;
	if (unlikely(!(dev->flags & IFF_UP)))
		goto out_put;

	reserve = dev->hard_header_len;

	size_max = po->tx_ring.frame_size
		- (po->tp_hdrlen - sizeof(struct sockaddr_ll));

	if (size_max > dev->mtu + reserve)
		size_max = dev->mtu + reserve;

	do {
		ph = packet_current_frame(po, &po->tx_ring,
					  TP_STATUS_SEND_REQUEST);

		if (unlikely(ph == NULL)) {
			schedule();
			continue;
		}

		status = TP_STATUS_SEND_REQUEST;
		hlen = LL_RESERVED_SPACE(dev);
		tlen = dev->needed_tailroom;
		skb = sock_alloc_send_skb(&po->sk,
				hlen + tlen + sizeof(struct sockaddr_ll),
				0, &err);

		if (unlikely(skb == NULL))
			goto out_status;

		tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
					  addr, hlen);

		if (unlikely(tp_len < 0)) {
			if (po->tp_loss) {
				__packet_set_status(po, ph,
						TP_STATUS_AVAILABLE);
				packet_increment_head(&po->tx_ring);
				kfree_skb(skb);
				continue;
			} else {
				status = TP_STATUS_WRONG_FORMAT;
				err = tp_len;
				goto out_status;
			}
		}

		skb->destructor = tpacket_destruct_skb;
		__packet_set_status(po, ph, TP_STATUS_SENDING);
		atomic_inc(&po->tx_ring.pending);

		status = TP_STATUS_SEND_REQUEST;
		err = dev_queue_xmit(skb);
		if (unlikely(err > 0)) {
			err = net_xmit_errno(err);
			if (err && __packet_get_status(po, ph) ==
				   TP_STATUS_AVAILABLE) {
				/* skb was destructed already */
				skb = NULL;
				goto out_status;
			}
			/*
			 * skb was dropped but not destructed yet;
			 * let's treat it like congestion or err < 0
			 */
			err = 0;
		}
		packet_increment_head(&po->tx_ring);
		len_sum += tp_len;
	} while (likely((ph != NULL) ||
			((!(msg->msg_flags & MSG_DONTWAIT)) &&
			 (atomic_read(&po->tx_ring.pending))))
		);

	err = len_sum;
	goto out_put;

out_status:
	__packet_set_status(po, ph, status);
	kfree_skb(skb);
out_put:
	dev_put(dev);
out:
	mutex_unlock(&po->pg_vec_lock);
	return err;
}
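/*
 * Illustrative user-space counterpart (a sketch under assumptions, not
 * part of this file): tpacket_snd() drains frames that the user marked
 * TP_STATUS_SEND_REQUEST; a zero-length send() kicks the loop above:
 *
 *	struct tpacket_hdr *hdr = tx_ring + i * frame_size;
 *	memcpy((char *)hdr + TPACKET_HDRLEN - sizeof(struct sockaddr_ll),
 *	       frame, frame_len);		// payload after the header
 *	hdr->tp_len = frame_len;
 *	hdr->tp_status = TP_STATUS_SEND_REQUEST;
 *	send(fd, NULL, 0, 0);			// flush the TX ring
 */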
static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
					size_t reserve, size_t len,
					size_t linear, int noblock,
					int *err)
{
	struct sk_buff *skb;

	/* Under a page?  Don't bother with paged skb. */
	if (prepad + len < PAGE_SIZE || !linear)
		linear = len;

	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
				   err);
	if (!skb)
		return NULL;

	skb_reserve(skb, reserve);
	skb_put(skb, linear);
	skb->data_len = len - linear;
	skb->len += len - linear;

	return skb;
}
static int packet_snd(struct socket *sock,
		      struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
	struct sk_buff *skb;
	struct net_device *dev;
	__be16 proto;
	unsigned char *addr;
	int err, reserve = 0;
	struct virtio_net_hdr vnet_hdr = { 0 };
	int offset = 0;
	int vnet_hdr_len;
	struct packet_sock *po = pkt_sk(sk);
	unsigned short gso_type = 0;
	int hlen, tlen;
	int extra_len = 0;

	/*
	 *	Get and verify the address.
	 */

	if (saddr == NULL) {
		dev	= packet_cached_dev_get(po);
		proto	= po->num;
		addr	= NULL;
	} else {
		err = -EINVAL;
		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
			goto out;
		if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
			goto out;
		proto	= saddr->sll_protocol;
		addr	= saddr->sll_addr;
		dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
	}

	err = -ENXIO;
	if (unlikely(dev == NULL))
		goto out_unlock;
	err = -ENETDOWN;
	if (unlikely(!(dev->flags & IFF_UP)))
		goto out_unlock;

	if (sock->type == SOCK_RAW)
		reserve = dev->hard_header_len;
	if (po->has_vnet_hdr) {
		vnet_hdr_len = sizeof(vnet_hdr);

		err = -EINVAL;
		if (len < vnet_hdr_len)
			goto out_unlock;

		len -= vnet_hdr_len;

		err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
				       vnet_hdr_len);
		if (err < 0)
			goto out_unlock;

		if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
		    (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
		      vnet_hdr.hdr_len))
			vnet_hdr.hdr_len = vnet_hdr.csum_start +
						 vnet_hdr.csum_offset + 2;

		err = -EINVAL;
		if (vnet_hdr.hdr_len > len)
			goto out_unlock;

		if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
			switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
			case VIRTIO_NET_HDR_GSO_TCPV4:
				gso_type = SKB_GSO_TCPV4;
				break;
			case VIRTIO_NET_HDR_GSO_TCPV6:
				gso_type = SKB_GSO_TCPV6;
				break;
			case VIRTIO_NET_HDR_GSO_UDP:
				gso_type = SKB_GSO_UDP;
				break;
			default:
				goto out_unlock;
			}

			if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
				gso_type |= SKB_GSO_TCP_ECN;

			if (vnet_hdr.gso_size == 0)
				goto out_unlock;
		}
	}

	if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
		if (!netif_supports_nofcs(dev)) {
			err = -EPROTONOSUPPORT;
			goto out_unlock;
		}
		extra_len = 4; /* We're doing our own CRC */
	}

	err = -EMSGSIZE;
	if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
		goto out_unlock;

	err = -ENOBUFS;
	hlen = LL_RESERVED_SPACE(dev);
	tlen = dev->needed_tailroom;
	skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, vnet_hdr.hdr_len,
			       msg->msg_flags & MSG_DONTWAIT, &err);
	if (skb == NULL)
		goto out_unlock;

	skb_set_network_header(skb, reserve);

	err = -EINVAL;
	if (sock->type == SOCK_DGRAM &&
	    (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
		goto out_free;

	/* Returns -EFAULT on error */
	err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
	if (err)
		goto out_free;

	sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);

	if (!gso_type && (len > dev->mtu + reserve + extra_len)) {
		/* Earlier code assumed this would be a VLAN pkt,
		 * double-check this now that we have the actual
		 * packet in hand.
		 */
		struct ethhdr *ehdr;
		skb_reset_mac_header(skb);
		ehdr = eth_hdr(skb);
		if (ehdr->h_proto != htons(ETH_P_8021Q)) {
			err = -EMSGSIZE;
			goto out_free;
		}
	}

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	if (po->has_vnet_hdr) {
		if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
			if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
						  vnet_hdr.csum_offset)) {
				err = -EINVAL;
				goto out_free;
			}
		}

		skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
		skb_shinfo(skb)->gso_type = gso_type;

		/* Header must be checked, and gso_segs computed. */
		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
		skb_shinfo(skb)->gso_segs = 0;

		len += vnet_hdr_len;
	}

	skb_probe_transport_header(skb, reserve);

	if (unlikely(extra_len == 4))
		skb->no_fcs = 1;

	/*
	 *	Now send it
	 */

	err = dev_queue_xmit(skb);
	if (err > 0 && (err = net_xmit_errno(err)) != 0)
		goto out_unlock;

	dev_put(dev);

	return len;

out_free:
	kfree_skb(skb);
out_unlock:
	if (dev)
		dev_put(dev);
out:
	return err;
}
static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
		struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);

	if (po->tx_ring.pg_vec)
		return tpacket_snd(po, msg);
	else
		return packet_snd(sock, msg, len);
}
/*
 *	Close a PACKET socket. This is fairly simple. We immediately go
 *	to 'closed' state and remove our protocol entry in the device list.
 */

static int packet_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po;
	struct net *net;
	union tpacket_req_u req_u;

	if (!sk)
		return 0;

	net = sock_net(sk);
	po = pkt_sk(sk);

	mutex_lock(&net->packet.sklist_lock);
	sk_del_node_init_rcu(sk);
	mutex_unlock(&net->packet.sklist_lock);

	preempt_disable();
	sock_prot_inuse_add(net, sk->sk_prot, -1);
	preempt_enable();

	spin_lock(&po->bind_lock);
	unregister_prot_hook(sk, false);
	if (po->prot_hook.dev) {
		dev_put(po->prot_hook.dev);
		po->prot_hook.dev = NULL;
	}
	spin_unlock(&po->bind_lock);

	packet_flush_mclist(sk);

	if (po->rx_ring.pg_vec) {
		memset(&req_u, 0, sizeof(req_u));
		packet_set_ring(sk, &req_u, 1, 0);
	}

	if (po->tx_ring.pg_vec) {
		memset(&req_u, 0, sizeof(req_u));
		packet_set_ring(sk, &req_u, 1, 1);
	}

	fanout_release(sk);

	synchronize_net();
	/*
	 *	Now the socket is dead. No more input will appear.
	 */
	sock_orphan(sk);
	sock->sk = NULL;

	/* Purge queues */

	skb_queue_purge(&sk->sk_receive_queue);
	sk_refcnt_debug_release(sk);

	sock_put(sk);
	return 0;
}
/*
 *	Attach a packet hook.
 */

static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
{
	struct packet_sock *po = pkt_sk(sk);

	if (po->fanout) {
		if (dev)
			dev_put(dev);

		return -EINVAL;
	}

	lock_sock(sk);

	spin_lock(&po->bind_lock);
	unregister_prot_hook(sk, true);

	po->num = protocol;
	po->prot_hook.type = protocol;
	if (po->prot_hook.dev)
		dev_put(po->prot_hook.dev);
	po->prot_hook.dev = dev;

	po->ifindex = dev ? dev->ifindex : 0;

	if (protocol == 0)
		goto out_unlock;

	if (!dev || (dev->flags & IFF_UP)) {
		register_prot_hook(sk);
	} else {
		sk->sk_err = ENETDOWN;
		if (!sock_flag(sk, SOCK_DEAD))
			sk->sk_error_report(sk);
	}

out_unlock:
	spin_unlock(&po->bind_lock);
	release_sock(sk);
	return 0;
}
/*
 *	Bind a packet socket to a device
 */

static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
			    int addr_len)
{
	struct sock *sk = sock->sk;
	char name[15];
	struct net_device *dev;
	int err = -ENODEV;

	/*
	 *	Check legality
	 */

	if (addr_len != sizeof(struct sockaddr))
		return -EINVAL;
	strlcpy(name, uaddr->sa_data, sizeof(name));

	dev = dev_get_by_name(sock_net(sk), name);
	if (dev)
		err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
	return err;
}

static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
	struct sock *sk = sock->sk;
	struct net_device *dev = NULL;
	int err;

	/*
	 *	Check legality
	 */

	if (addr_len < sizeof(struct sockaddr_ll))
		return -EINVAL;
	if (sll->sll_family != AF_PACKET)
		return -EINVAL;

	if (sll->sll_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
		if (dev == NULL)
			goto out;
	}
	err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);

out:
	return err;
}
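/*
 * Illustrative sketch (assumed user-space code, not part of this file):
 * binding a packet socket to a single interface; for bind() only
 * sll_family, sll_protocol and sll_ifindex are consulted:
 *
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = if_nametoindex("eth0"),
 *	};
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */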
2589 static struct proto packet_proto = {
2591 .owner = THIS_MODULE,
2592 .obj_size = sizeof(struct packet_sock),
2596 * Create a packet of type SOCK_PACKET.
static int packet_create(struct net *net, struct socket *sock, int protocol,
			 int kern)
{
	struct sock *sk;
	struct packet_sock *po;
	__be16 proto = (__force __be16)protocol; /* weird, but documented */
	int err;

	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
	    sock->type != SOCK_PACKET)
		return -ESOCKTNOSUPPORT;

	sock->state = SS_UNCONNECTED;

	err = -ENOBUFS;
	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
	if (sk == NULL)
		goto out;

	sock->ops = &packet_ops;
	if (sock->type == SOCK_PACKET)
		sock->ops = &packet_ops_spkt;

	sock_init_data(sock, sk);

	po = pkt_sk(sk);
	sk->sk_family = PF_PACKET;
	po->num = proto;
	RCU_INIT_POINTER(po->cached_dev, NULL);

	sk->sk_destruct = packet_sock_destruct;
	sk_refcnt_debug_inc(sk);

	/*
	 *	Attach a protocol block
	 */

	spin_lock_init(&po->bind_lock);
	mutex_init(&po->pg_vec_lock);
	po->prot_hook.func = packet_rcv;

	if (sock->type == SOCK_PACKET)
		po->prot_hook.func = packet_rcv_spkt;

	po->prot_hook.af_packet_priv = sk;

	if (proto) {
		po->prot_hook.type = proto;
		register_prot_hook(sk);
	}

	mutex_lock(&net->packet.sklist_lock);
	sk_add_node_rcu(sk, &net->packet.sklist);
	mutex_unlock(&net->packet.sklist_lock);

	preempt_disable();
	sock_prot_inuse_add(net, &packet_proto, 1);
	preempt_enable();

	return 0;
out:
	return err;
}

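/*
 * Illustrative userspace sketch (not part of this file): the three
 * socket types accepted by packet_create() above. A protocol of 0
 * leaves the socket detached from any ethertype until bind() supplies
 * one; all three calls require CAP_NET_RAW, matching the ns_capable()
 * check.
 *
 *	int raw  = socket(AF_PACKET, SOCK_RAW,    htons(ETH_P_ALL));
 *	int dg   = socket(AF_PACKET, SOCK_DGRAM,  htons(ETH_P_IP));
 *	int spkt = socket(AF_PACKET, SOCK_PACKET, htons(ETH_P_ALL));
 *
 * SOCK_RAW delivers and expects the link-layer header in the buffer;
 * SOCK_DGRAM has it stripped/built by the kernel; SOCK_PACKET is the
 * obsolete pre-2.2 interface kept for compatibility.
 */
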
/*
 *	Pull a packet from our receive queue and hand it to the user.
 *	If necessary we block.
 */

static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
			  struct msghdr *msg, size_t len, int flags)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	int copied, err;
	int vnet_hdr_len = 0;

	err = -EINVAL;
	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
		goto out;

#if 0
	/* What error should we return now? EUNATTACH? */
	if (pkt_sk(sk)->ifindex < 0)
		return -ENODEV;
#endif

	if (flags & MSG_ERRQUEUE) {
		err = sock_recv_errqueue(sk, msg, len,
					 SOL_PACKET, PACKET_TX_TIMESTAMP);
		goto out;
	}

	/*
	 *	Call the generic datagram receiver. This handles all sorts
	 *	of horrible races and re-entrancy so we can forget about it
	 *	in the protocol layers.
	 *
	 *	Now it will return ENETDOWN, if device have just gone down,
	 *	but then it will block.
	 */

	skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);

	/*
	 *	An error occurred so return it. Because skb_recv_datagram()
	 *	handles the blocking we don't see and worry about blocking
	 *	retries.
	 */

	if (skb == NULL)
		goto out;

	if (pkt_sk(sk)->has_vnet_hdr) {
		struct virtio_net_hdr vnet_hdr = { 0 };

		err = -EINVAL;
		vnet_hdr_len = sizeof(vnet_hdr);
		if (len < vnet_hdr_len)
			goto out_free;

		len -= vnet_hdr_len;

		if (skb_is_gso(skb)) {
			struct skb_shared_info *sinfo = skb_shinfo(skb);

			/* This is a hint as to how much should be linear. */
			vnet_hdr.hdr_len = skb_headlen(skb);
			vnet_hdr.gso_size = sinfo->gso_size;
			if (sinfo->gso_type & SKB_GSO_TCPV4)
				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
			else if (sinfo->gso_type & SKB_GSO_TCPV6)
				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
			else if (sinfo->gso_type & SKB_GSO_UDP)
				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
			else if (sinfo->gso_type & SKB_GSO_FCOE)
				goto out_free;
			else
				BUG();
			if (sinfo->gso_type & SKB_GSO_TCP_ECN)
				vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
		} else
			vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;

		if (skb->ip_summed == CHECKSUM_PARTIAL) {
			vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
			vnet_hdr.csum_start = skb_checksum_start_offset(skb);
			vnet_hdr.csum_offset = skb->csum_offset;
		} else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
			vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
		} /* else everything is zero */

		err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
				     vnet_hdr_len);
		if (err < 0)
			goto out_free;
	}

	/* You lose any data beyond the buffer you gave. If it worries
	 * a user program they can ask the device for its MTU
	 * anyway.
	 */
	copied = skb->len;
	if (copied > len) {
		copied = len;
		msg->msg_flags |= MSG_TRUNC;
	}

	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
	if (err)
		goto out_free;

	sock_recv_ts_and_drops(msg, sk, skb);

	if (msg->msg_name) {
		/* If the address length field is there to be filled
		 * in, we fill it in now.
		 */
		if (sock->type == SOCK_PACKET) {
			msg->msg_namelen = sizeof(struct sockaddr_pkt);
		} else {
			struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
			msg->msg_namelen = sll->sll_halen +
				offsetof(struct sockaddr_ll, sll_addr);
		}
		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
		       msg->msg_namelen);
	}

	if (pkt_sk(sk)->auxdata) {
		struct tpacket_auxdata aux;

		aux.tp_status = TP_STATUS_USER;
		if (skb->ip_summed == CHECKSUM_PARTIAL)
			aux.tp_status |= TP_STATUS_CSUMNOTREADY;
		aux.tp_len = PACKET_SKB_CB(skb)->origlen;
		aux.tp_snaplen = skb->len;
		aux.tp_mac = 0;
		aux.tp_net = skb_network_offset(skb);
		if (vlan_tx_tag_present(skb)) {
			aux.tp_vlan_tci = vlan_tx_tag_get(skb);
			aux.tp_status |= TP_STATUS_VLAN_VALID;
		} else {
			aux.tp_vlan_tci = 0;
		}
		aux.tp_padding = 0;
		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
	}

	/*
	 *	Free or return the buffer as appropriate. Again this
	 *	hides all the races and re-entrancy issues from us.
	 */
	err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);

out_free:
	skb_free_datagram(sk, skb);
out:
	return err;
}

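/*
 * Illustrative userspace sketch (not part of this file): reading the
 * tpacket_auxdata control message that the PACKET_AUXDATA branch above
 * attaches to each packet. Buffer sizes and the handle_vlan() helper
 * are assumptions for the example.
 *
 *	struct tpacket_auxdata *aux = NULL;
 *	char buf[2048], ctl[CMSG_SPACE(sizeof(*aux))];
 *	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
 *	struct msghdr msg = {
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = ctl, .msg_controllen = sizeof(ctl),
 *	};
 *	struct cmsghdr *cmsg;
 *
 *	if (recvmsg(fd, &msg, 0) < 0)
 *		return;
 *	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
 *		if (cmsg->cmsg_level == SOL_PACKET &&
 *		    cmsg->cmsg_type == PACKET_AUXDATA)
 *			aux = (struct tpacket_auxdata *)CMSG_DATA(cmsg);
 *	if (aux && (aux->tp_status & TP_STATUS_VLAN_VALID))
 *		handle_vlan(aux->tp_vlan_tci);	(hypothetical helper)
 */
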
static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
			       int *uaddr_len, int peer)
{
	struct net_device *dev;
	struct sock *sk	= sock->sk;

	if (peer)
		return -EOPNOTSUPP;

	uaddr->sa_family = AF_PACKET;
	memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
	rcu_read_lock();
	dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
	if (dev)
		strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
	rcu_read_unlock();
	*uaddr_len = sizeof(*uaddr);

	return 0;
}

static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
			  int *uaddr_len, int peer)
{
	struct net_device *dev;
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);

	if (peer)
		return -EOPNOTSUPP;

	sll->sll_family = AF_PACKET;
	sll->sll_ifindex = po->ifindex;
	sll->sll_protocol = po->num;
	sll->sll_pkttype = 0;
	rcu_read_lock();
	dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
	if (dev) {
		sll->sll_hatype = dev->type;
		sll->sll_halen = dev->addr_len;
		memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
	} else {
		sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */
		sll->sll_halen = 0;
	}
	rcu_read_unlock();
	*uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;

	return 0;
}

static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
			 int what)
{
	switch (i->type) {
	case PACKET_MR_MULTICAST:
		if (i->alen != dev->addr_len)
			return -EINVAL;
		if (what > 0)
			return dev_mc_add(dev, i->addr);
		else
			return dev_mc_del(dev, i->addr);
		break;
	case PACKET_MR_PROMISC:
		return dev_set_promiscuity(dev, what);
	case PACKET_MR_ALLMULTI:
		return dev_set_allmulti(dev, what);
	case PACKET_MR_UNICAST:
		if (i->alen != dev->addr_len)
			return -EINVAL;
		if (what > 0)
			return dev_uc_add(dev, i->addr);
		else
			return dev_uc_del(dev, i->addr);
		break;
	default:
		break;
	}
	return 0;
}

static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
{
	for ( ; i; i = i->next) {
		if (i->ifindex == dev->ifindex)
			packet_dev_mc(dev, i, what);
	}
}

static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_mclist *ml, *i;
	struct net_device *dev;
	int err;

	rtnl_lock();

	err = -ENODEV;
	dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
	if (!dev)
		goto done;

	err = -EINVAL;
	if (mreq->mr_alen > dev->addr_len)
		goto done;

	err = -ENOBUFS;
	i = kmalloc(sizeof(*i), GFP_KERNEL);
	if (i == NULL)
		goto done;

	err = 0;
	for (ml = po->mclist; ml; ml = ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
			ml->count++;
			/* Free the new element ... */
			kfree(i);
			goto done;
		}
	}

	i->type = mreq->mr_type;
	i->ifindex = mreq->mr_ifindex;
	i->alen = mreq->mr_alen;
	memcpy(i->addr, mreq->mr_address, i->alen);
	i->count = 1;
	i->next = po->mclist;
	po->mclist = i;
	err = packet_dev_mc(dev, i, 1);
	if (err) {
		po->mclist = i->next;
		kfree(i);
	}

done:
	rtnl_unlock();
	return err;
}

static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
{
	struct packet_mclist *ml, **mlp;

	rtnl_lock();

	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
			if (--ml->count == 0) {
				struct net_device *dev;
				*mlp = ml->next;
				dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
				if (dev)
					packet_dev_mc(dev, ml, -1);
				kfree(ml);
			}
			rtnl_unlock();
			return 0;
		}
	}
	rtnl_unlock();
	return -EADDRNOTAVAIL;
}

static void packet_flush_mclist(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_mclist *ml;

	if (!po->mclist)
		return;

	rtnl_lock();
	while ((ml = po->mclist) != NULL) {
		struct net_device *dev;

		po->mclist = ml->next;
		dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
		if (dev != NULL)
			packet_dev_mc(dev, ml, -1);
		kfree(ml);
	}
	rtnl_unlock();
}

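/*
 * Illustrative userspace sketch (not part of this file): requesting
 * promiscuous mode through the membership API handled by
 * packet_mc_add() above, rather than the legacy SIOCSIFFLAGS dance.
 * Memberships are reference-counted per socket and dropped on close
 * via packet_flush_mclist(); the interface name is an assumption.
 *
 *	struct packet_mreq mreq = {
 *		.mr_ifindex = if_nametoindex("eth0"),
 *		.mr_type    = PACKET_MR_PROMISC,
 *	};
 *
 *	if (setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
 *		       &mreq, sizeof(mreq)) < 0)
 *		perror("PACKET_ADD_MEMBERSHIP");
 */
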
static int
packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	int ret;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	switch (optname) {
	case PACKET_ADD_MEMBERSHIP:
	case PACKET_DROP_MEMBERSHIP:
	{
		struct packet_mreq_max mreq;
		int len = optlen;
		memset(&mreq, 0, sizeof(mreq));
		if (len < sizeof(struct packet_mreq))
			return -EINVAL;
		if (len > sizeof(mreq))
			len = sizeof(mreq);
		if (copy_from_user(&mreq, optval, len))
			return -EFAULT;
		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
			return -EINVAL;
		if (optname == PACKET_ADD_MEMBERSHIP)
			ret = packet_mc_add(sk, &mreq);
		else
			ret = packet_mc_drop(sk, &mreq);
		return ret;
	}
	case PACKET_RX_RING:
	case PACKET_TX_RING:
	{
		union tpacket_req_u req_u;
		int len;

		switch (po->tp_version) {
		case TPACKET_V1:
		case TPACKET_V2:
			len = sizeof(req_u.req);
			break;
		case TPACKET_V3:
		default:
			len = sizeof(req_u.req3);
			break;
		}
		if (optlen < len)
			return -EINVAL;
		if (pkt_sk(sk)->has_vnet_hdr)
			return -EINVAL;
		if (copy_from_user(&req_u.req, optval, len))
			return -EFAULT;
		return packet_set_ring(sk, &req_u, 0,
			optname == PACKET_TX_RING);
	}
	case PACKET_COPY_THRESH:
	{
		int val;
		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		pkt_sk(sk)->copy_thresh = val;
		return 0;
	}
	case PACKET_VERSION:
	{
		int val;
		if (optlen != sizeof(val))
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		switch (val) {
		case TPACKET_V1:
		case TPACKET_V2:
		case TPACKET_V3:
			po->tp_version = val;
			return 0;
		default:
			return -EINVAL;
		}
	}
	case PACKET_RESERVE:
	{
		unsigned int val;
		if (optlen != sizeof(val))
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		po->tp_reserve = val;
		return 0;
	}
	case PACKET_LOSS:
	{
		unsigned int val;
		if (optlen != sizeof(val))
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		po->tp_loss = !!val;
		return 0;
	}
	case PACKET_AUXDATA:
	{
		int val;
		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		po->auxdata = !!val;
		return 0;
	}
	case PACKET_ORIGDEV:
	{
		int val;
		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		po->origdev = !!val;
		return 0;
	}
	case PACKET_VNET_HDR:
	{
		int val;
		if (sock->type != SOCK_RAW)
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		po->has_vnet_hdr = !!val;
		return 0;
	}
	case PACKET_TIMESTAMP:
	{
		int val;
		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		po->tp_tstamp = val;
		return 0;
	}
	case PACKET_FANOUT:
	{
		int val;
		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		return fanout_add(sk, val & 0xffff, val >> 16);
	}
	case PACKET_TX_HAS_OFF:
	{
		unsigned int val;
		if (optlen != sizeof(val))
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		po->tp_tx_has_off = !!val;
		return 0;
	}
	default:
		return -ENOPROTOOPT;
	}
}

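/*
 * Illustrative userspace sketch (not part of this file): PACKET_VERSION
 * must be set while no ring exists (see the -EBUSY checks above), so a
 * typical TPACKET_V3 receive setup orders the calls like this. The ring
 * geometry values are assumptions for the example.
 *
 *	int ver = TPACKET_V3;
 *	struct tpacket_req3 req = {
 *		.tp_block_size	   = 1 << 22,
 *		.tp_block_nr	   = 64,
 *		.tp_frame_size	   = 1 << 11,
 *		.tp_frame_nr	   = ((1 << 22) / (1 << 11)) * 64,
 *		.tp_retire_blk_tov = 60,	(block retire timeout, ms)
 *	};
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 */
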
static int packet_getsockopt(struct socket *sock, int level, int optname,
			     char __user *optval, int __user *optlen)
{
	int len;
	int val, lv = sizeof(val);
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	void *data = &val;
	union tpacket_stats_u st;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;

	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case PACKET_STATISTICS:
		spin_lock_bh(&sk->sk_receive_queue.lock);
		memcpy(&st, &po->stats, sizeof(st));
		memset(&po->stats, 0, sizeof(po->stats));
		spin_unlock_bh(&sk->sk_receive_queue.lock);

		if (po->tp_version == TPACKET_V3) {
			lv = sizeof(struct tpacket_stats_v3);
			st.stats3.tp_packets += st.stats3.tp_drops;
			data = &st.stats3;
		} else {
			lv = sizeof(struct tpacket_stats);
			st.stats1.tp_packets += st.stats1.tp_drops;
			data = &st.stats1;
		}
		break;
	case PACKET_AUXDATA:
		val = po->auxdata;
		break;
	case PACKET_ORIGDEV:
		val = po->origdev;
		break;
	case PACKET_VNET_HDR:
		val = po->has_vnet_hdr;
		break;
	case PACKET_VERSION:
		val = po->tp_version;
		break;
	case PACKET_HDRLEN:
		if (len > sizeof(int))
			len = sizeof(int);
		if (copy_from_user(&val, optval, len))
			return -EFAULT;
		switch (val) {
		case TPACKET_V1:
			val = sizeof(struct tpacket_hdr);
			break;
		case TPACKET_V2:
			val = sizeof(struct tpacket2_hdr);
			break;
		case TPACKET_V3:
			val = sizeof(struct tpacket3_hdr);
			break;
		default:
			return -EINVAL;
		}
		break;
	case PACKET_RESERVE:
		val = po->tp_reserve;
		break;
	case PACKET_LOSS:
		val = po->tp_loss;
		break;
	case PACKET_TIMESTAMP:
		val = po->tp_tstamp;
		break;
	case PACKET_FANOUT:
		val = (po->fanout) ?
		       ((u32)po->fanout->id |
			((u32)po->fanout->type << 16) |
			((u32)po->fanout->flags << 24)) :
		       0;
		break;
	case PACKET_TX_HAS_OFF:
		val = po->tp_tx_has_off;
		break;
	default:
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (put_user(len, optlen))
		return -EFAULT;
	if (copy_to_user(optval, data, len))
		return -EFAULT;
	return 0;
}

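/*
 * Illustrative userspace sketch (not part of this file): draining the
 * drop counters maintained above. The kernel zeroes the statistics on
 * every read, so each call returns deltas since the previous one, and
 * tp_packets already includes tp_drops.
 *
 *	struct tpacket_stats st;
 *	socklen_t len = sizeof(st);
 *
 *	if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
 *		fprintf(stderr, "%u pkts, %u dropped\n",
 *			st.tp_packets, st.tp_drops);
 */
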
static int packet_notifier(struct notifier_block *this,
			   unsigned long msg, void *ptr)
{
	struct sock *sk;
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	rcu_read_lock();
	sk_for_each_rcu(sk, &net->packet.sklist) {
		struct packet_sock *po = pkt_sk(sk);

		switch (msg) {
		case NETDEV_UNREGISTER:
			if (po->mclist)
				packet_dev_mclist(dev, po->mclist, -1);
			/* fallthrough */

		case NETDEV_DOWN:
			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);
				if (po->running) {
					__unregister_prot_hook(sk, false);
					sk->sk_err = ENETDOWN;
					if (!sock_flag(sk, SOCK_DEAD))
						sk->sk_error_report(sk);
				}
				if (msg == NETDEV_UNREGISTER) {
					po->ifindex = -1;
					if (po->prot_hook.dev)
						dev_put(po->prot_hook.dev);
					po->prot_hook.dev = NULL;
				}
				spin_unlock(&po->bind_lock);
			}
			break;
		case NETDEV_UP:
			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);
				if (po->num)
					register_prot_hook(sk);
				spin_unlock(&po->bind_lock);
			}
			break;
		}
	}
	rcu_read_unlock();
	return NOTIFY_DONE;
}

static int packet_ioctl(struct socket *sock, unsigned int cmd,
			unsigned long arg)
{
	struct sock *sk = sock->sk;

	switch (cmd) {
	case SIOCOUTQ:
	{
		int amount = sk_wmem_alloc_get(sk);

		return put_user(amount, (int __user *)arg);
	}
	case SIOCINQ:
	{
		struct sk_buff *skb;
		int amount = 0;

		spin_lock_bh(&sk->sk_receive_queue.lock);
		skb = skb_peek(&sk->sk_receive_queue);
		if (skb)
			amount = skb->len;
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		return put_user(amount, (int __user *)arg);
	}
	case SIOCGSTAMP:
		return sock_get_timestamp(sk, (struct timeval __user *)arg);
	case SIOCGSTAMPNS:
		return sock_get_timestampns(sk, (struct timespec __user *)arg);

#ifdef CONFIG_INET
	case SIOCADDRT:
	case SIOCDELRT:
	case SIOCDARP:
	case SIOCGARP:
	case SIOCSARP:
	case SIOCGIFADDR:
	case SIOCSIFADDR:
	case SIOCGIFBRDADDR:
	case SIOCSIFBRDADDR:
	case SIOCGIFNETMASK:
	case SIOCSIFNETMASK:
	case SIOCGIFDSTADDR:
	case SIOCSIFDSTADDR:
	case SIOCSIFFLAGS:
		return inet_dgram_ops.ioctl(sock, cmd, arg);
#endif

	default:
		return -ENOIOCTLCMD;
	}
	return 0;
}

static unsigned int packet_poll(struct file *file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned int mask = datagram_poll(file, sock, wait);

	spin_lock_bh(&sk->sk_receive_queue.lock);
	if (po->rx_ring.pg_vec) {
		if (!packet_previous_rx_frame(po, &po->rx_ring,
			TP_STATUS_KERNEL))
			mask |= POLLIN | POLLRDNORM;
	}
	spin_unlock_bh(&sk->sk_receive_queue.lock);
	spin_lock_bh(&sk->sk_write_queue.lock);
	if (po->tx_ring.pg_vec) {
		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
			mask |= POLLOUT | POLLWRNORM;
	}
	spin_unlock_bh(&sk->sk_write_queue.lock);
	return mask;
}

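/*
 * Illustrative userspace sketch (not part of this file): the rx-ring
 * consumer loop that packet_poll() above serves. V1/V2-style frame
 * status is shown; next_frame() and consume() are hypothetical helpers
 * standing in for the ring-walking bookkeeping.
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLERR };
 *	volatile struct tpacket_hdr *hdr = next_frame();
 *
 *	while (!(hdr->tp_status & TP_STATUS_USER))
 *		poll(&pfd, 1, -1);		(sleep until kernel fills it)
 *	consume(hdr);
 *	hdr->tp_status = TP_STATUS_KERNEL;	(hand the frame back)
 */
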
/* Dirty? Well, I still did not learn better way to account
 * for user mmaps.
 */

static void packet_mm_open(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_inc(&pkt_sk(sk)->mapped);
}

static void packet_mm_close(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_dec(&pkt_sk(sk)->mapped);
}

static const struct vm_operations_struct packet_mmap_ops = {
	.open	=	packet_mm_open,
	.close	=	packet_mm_close,
};

static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
			unsigned int len)
{
	int i;

	for (i = 0; i < len; i++) {
		if (likely(pg_vec[i].buffer)) {
			if (is_vmalloc_addr(pg_vec[i].buffer))
				vfree(pg_vec[i].buffer);
			else
				free_pages((unsigned long)pg_vec[i].buffer,
					   order);
			pg_vec[i].buffer = NULL;
		}
	}
	kfree(pg_vec);
}

static char *alloc_one_pg_vec_page(unsigned long order)
{
	char *buffer = NULL;
	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
			  __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;

	buffer = (char *) __get_free_pages(gfp_flags, order);
	if (buffer)
		return buffer;

	/* __get_free_pages failed, fall back to vmalloc */
	buffer = vzalloc((1 << order) * PAGE_SIZE);
	if (buffer)
		return buffer;

	/* vmalloc failed, lets dig into swap here */
	gfp_flags &= ~__GFP_NORETRY;
	buffer = (char *)__get_free_pages(gfp_flags, order);
	if (buffer)
		return buffer;

	/* complete and utter failure */
	return NULL;
}

static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
{
	unsigned int block_nr = req->tp_block_nr;
	struct pgv *pg_vec;
	int i;

	pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
	if (unlikely(!pg_vec))
		goto out;

	for (i = 0; i < block_nr; i++) {
		pg_vec[i].buffer = alloc_one_pg_vec_page(order);
		if (unlikely(!pg_vec[i].buffer))
			goto out_free_pgvec;
	}

out:
	return pg_vec;

out_free_pgvec:
	free_pg_vec(pg_vec, order, block_nr);
	pg_vec = NULL;
	goto out;
}

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring)
{
	struct pgv *pg_vec = NULL;
	struct packet_sock *po = pkt_sk(sk);
	int was_running, order = 0;
	struct packet_ring_buffer *rb;
	struct sk_buff_head *rb_queue;
	__be16 num;
	int err = -EINVAL;
	/* Added to avoid minimal code churn */
	struct tpacket_req *req = &req_u->req;

	/* Opening a Tx-ring is NOT supported in TPACKET_V3 */
	if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
		WARN(1, "Tx-ring is not supported.\n");
		goto out;
	}

	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;

	err = -EBUSY;
	if (!closing) {
		if (atomic_read(&po->mapped))
			goto out;
		if (atomic_read(&rb->pending))
			goto out;
	}

	if (req->tp_block_nr) {
		/* Sanity tests and some calculations */
		err = -EBUSY;
		if (unlikely(rb->pg_vec))
			goto out;

		switch (po->tp_version) {
		case TPACKET_V1:
			po->tp_hdrlen = TPACKET_HDRLEN;
			break;
		case TPACKET_V2:
			po->tp_hdrlen = TPACKET2_HDRLEN;
			break;
		case TPACKET_V3:
			po->tp_hdrlen = TPACKET3_HDRLEN;
			break;
		}

		err = -EINVAL;
		if (unlikely((int)req->tp_block_size <= 0))
			goto out;
		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
			goto out;
		if (unlikely(req->tp_frame_size < po->tp_hdrlen +
					po->tp_reserve))
			goto out;
		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
			goto out;

		rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
		if (unlikely(rb->frames_per_block <= 0))
			goto out;
		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
					req->tp_frame_nr))
			goto out;

		err = -ENOMEM;
		order = get_order(req->tp_block_size);
		pg_vec = alloc_pg_vec(req, order);
		if (unlikely(!pg_vec))
			goto out;
		switch (po->tp_version) {
		case TPACKET_V3:
		/* Transmit path is not supported. We checked
		 * it above but just being paranoid
		 */
			if (!tx_ring)
				init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
			break;
		default:
			break;
		}
	}
	/* Done */
	else {
		err = -EINVAL;
		if (unlikely(req->tp_frame_nr))
			goto out;
	}

	lock_sock(sk);

	/* Detach socket from network */
	spin_lock(&po->bind_lock);
	was_running = po->running;
	num = po->num;
	if (was_running) {
		po->num = 0;
		__unregister_prot_hook(sk, false);
	}
	spin_unlock(&po->bind_lock);

	synchronize_net();

	err = -EBUSY;
	mutex_lock(&po->pg_vec_lock);
	if (closing || atomic_read(&po->mapped) == 0) {
		err = 0;
		spin_lock_bh(&rb_queue->lock);
		swap(rb->pg_vec, pg_vec);
		rb->frame_max = (req->tp_frame_nr - 1);
		rb->head = 0;
		rb->frame_size = req->tp_frame_size;
		spin_unlock_bh(&rb_queue->lock);

		swap(rb->pg_vec_order, order);
		swap(rb->pg_vec_len, req->tp_block_nr);

		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
		po->prot_hook.func = (po->rx_ring.pg_vec) ?
						tpacket_rcv : packet_rcv;
		skb_queue_purge(rb_queue);
		if (atomic_read(&po->mapped))
			pr_err("packet_mmap: vma is busy: %d\n",
			       atomic_read(&po->mapped));
	}
	mutex_unlock(&po->pg_vec_lock);

	spin_lock(&po->bind_lock);
	if (was_running) {
		po->num = num;
		register_prot_hook(sk);
	}
	spin_unlock(&po->bind_lock);
	if (closing && (po->tp_version > TPACKET_V2)) {
		/* Because we don't support block-based V3 on tx-ring */
		if (!tx_ring)
			prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
	}
	release_sock(sk);

	if (pg_vec)
		free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
	return err;
}

static int packet_mmap(struct file *file, struct socket *sock,
		struct vm_area_struct *vma)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned long size, expected_size;
	struct packet_ring_buffer *rb;
	unsigned long start;
	int err = -EINVAL;
	int i;

	if (vma->vm_pgoff)
		return -EINVAL;

	mutex_lock(&po->pg_vec_lock);

	expected_size = 0;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec) {
			expected_size += rb->pg_vec_len
						* rb->pg_vec_pages
						* PAGE_SIZE;
		}
	}

	if (expected_size == 0)
		goto out;

	size = vma->vm_end - vma->vm_start;
	if (size != expected_size)
		goto out;

	start = vma->vm_start;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec == NULL)
			continue;

		for (i = 0; i < rb->pg_vec_len; i++) {
			struct page *page;
			void *kaddr = rb->pg_vec[i].buffer;
			int pg_num;

			for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
				page = pgv_to_page(kaddr);
				err = vm_insert_page(vma, start, page);
				if (unlikely(err))
					goto out;
				start += PAGE_SIZE;
				kaddr += PAGE_SIZE;
			}
		}
	}

	atomic_inc(&po->mapped);
	vma->vm_ops = &packet_mmap_ops;
	err = 0;

out:
	mutex_unlock(&po->pg_vec_lock);
	return err;
}

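/*
 * Illustrative userspace sketch (not part of this file): mapping the
 * ring(s) set up by packet_set_ring(). packet_mmap() above insists the
 * requested length equal the combined rx+tx ring size exactly, and the
 * rx ring always precedes the tx ring in the mapping. "req" is the
 * tpacket_req(3) used for the setsockopt call.
 *
 *	size_t ring_sz = (size_t)req.tp_block_size * req.tp_block_nr;
 *	void *ring = mmap(NULL, ring_sz, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED, fd, 0);
 *
 *	if (ring == MAP_FAILED)
 *		perror("mmap");
 */
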
static const struct proto_ops packet_ops_spkt = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind_spkt,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname_spkt,
	.poll =		datagram_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	packet_sendmsg_spkt,
	.recvmsg =	packet_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};

static const struct proto_ops packet_ops = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname,
	.poll =		packet_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	packet_setsockopt,
	.getsockopt =	packet_getsockopt,
	.sendmsg =	packet_sendmsg,
	.recvmsg =	packet_recvmsg,
	.mmap =		packet_mmap,
	.sendpage =	sock_no_sendpage,
};

static const struct net_proto_family packet_family_ops = {
	.family =	PF_PACKET,
	.create =	packet_create,
	.owner	=	THIS_MODULE,
};

static struct notifier_block packet_netdev_notifier = {
	.notifier_call =	packet_notifier,
};

#ifdef CONFIG_PROC_FS

static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct net *net = seq_file_net(seq);

	rcu_read_lock();
	return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
}

static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct net *net = seq_file_net(seq);
	return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
}

static void packet_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

static int packet_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
	else {
		struct sock *s = sk_entry(v);
		const struct packet_sock *po = pkt_sk(s);

		seq_printf(seq,
			   "%pK %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
			   s,
			   atomic_read(&s->sk_refcnt),
			   s->sk_type,
			   ntohs(po->num),
			   po->ifindex,
			   po->running,
			   atomic_read(&s->sk_rmem_alloc),
			   from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
			   sock_i_ino(s));
	}

	return 0;
}

static const struct seq_operations packet_seq_ops = {
	.start	= packet_seq_start,
	.next	= packet_seq_next,
	.stop	= packet_seq_stop,
	.show	= packet_seq_show,
};

static int packet_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &packet_seq_ops,
			    sizeof(struct seq_net_private));
}

static const struct file_operations packet_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= packet_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
#endif

static int __net_init packet_net_init(struct net *net)
{
	mutex_init(&net->packet.sklist_lock);
	INIT_HLIST_HEAD(&net->packet.sklist);

	if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
		return -ENOMEM;

	return 0;
}

static void __net_exit packet_net_exit(struct net *net)
{
	remove_proc_entry("packet", net->proc_net);
}

static struct pernet_operations packet_net_ops = {
	.init = packet_net_init,
	.exit = packet_net_exit,
};

static void __exit packet_exit(void)
{
	unregister_netdevice_notifier(&packet_netdev_notifier);
	unregister_pernet_subsys(&packet_net_ops);
	sock_unregister(PF_PACKET);
	proto_unregister(&packet_proto);
}

static int __init packet_init(void)
{
	int rc = proto_register(&packet_proto, 0);

	if (rc != 0)
		goto out;

	sock_register(&packet_family_ops);
	register_pernet_subsys(&packet_net_ops);
	register_netdevice_notifier(&packet_netdev_notifier);
out:
	return rc;
}

module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);