2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * PACKET - implements raw packet sockets.
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Alan Cox : verify_area() now used correctly
14 * Alan Cox : new skbuff lists, look ma no backlogs!
15 * Alan Cox : tidied skbuff lists.
16 * Alan Cox : Now uses generic datagram routines I
17 * added. Also fixed the peek/read crash
18 * from all old Linux datagram code.
19 * Alan Cox : Uses the improved datagram code.
20 * Alan Cox : Added NULL's for socket options.
21 * Alan Cox : Re-commented the code.
22 * Alan Cox : Use new kernel side addressing
23 * Rob Janssen : Correct MTU usage.
24 * Dave Platt : Counter leaks caused by incorrect
25 * interrupt locking and some slightly
26 * dubious gcc output. Can you read
27 * compiler: it said _VOLATILE_
28 * Richard Kooijman : Timestamp fixes.
29 * Alan Cox : New buffers. Use sk->mac.raw.
30 * Alan Cox : sendmsg/recvmsg support.
31 * Alan Cox : Protocol setting support
32 * Alexey Kuznetsov : Untied from IPv4 stack.
33 * Cyrus Durgin : Fixed kerneld for kmod.
34 * Michal Ostrowski : Module initialization cleanup.
35 * Ulises Alonso : Frame number limit removal and
36 * packet_set_ring memory leak.
37 * Eric Biederman : Allow for > 8 byte hardware addresses.
38 * The convention is that longer addresses
39 * will simply extend the hardware address
40 * byte arrays at the end of sockaddr_ll
42 * Johann Baudy : Added TX RING.
44 * This program is free software; you can redistribute it and/or
45 * modify it under the terms of the GNU General Public License
46 * as published by the Free Software Foundation; either version
47 * 2 of the License, or (at your option) any later version.
51 #include <linux/types.h>
53 #include <linux/capability.h>
54 #include <linux/fcntl.h>
55 #include <linux/socket.h>
57 #include <linux/inet.h>
58 #include <linux/netdevice.h>
59 #include <linux/if_packet.h>
60 #include <linux/wireless.h>
61 #include <linux/kernel.h>
62 #include <linux/kmod.h>
63 #include <linux/slab.h>
64 #include <linux/vmalloc.h>
65 #include <net/net_namespace.h>
67 #include <net/protocol.h>
68 #include <linux/skbuff.h>
70 #include <linux/errno.h>
71 #include <linux/timer.h>
72 #include <asm/system.h>
73 #include <asm/uaccess.h>
74 #include <asm/ioctls.h>
76 #include <asm/cacheflush.h>
78 #include <linux/proc_fs.h>
79 #include <linux/seq_file.h>
80 #include <linux/poll.h>
81 #include <linux/module.h>
82 #include <linux/init.h>
83 #include <linux/mutex.h>
84 #include <linux/if_vlan.h>
85 #include <linux/virtio_net.h>
86 #include <linux/errqueue.h>
87 #include <linux/net_tstamp.h>
90 #include <net/inet_common.h>
95 - if device has no dev->hard_header routine, it adds and removes ll header
96 inside itself. In this case ll header is invisible outside of device,
97 but higher levels still should reserve dev->hard_header_len.
98 Some devices are clever enough to reallocate the skb when the header
99 will not fit into the reserved space (tunnel); other ones are silly
101 - packet socket receives packets with pulled ll header,
102 so that SOCK_RAW should push it back.
107 Incoming, dev->hard_header!=NULL
108 mac_header -> ll header
111 Outgoing, dev->hard_header!=NULL
112 mac_header -> ll header
115 Incoming, dev->hard_header==NULL
116 mac_header -> UNKNOWN position. It is very likely, that it points to ll
117 header. PPP makes it, that is wrong, because introduce
118 asymmetry between rx and tx paths.
121 Outgoing, dev->hard_header==NULL
122 mac_header -> data. ll header is still not built!
126 If dev->hard_header==NULL we are unlikely to restore sensible ll header.
132 dev->hard_header != NULL
133 mac_header -> ll header
136 dev->hard_header == NULL (ll header is added by device, we cannot control it)
140 We should set nh.raw on output to the correct position,
141 packet classifier depends on it.
144 /* Private packet socket structures. */
146 struct packet_mclist {
147 struct packet_mclist *next;
152 unsigned char addr[MAX_ADDR_LEN];
154 /* identical to struct packet_mreq except it has
155 * a longer address field.
157 struct packet_mreq_max {
159 unsigned short mr_type;
160 unsigned short mr_alen;
161 unsigned char mr_address[MAX_ADDR_LEN];
164 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
165 int closing, int tx_ring);
167 #define PGV_FROM_VMALLOC 1
172 struct packet_ring_buffer {
175 unsigned int frames_per_block;
176 unsigned int frame_size;
177 unsigned int frame_max;
179 unsigned int pg_vec_order;
180 unsigned int pg_vec_pages;
181 unsigned int pg_vec_len;
187 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
189 static void packet_flush_mclist(struct sock *sk);
192 /* struct sock has to be the first member of packet_sock */
194 struct tpacket_stats stats;
195 struct packet_ring_buffer rx_ring;
196 struct packet_ring_buffer tx_ring;
198 spinlock_t bind_lock;
199 struct mutex pg_vec_lock;
200 unsigned int running:1, /* prot_hook is attached*/
204 int ifindex; /* bound device */
206 struct packet_mclist *mclist;
208 enum tpacket_versions tp_version;
209 unsigned int tp_hdrlen;
210 unsigned int tp_reserve;
211 unsigned int tp_loss:1;
212 unsigned int tp_tstamp;
213 struct packet_type prot_hook ____cacheline_aligned_in_smp;
216 struct packet_skb_cb {
217 unsigned int origlen;
219 struct sockaddr_pkt pkt;
220 struct sockaddr_ll ll;
224 #define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
/* Translate an address inside the pg_vec ring buffer to its struct page.
 * Ring memory may be vmalloc'ed (PGV_FROM_VMALLOC) or come from the
 * direct map, so the matching lookup helper is chosen at run time.
 * NOTE(review): the function-body braces are elided in this listing.
 */
226 static inline __pure struct page *pgv_to_page(void *addr)
228 if (is_vmalloc_addr(addr))
229 return vmalloc_to_page(addr);
230 return virt_to_page(addr);
/* Publish a frame's status word in the mmap'ed ring buffer, then flush
 * the containing page so a user-space reader polling the ring observes
 * the update.  Dispatches on the negotiated TPACKET header version.
 * NOTE(review): the header union declaration, assignment to it, and the
 * switch case labels / closing braces are elided in this listing.
 */
233 static void __packet_set_status(struct packet_sock *po, void *frame, int status)
236 struct tpacket_hdr *h1;
237 struct tpacket2_hdr *h2;
242 switch (po->tp_version) {
244 h.h1->tp_status = status;
245 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
248 h.h2->tp_status = status;
249 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
252 pr_err("TPACKET version not supported\n");
/* Read back a frame's status word from the mmap'ed ring.  The dcache
 * flush before the read mirrors __packet_set_status() so the kernel
 * sees any status change user space made through its mapping.
 * NOTE(review): union declaration and case labels elided, as above.
 */
259 static int __packet_get_status(struct packet_sock *po, void *frame)
262 struct tpacket_hdr *h1;
263 struct tpacket2_hdr *h2;
270 switch (po->tp_version) {
272 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
273 return h.h1->tp_status;
275 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
276 return h.h2->tp_status;
278 pr_err("TPACKET version not supported\n");
/* Locate frame number 'position' inside the ring: frames are packed
 * frames_per_block to a block, so split the index into a pg_vec block
 * number plus a byte offset within that block.  The frame is only
 * handed back if its status word matches the caller's expectation
 * (the non-matching return path is elided in this listing).
 */
284 static void *packet_lookup_frame(struct packet_sock *po,
285 struct packet_ring_buffer *rb,
286 unsigned int position,
289 unsigned int pg_vec_pos, frame_offset;
291 struct tpacket_hdr *h1;
292 struct tpacket2_hdr *h2;
/* Which block, and which frame slot within it. */
296 pg_vec_pos = position / rb->frames_per_block;
297 frame_offset = position % rb->frames_per_block;
299 h.raw = rb->pg_vec[pg_vec_pos].buffer +
300 (frame_offset * rb->frame_size);
302 if (status != __packet_get_status(po, h.raw))
/* Convenience wrapper: look up the frame at the ring's current head,
 * returning it only when its status matches (see packet_lookup_frame).
 */
308 static inline void *packet_current_frame(struct packet_sock *po,
309 struct packet_ring_buffer *rb,
312 return packet_lookup_frame(po, rb, rb->head, status);
/* Look up the frame just behind the ring head, wrapping from slot 0
 * back to frame_max (the last valid frame index).
 */
315 static inline void *packet_previous_frame(struct packet_sock *po,
316 struct packet_ring_buffer *rb,
/* head == 0 wraps to the last frame. */
319 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
320 return packet_lookup_frame(po, rb, previous, status);
/* Advance the ring head by one frame, wrapping to 0 past frame_max. */
323 static inline void packet_increment_head(struct packet_ring_buffer *buff)
325 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
/* Downcast a generic sock to its containing packet_sock; valid because
 * struct sock is the first member of struct packet_sock (see the
 * comment above that structure's definition).
 */
328 static inline struct packet_sock *pkt_sk(struct sock *sk)
330 return (struct packet_sock *)sk;
/* sk->sk_destruct hook: runs when the socket's refcount drops to zero.
 * Purges the error queue and sanity-checks that no rx/tx buffer memory
 * is still accounted to the socket, warning (and, per the pr_err path,
 * refusing to tear down) a socket that is not yet marked dead.
 */
333 static void packet_sock_destruct(struct sock *sk)
335 skb_queue_purge(&sk->sk_error_queue);
337 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
338 WARN_ON(atomic_read(&sk->sk_wmem_alloc));
340 if (!sock_flag(sk, SOCK_DEAD)) {
341 pr_err("Attempt to release alive packet socket: %p\n", sk);
345 sk_refcnt_debug_dec(sk);
349 static const struct proto_ops packet_ops;
351 static const struct proto_ops packet_ops_spkt;
/* Receive handler for SOCK_PACKET sockets.  Retrieves the owning socket
 * from pt->af_packet_priv, drops loopback and cross-netns traffic,
 * unshares the skb, restores the link-layer header by pushing skb->data
 * back to the mac header, fills in the sockaddr_pkt stored in skb->cb,
 * and queues the skb on the socket's receive queue (which charges the
 * memory to the socket).  Error/drop paths are elided in this listing.
 */
353 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
354 struct packet_type *pt, struct net_device *orig_dev)
357 struct sockaddr_pkt *spkt;
360 * When we registered the protocol we saved the socket in the data
361 * field for just this event.
364 sk = pt->af_packet_priv;
367 * Yank back the headers [hope the device set this
368 * right or kerboom...]
370 * Incoming packets have ll header pulled,
373 * For outgoing ones skb->data == skb_mac_header(skb)
374 * so that this procedure is noop.
377 if (skb->pkt_type == PACKET_LOOPBACK)
380 if (!net_eq(dev_net(dev), sock_net(sk)))
383 skb = skb_share_check(skb, GFP_ATOMIC);
387 /* drop any routing info */
390 /* drop conntrack reference */
393 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
395 skb_push(skb, skb->data - skb_mac_header(skb));
398 * The SOCK_PACKET socket receives _all_ frames.
401 spkt->spkt_family = dev->type;
402 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
403 spkt->spkt_protocol = skb->protocol;
406 * Charge the memory to the socket. This is done specifically
407 * to prevent sockets using all the memory up.
410 if (sock_queue_rcv_skb(sk, skb) == 0)
421 * Output a raw packet to a device layer. This bypasses all the other
422 * protocol layers and you must therefore supply it with a complete frame
425 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
426 struct msghdr *msg, size_t len)
428 struct sock *sk = sock->sk;
429 struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
430 struct sk_buff *skb = NULL;
431 struct net_device *dev;
436 * Get and verify the address.
440 if (msg->msg_namelen < sizeof(struct sockaddr))
442 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
443 proto = saddr->spkt_protocol;
445 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
448 * Find the device first to size check it
451 saddr->spkt_device[13] = 0;
454 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
460 if (!(dev->flags & IFF_UP))
464 * You may not queue a frame bigger than the mtu. This is the lowest level
465 * raw protocol and you must do your own fragmentation at this level.
469 if (len > dev->mtu + dev->hard_header_len)
473 size_t reserved = LL_RESERVED_SPACE(dev);
474 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
477 skb = sock_wmalloc(sk, len + reserved, 0, GFP_KERNEL);
480 /* FIXME: Save some space for broken drivers that write a hard
481 * header at transmission time by themselves. PPP is the notable
482 * one here. This should really be fixed at the driver level.
484 skb_reserve(skb, reserved);
485 skb_reset_network_header(skb);
487 /* Try to align data part correctly */
492 skb_reset_network_header(skb);
494 err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
501 skb->protocol = proto;
503 skb->priority = sk->sk_priority;
504 skb->mark = sk->sk_mark;
505 err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
/* Run the socket's attached BPF filter (if any) against the skb under
 * rcu_read_lock_bh (the lock acquisition is elided in this listing;
 * only the matching unlock is visible).  The result is the snap length
 * the filter allows; a non-filtered socket keeps the caller's default.
 */
520 static inline unsigned int run_filter(const struct sk_buff *skb,
521 const struct sock *sk,
524 struct sk_filter *filter;
527 filter = rcu_dereference_bh(sk->sk_filter);
529 res = sk_run_filter(skb, filter->insns);
530 rcu_read_unlock_bh();
536 * This function makes lazy skb cloning in hope that most of packets
537 * are discarded by BPF.
539 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
540 * and skb->cb are mangled. It works because (and until) packets
541 * falling here are owned by current CPU. Output packets are cloned
542 * by dev_queue_xmit_nit(), input packets are processed by net_bh
543 sequentially, so that if we return skb to original state on exit,
544 * we will not harm anyone.
547 static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
548 struct packet_type *pt, struct net_device *orig_dev)
551 struct sockaddr_ll *sll;
552 struct packet_sock *po;
553 u8 *skb_head = skb->data;
554 int skb_len = skb->len;
555 unsigned int snaplen, res;
557 if (skb->pkt_type == PACKET_LOOPBACK)
560 sk = pt->af_packet_priv;
563 if (!net_eq(dev_net(dev), sock_net(sk)))
568 if (dev->header_ops) {
569 /* The device has an explicit notion of ll header,
570 * exported to higher levels.
572 * Otherwise, the device hides details of its frame
573 * structure, so that corresponding packet head is
574 * never delivered to user.
576 if (sk->sk_type != SOCK_DGRAM)
577 skb_push(skb, skb->data - skb_mac_header(skb));
578 else if (skb->pkt_type == PACKET_OUTGOING) {
579 /* Special case: outgoing packets have ll header at head */
580 skb_pull(skb, skb_network_offset(skb));
586 res = run_filter(skb, sk, snaplen);
592 if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
593 (unsigned)sk->sk_rcvbuf)
596 if (skb_shared(skb)) {
597 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
601 if (skb_head != skb->data) {
602 skb->data = skb_head;
609 BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
612 sll = &PACKET_SKB_CB(skb)->sa.ll;
613 sll->sll_family = AF_PACKET;
614 sll->sll_hatype = dev->type;
615 sll->sll_protocol = skb->protocol;
616 sll->sll_pkttype = skb->pkt_type;
617 if (unlikely(po->origdev))
618 sll->sll_ifindex = orig_dev->ifindex;
620 sll->sll_ifindex = dev->ifindex;
622 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
624 PACKET_SKB_CB(skb)->origlen = skb->len;
626 if (pskb_trim(skb, snaplen))
629 skb_set_owner_r(skb, sk);
633 /* drop conntrack reference */
636 spin_lock(&sk->sk_receive_queue.lock);
637 po->stats.tp_packets++;
638 skb->dropcount = atomic_read(&sk->sk_drops);
639 __skb_queue_tail(&sk->sk_receive_queue, skb);
640 spin_unlock(&sk->sk_receive_queue.lock);
641 sk->sk_data_ready(sk, skb->len);
645 po->stats.tp_drops = atomic_inc_return(&sk->sk_drops);
648 if (skb_head != skb->data && skb_shared(skb)) {
649 skb->data = skb_head;
657 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
658 struct packet_type *pt, struct net_device *orig_dev)
661 struct packet_sock *po;
662 struct sockaddr_ll *sll;
664 struct tpacket_hdr *h1;
665 struct tpacket2_hdr *h2;
668 u8 *skb_head = skb->data;
669 int skb_len = skb->len;
670 unsigned int snaplen, res;
671 unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
672 unsigned short macoff, netoff, hdrlen;
673 struct sk_buff *copy_skb = NULL;
676 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
678 if (skb->pkt_type == PACKET_LOOPBACK)
681 sk = pt->af_packet_priv;
684 if (!net_eq(dev_net(dev), sock_net(sk)))
687 if (dev->header_ops) {
688 if (sk->sk_type != SOCK_DGRAM)
689 skb_push(skb, skb->data - skb_mac_header(skb));
690 else if (skb->pkt_type == PACKET_OUTGOING) {
691 /* Special case: outgoing packets have ll header at head */
692 skb_pull(skb, skb_network_offset(skb));
696 if (skb->ip_summed == CHECKSUM_PARTIAL)
697 status |= TP_STATUS_CSUMNOTREADY;
701 res = run_filter(skb, sk, snaplen);
707 if (sk->sk_type == SOCK_DGRAM) {
708 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
711 unsigned maclen = skb_network_offset(skb);
712 netoff = TPACKET_ALIGN(po->tp_hdrlen +
713 (maclen < 16 ? 16 : maclen)) +
715 macoff = netoff - maclen;
718 if (macoff + snaplen > po->rx_ring.frame_size) {
719 if (po->copy_thresh &&
720 atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
721 (unsigned)sk->sk_rcvbuf) {
722 if (skb_shared(skb)) {
723 copy_skb = skb_clone(skb, GFP_ATOMIC);
725 copy_skb = skb_get(skb);
726 skb_head = skb->data;
729 skb_set_owner_r(copy_skb, sk);
731 snaplen = po->rx_ring.frame_size - macoff;
732 if ((int)snaplen < 0)
736 spin_lock(&sk->sk_receive_queue.lock);
737 h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
740 packet_increment_head(&po->rx_ring);
741 po->stats.tp_packets++;
743 status |= TP_STATUS_COPY;
744 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
746 if (!po->stats.tp_drops)
747 status &= ~TP_STATUS_LOSING;
748 spin_unlock(&sk->sk_receive_queue.lock);
750 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
752 switch (po->tp_version) {
754 h.h1->tp_len = skb->len;
755 h.h1->tp_snaplen = snaplen;
756 h.h1->tp_mac = macoff;
757 h.h1->tp_net = netoff;
758 if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
759 && shhwtstamps->syststamp.tv64)
760 tv = ktime_to_timeval(shhwtstamps->syststamp);
761 else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
762 && shhwtstamps->hwtstamp.tv64)
763 tv = ktime_to_timeval(shhwtstamps->hwtstamp);
764 else if (skb->tstamp.tv64)
765 tv = ktime_to_timeval(skb->tstamp);
767 do_gettimeofday(&tv);
768 h.h1->tp_sec = tv.tv_sec;
769 h.h1->tp_usec = tv.tv_usec;
770 hdrlen = sizeof(*h.h1);
773 h.h2->tp_len = skb->len;
774 h.h2->tp_snaplen = snaplen;
775 h.h2->tp_mac = macoff;
776 h.h2->tp_net = netoff;
777 if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
778 && shhwtstamps->syststamp.tv64)
779 ts = ktime_to_timespec(shhwtstamps->syststamp);
780 else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
781 && shhwtstamps->hwtstamp.tv64)
782 ts = ktime_to_timespec(shhwtstamps->hwtstamp);
783 else if (skb->tstamp.tv64)
784 ts = ktime_to_timespec(skb->tstamp);
787 h.h2->tp_sec = ts.tv_sec;
788 h.h2->tp_nsec = ts.tv_nsec;
789 h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
790 hdrlen = sizeof(*h.h2);
796 sll = h.raw + TPACKET_ALIGN(hdrlen);
797 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
798 sll->sll_family = AF_PACKET;
799 sll->sll_hatype = dev->type;
800 sll->sll_protocol = skb->protocol;
801 sll->sll_pkttype = skb->pkt_type;
802 if (unlikely(po->origdev))
803 sll->sll_ifindex = orig_dev->ifindex;
805 sll->sll_ifindex = dev->ifindex;
807 __packet_set_status(po, h.raw, status);
809 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
813 end = (u8 *)PAGE_ALIGN((unsigned long)h.raw + macoff + snaplen);
814 for (start = h.raw; start < end; start += PAGE_SIZE)
815 flush_dcache_page(pgv_to_page(start));
819 sk->sk_data_ready(sk, 0);
822 if (skb_head != skb->data && skb_shared(skb)) {
823 skb->data = skb_head;
831 po->stats.tp_drops++;
832 spin_unlock(&sk->sk_receive_queue.lock);
834 sk->sk_data_ready(sk, 0);
/* Destructor for skbs sent from the TX ring.  When the ring still
 * exists, marks the originating frame AVAILABLE again so user space may
 * reuse it, and drops the ring's pending-transmission count.  The
 * BUG_ONs assert the frame was in SENDING state with a nonzero pending
 * count — anything else means ring bookkeeping was corrupted.
 */
839 static void tpacket_destruct_skb(struct sk_buff *skb)
841 struct packet_sock *po = pkt_sk(skb->sk);
846 if (likely(po->tx_ring.pg_vec)) {
/* destructor_arg was set to the ring frame in tpacket_fill_skb(). */
847 ph = skb_shinfo(skb)->destructor_arg;
848 BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
849 BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
850 atomic_dec(&po->tx_ring.pending);
851 __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
857 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
858 void *frame, struct net_device *dev, int size_max,
859 __be16 proto, unsigned char *addr)
862 struct tpacket_hdr *h1;
863 struct tpacket2_hdr *h2;
866 int to_write, offset, len, tp_len, nr_frags, len_max;
867 struct socket *sock = po->sk.sk_socket;
874 skb->protocol = proto;
876 skb->priority = po->sk.sk_priority;
877 skb->mark = po->sk.sk_mark;
878 skb_shinfo(skb)->destructor_arg = ph.raw;
880 switch (po->tp_version) {
882 tp_len = ph.h2->tp_len;
885 tp_len = ph.h1->tp_len;
888 if (unlikely(tp_len > size_max)) {
889 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
893 skb_reserve(skb, LL_RESERVED_SPACE(dev));
894 skb_reset_network_header(skb);
896 data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
899 if (sock->type == SOCK_DGRAM) {
900 err = dev_hard_header(skb, dev, ntohs(proto), addr,
902 if (unlikely(err < 0))
904 } else if (dev->hard_header_len) {
905 /* net device doesn't like empty head */
906 if (unlikely(tp_len <= dev->hard_header_len)) {
907 pr_err("packet size is too short (%d < %d)\n",
908 tp_len, dev->hard_header_len);
912 skb_push(skb, dev->hard_header_len);
913 err = skb_store_bits(skb, 0, data,
914 dev->hard_header_len);
918 data += dev->hard_header_len;
919 to_write -= dev->hard_header_len;
923 offset = offset_in_page(data);
924 len_max = PAGE_SIZE - offset;
925 len = ((to_write > len_max) ? len_max : to_write);
927 skb->data_len = to_write;
928 skb->len += to_write;
929 skb->truesize += to_write;
930 atomic_add(to_write, &po->sk.sk_wmem_alloc);
932 while (likely(to_write)) {
933 nr_frags = skb_shinfo(skb)->nr_frags;
935 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
936 pr_err("Packet exceed the number of skb frags(%lu)\n",
941 page = pgv_to_page(data);
943 flush_dcache_page(page);
945 skb_fill_page_desc(skb, nr_frags, page, offset, len);
949 len = ((to_write > len_max) ? len_max : to_write);
955 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
959 struct net_device *dev;
961 int ifindex, err, reserve = 0;
963 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
964 int tp_len, size_max;
969 sock = po->sk.sk_socket;
971 mutex_lock(&po->pg_vec_lock);
975 ifindex = po->ifindex;
980 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
982 if (msg->msg_namelen < (saddr->sll_halen
983 + offsetof(struct sockaddr_ll,
986 ifindex = saddr->sll_ifindex;
987 proto = saddr->sll_protocol;
988 addr = saddr->sll_addr;
991 dev = dev_get_by_index(sock_net(&po->sk), ifindex);
993 if (unlikely(dev == NULL))
996 reserve = dev->hard_header_len;
999 if (unlikely(!(dev->flags & IFF_UP)))
1002 size_max = po->tx_ring.frame_size
1003 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
1005 if (size_max > dev->mtu + reserve)
1006 size_max = dev->mtu + reserve;
1009 ph = packet_current_frame(po, &po->tx_ring,
1010 TP_STATUS_SEND_REQUEST);
1012 if (unlikely(ph == NULL)) {
1017 status = TP_STATUS_SEND_REQUEST;
1018 skb = sock_alloc_send_skb(&po->sk,
1019 LL_ALLOCATED_SPACE(dev)
1020 + sizeof(struct sockaddr_ll),
1023 if (unlikely(skb == NULL))
1026 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
1029 if (unlikely(tp_len < 0)) {
1031 __packet_set_status(po, ph,
1032 TP_STATUS_AVAILABLE);
1033 packet_increment_head(&po->tx_ring);
1037 status = TP_STATUS_WRONG_FORMAT;
1043 skb->destructor = tpacket_destruct_skb;
1044 __packet_set_status(po, ph, TP_STATUS_SENDING);
1045 atomic_inc(&po->tx_ring.pending);
1047 status = TP_STATUS_SEND_REQUEST;
1048 err = dev_queue_xmit(skb);
1049 if (unlikely(err > 0)) {
1050 err = net_xmit_errno(err);
1051 if (err && __packet_get_status(po, ph) ==
1052 TP_STATUS_AVAILABLE) {
1053 /* skb was destructed already */
1058 * skb was dropped but not destructed yet;
1059 * let's treat it like congestion or err < 0
1063 packet_increment_head(&po->tx_ring);
1065 } while (likely((ph != NULL) ||
1066 ((!(msg->msg_flags & MSG_DONTWAIT)) &&
1067 (atomic_read(&po->tx_ring.pending))))
1074 __packet_set_status(po, ph, status);
1079 mutex_unlock(&po->pg_vec_lock);
/* Allocate an skb for packet_snd(): small requests (under a page, or
 * with no linear hint) take the simple path (elided here); otherwise a
 * paged skb is built with 'linear' bytes in the linear area and the
 * rest as paged data.  'prepad' covers link-layer headroom; 'reserve'
 * is applied via skb_reserve.  Failure paths are elided in this listing.
 */
1083 static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
1084 size_t reserve, size_t len,
1085 size_t linear, int noblock,
1088 struct sk_buff *skb;
1090 /* Under a page? Don't bother with paged skb. */
1091 if (prepad + len < PAGE_SIZE || !linear)
1094 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
1099 skb_reserve(skb, reserve);
1100 skb_put(skb, linear);
/* Account the paged portion beyond the linear area. */
1101 skb->data_len = len - linear;
1102 skb->len += len - linear;
1107 static int packet_snd(struct socket *sock,
1108 struct msghdr *msg, size_t len)
1110 struct sock *sk = sock->sk;
1111 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1112 struct sk_buff *skb;
1113 struct net_device *dev;
1115 unsigned char *addr;
1116 int ifindex, err, reserve = 0;
1117 struct virtio_net_hdr vnet_hdr = { 0 };
1120 struct packet_sock *po = pkt_sk(sk);
1121 unsigned short gso_type = 0;
1124 * Get and verify the address.
1127 if (saddr == NULL) {
1128 ifindex = po->ifindex;
1133 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
1135 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
1137 ifindex = saddr->sll_ifindex;
1138 proto = saddr->sll_protocol;
1139 addr = saddr->sll_addr;
1143 dev = dev_get_by_index(sock_net(sk), ifindex);
1147 if (sock->type == SOCK_RAW)
1148 reserve = dev->hard_header_len;
1151 if (!(dev->flags & IFF_UP))
1154 if (po->has_vnet_hdr) {
1155 vnet_hdr_len = sizeof(vnet_hdr);
1158 if (len < vnet_hdr_len)
1161 len -= vnet_hdr_len;
1163 err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
1168 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
1169 (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
1171 vnet_hdr.hdr_len = vnet_hdr.csum_start +
1172 vnet_hdr.csum_offset + 2;
1175 if (vnet_hdr.hdr_len > len)
1178 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
1179 switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
1180 case VIRTIO_NET_HDR_GSO_TCPV4:
1181 gso_type = SKB_GSO_TCPV4;
1183 case VIRTIO_NET_HDR_GSO_TCPV6:
1184 gso_type = SKB_GSO_TCPV6;
1186 case VIRTIO_NET_HDR_GSO_UDP:
1187 gso_type = SKB_GSO_UDP;
1193 if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
1194 gso_type |= SKB_GSO_TCP_ECN;
1196 if (vnet_hdr.gso_size == 0)
1203 if (!gso_type && (len > dev->mtu+reserve))
1207 skb = packet_alloc_skb(sk, LL_ALLOCATED_SPACE(dev),
1208 LL_RESERVED_SPACE(dev), len, vnet_hdr.hdr_len,
1209 msg->msg_flags & MSG_DONTWAIT, &err);
1213 skb_set_network_header(skb, reserve);
1216 if (sock->type == SOCK_DGRAM &&
1217 (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
1220 /* Returns -EFAULT on error */
1221 err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
1224 err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1228 skb->protocol = proto;
1230 skb->priority = sk->sk_priority;
1231 skb->mark = sk->sk_mark;
1233 if (po->has_vnet_hdr) {
1234 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
1235 if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
1236 vnet_hdr.csum_offset)) {
1242 skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
1243 skb_shinfo(skb)->gso_type = gso_type;
1245 /* Header must be checked, and gso_segs computed. */
1246 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
1247 skb_shinfo(skb)->gso_segs = 0;
1249 len += vnet_hdr_len;
1256 err = dev_queue_xmit(skb);
1257 if (err > 0 && (err = net_xmit_errno(err)) != 0)
/* sendmsg entry point: route to the TX-ring fast path when a TX ring
 * has been mapped (tpacket_snd), otherwise fall back to the ordinary
 * copy-from-iovec path (packet_snd).
 */
1273 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
1274 struct msghdr *msg, size_t len)
1276 struct sock *sk = sock->sk;
1277 struct packet_sock *po = pkt_sk(sk);
1278 if (po->tx_ring.pg_vec)
1279 return tpacket_snd(po, msg);
1281 return packet_snd(sock, msg, len);
1285 * Close a PACKET socket. This is fairly simple. We immediately go
1286 * to 'closed' state and remove our protocol entry in the device list.
/* Release a packet socket: unlink it from the per-netns socket list
 * (adjusting the prot_inuse counter), detach the protocol hook under
 * bind_lock, flush multicast memberships, tear down any rx/tx rings by
 * calling packet_set_ring with a zeroed request and closing=1, purge
 * the receive queue and drop the refcount debug accounting.  Several
 * intermediate lines (po assignment, unlock pairs, return) are elided.
 */
1289 static int packet_release(struct socket *sock)
1291 struct sock *sk = sock->sk;
1292 struct packet_sock *po;
1294 struct tpacket_req req;
1302 spin_lock_bh(&net->packet.sklist_lock);
1303 sk_del_node_init_rcu(sk);
1304 sock_prot_inuse_add(net, sk->sk_prot, -1);
1305 spin_unlock_bh(&net->packet.sklist_lock);
1307 spin_lock(&po->bind_lock);
1310 * Remove from protocol table
1314 __dev_remove_pack(&po->prot_hook);
1317 spin_unlock(&po->bind_lock);
1319 packet_flush_mclist(sk);
/* A zeroed tpacket_req with closing=1 frees an existing ring. */
1321 memset(&req, 0, sizeof(req));
1323 if (po->rx_ring.pg_vec)
1324 packet_set_ring(sk, &req, 1, 0);
1326 if (po->tx_ring.pg_vec)
1327 packet_set_ring(sk, &req, 1, 1);
1331 * Now the socket is dead. No more input will appear.
1338 skb_queue_purge(&sk->sk_receive_queue);
1339 sk_refcnt_debug_release(sk);
1346 * Attach a packet hook.
/* Attach the socket's protocol hook to (dev, protocol).  Any existing
 * hook is detached first — note dev_remove_pack is called with
 * bind_lock temporarily dropped, then retaken.  The hook is only
 * re-registered when the device is up (or no device was given);
 * otherwise the socket is flagged with ENETDOWN and the error is
 * reported to a still-live socket.  Elided lines include the running
 * flag updates and the return.
 */
1349 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1351 struct packet_sock *po = pkt_sk(sk);
1353 * Detach an existing hook if present.
1358 spin_lock(&po->bind_lock);
/* Lock dropped around dev_remove_pack, which may sleep/synchronize. */
1363 spin_unlock(&po->bind_lock);
1364 dev_remove_pack(&po->prot_hook);
1365 spin_lock(&po->bind_lock);
1369 po->prot_hook.type = protocol;
1370 po->prot_hook.dev = dev;
1372 po->ifindex = dev ? dev->ifindex : 0;
1377 if (!dev || (dev->flags & IFF_UP)) {
1378 dev_add_pack(&po->prot_hook);
1382 sk->sk_err = ENETDOWN;
1383 if (!sock_flag(sk, SOCK_DEAD))
1384 sk->sk_error_report(sk);
1388 spin_unlock(&po->bind_lock);
1394 * Bind a packet socket to a device
/* bind() for SOCK_PACKET sockets: the address is a plain sockaddr whose
 * sa_data carries a device name.  Copy the name out safely, resolve it
 * to a device, and delegate to packet_do_bind with the socket's current
 * protocol number.  Validation failure and dev_put paths are elided.
 */
1397 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
1400 struct sock *sk = sock->sk;
1402 struct net_device *dev;
1409 if (addr_len != sizeof(struct sockaddr))
1411 strlcpy(name, uaddr->sa_data, sizeof(name));
1413 dev = dev_get_by_name(sock_net(sk), name);
1415 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
/* bind() for AF_PACKET SOCK_RAW/SOCK_DGRAM sockets using sockaddr_ll.
 * Validates length and family, resolves sll_ifindex to a device when
 * nonzero (zero means "any device"), and binds with sll_protocol,
 * falling back to the socket's current protocol number when the caller
 * passed protocol 0 (the GNU `?:` elvis operator).  Error returns are
 * elided in this listing.
 */
1421 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1423 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
1424 struct sock *sk = sock->sk;
1425 struct net_device *dev = NULL;
1433 if (addr_len < sizeof(struct sockaddr_ll))
1435 if (sll->sll_family != AF_PACKET)
1438 if (sll->sll_ifindex) {
1440 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1444 err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1452 static struct proto packet_proto = {
1454 .owner = THIS_MODULE,
1455 .obj_size = sizeof(struct packet_sock),
1459 * Create a packet of type SOCK_PACKET.
1462 static int packet_create(struct net *net, struct socket *sock, int protocol,
1466 struct packet_sock *po;
1467 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1470 if (!capable(CAP_NET_RAW))
1472 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
1473 sock->type != SOCK_PACKET)
1474 return -ESOCKTNOSUPPORT;
1476 sock->state = SS_UNCONNECTED;
1479 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1483 sock->ops = &packet_ops;
1484 if (sock->type == SOCK_PACKET)
1485 sock->ops = &packet_ops_spkt;
1487 sock_init_data(sock, sk);
1490 sk->sk_family = PF_PACKET;
1493 sk->sk_destruct = packet_sock_destruct;
1494 sk_refcnt_debug_inc(sk);
1497 * Attach a protocol block
1500 spin_lock_init(&po->bind_lock);
1501 mutex_init(&po->pg_vec_lock);
1502 po->prot_hook.func = packet_rcv;
1504 if (sock->type == SOCK_PACKET)
1505 po->prot_hook.func = packet_rcv_spkt;
1507 po->prot_hook.af_packet_priv = sk;
1510 po->prot_hook.type = proto;
1511 dev_add_pack(&po->prot_hook);
1516 spin_lock_bh(&net->packet.sklist_lock);
1517 sk_add_node_rcu(sk, &net->packet.sklist);
1518 sock_prot_inuse_add(net, &packet_proto, 1);
1519 spin_unlock_bh(&net->packet.sklist_lock);
/* MSG_ERRQUEUE receive path: dequeue one skb from the socket error
 * queue, copy (possibly truncating) its payload to the user iovec,
 * attach the extended-error cmsg (used here for TX timestamps), then
 * regenerate sk_err from the next queued error, if any, so a pending
 * error remains visible to the caller.  Elided lines include the empty
 * queue check, copied-length computation, free and return.
 */
1526 static int packet_recv_error(struct sock *sk, struct msghdr *msg, int len)
1528 struct sock_exterr_skb *serr;
1529 struct sk_buff *skb, *skb2;
1533 skb = skb_dequeue(&sk->sk_error_queue);
1539 msg->msg_flags |= MSG_TRUNC;
1542 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1546 sock_recv_timestamp(msg, sk, skb);
1548 serr = SKB_EXT_ERR(skb);
1549 put_cmsg(msg, SOL_PACKET, PACKET_TX_TIMESTAMP,
1550 sizeof(serr->ee), &serr->ee);
1552 msg->msg_flags |= MSG_ERRQUEUE;
1555 /* Reset and regenerate socket error */
1556 spin_lock_bh(&sk->sk_error_queue.lock);
1558 if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
1559 sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
1560 spin_unlock_bh(&sk->sk_error_queue.lock);
1561 sk->sk_error_report(sk);
1563 spin_unlock_bh(&sk->sk_error_queue.lock);
/*
 * NOTE(review): this extract is missing intermediate source lines (the
 * embedded line numbers jump) — do not assume the visible statements are
 * contiguous; error paths, braces and gotos between them are elided.
 */
1572 * Pull a packet from our receive queue and hand it to the user.
1573 * If necessary we block.
/*
 * packet_recvmsg - recvmsg() handler for AF_PACKET sockets.
 * Dequeues one skb via the generic datagram receiver, optionally prepends
 * a virtio_net_hdr (PACKET_VNET_HDR sockets), copies data + source address
 * to user space, and emits PACKET_AUXDATA / timestamp cmsgs when enabled.
 * Returns bytes consumed (including vnet header) or a negative errno.
 */
1576 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1577 struct msghdr *msg, size_t len, int flags)
1579 struct sock *sk = sock->sk;
1580 struct sk_buff *skb;
1582 struct sockaddr_ll *sll;
1583 int vnet_hdr_len = 0;
/* Reject any flag this handler does not implement. */
1586 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1590 /* What error should we return now? EUNATTACH? */
1591 if (pkt_sk(sk)->ifindex < 0)
/* MSG_ERRQUEUE: drain the error queue (TX timestamps) instead of data. */
1595 if (flags & MSG_ERRQUEUE) {
1596 err = packet_recv_error(sk, msg, len);
1601 * Call the generic datagram receiver. This handles all sorts
1602 * of horrible races and re-entrancy so we can forget about it
1603 * in the protocol layers.
1605 * Now it will return ENETDOWN, if device have just gone down,
1606 * but then it will block.
1609 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1612 * An error occurred so return it. Because skb_recv_datagram()
1613 * handles the blocking we don't see and worry about blocking
/* PACKET_VNET_HDR: synthesize a virtio_net_hdr describing GSO/csum state. */
1620 if (pkt_sk(sk)->has_vnet_hdr) {
1621 struct virtio_net_hdr vnet_hdr = { 0 };
1624 vnet_hdr_len = sizeof(vnet_hdr);
1625 if (len < vnet_hdr_len)
1628 len -= vnet_hdr_len;
1630 if (skb_is_gso(skb)) {
1631 struct skb_shared_info *sinfo = skb_shinfo(skb);
1633 /* This is a hint as to how much should be linear. */
1634 vnet_hdr.hdr_len = skb_headlen(skb);
1635 vnet_hdr.gso_size = sinfo->gso_size;
1636 if (sinfo->gso_type & SKB_GSO_TCPV4)
1637 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
1638 else if (sinfo->gso_type & SKB_GSO_TCPV6)
1639 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
1640 else if (sinfo->gso_type & SKB_GSO_UDP)
1641 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
1642 else if (sinfo->gso_type & SKB_GSO_FCOE)
1646 if (sinfo->gso_type & SKB_GSO_TCP_ECN)
1647 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
1649 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
1651 if (skb->ip_summed == CHECKSUM_PARTIAL) {
1652 vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
1653 vnet_hdr.csum_start = skb->csum_start -
1655 vnet_hdr.csum_offset = skb->csum_offset;
1656 } /* else everything is zero */
1658 err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
1665 * If the address length field is there to be filled in, we fill
/* Report the sender address length: fixed for SOCK_PACKET, variable for ll. */
1669 sll = &PACKET_SKB_CB(skb)->sa.ll;
1670 if (sock->type == SOCK_PACKET)
1671 msg->msg_namelen = sizeof(struct sockaddr_pkt);
1673 msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1676 * You lose any data beyond the buffer you gave. If it worries a
1677 * user program they can ask the device for its MTU anyway.
1683 msg->msg_flags |= MSG_TRUNC;
1686 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1690 sock_recv_ts_and_drops(msg, sk, skb);
1693 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
/* PACKET_AUXDATA: per-packet metadata (status, lengths, offsets, VLAN tag). */
1696 if (pkt_sk(sk)->auxdata) {
1697 struct tpacket_auxdata aux;
1699 aux.tp_status = TP_STATUS_USER;
1700 if (skb->ip_summed == CHECKSUM_PARTIAL)
1701 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1702 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1703 aux.tp_snaplen = skb->len;
1705 aux.tp_net = skb_network_offset(skb);
1706 aux.tp_vlan_tci = vlan_tx_tag_get(skb);
1708 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1712 * Free or return the buffer as appropriate. Again this
1713 * hides all the races and re-entrancy issues from us.
/* With MSG_TRUNC, report the full packet length rather than bytes copied. */
1715 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1718 skb_free_datagram(sk, skb);
1723 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1724 int *uaddr_len, int peer)
1726 struct net_device *dev;
1727 struct sock *sk = sock->sk;
1732 uaddr->sa_family = AF_PACKET;
1734 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
1736 strncpy(uaddr->sa_data, dev->name, 14);
1738 memset(uaddr->sa_data, 0, 14);
1740 *uaddr_len = sizeof(*uaddr);
/*
 * packet_getname - getname() handler for SOCK_RAW/SOCK_DGRAM packet sockets.
 * Fills a sockaddr_ll with the bound ifindex/protocol and, if the device
 * still exists, its hardware type, address length and address.
 * NOTE(review): extract is gappy — rcu locking, the halen/addr reset on the
 * no-device path and the return are not visible here.
 */
1745 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1746 int *uaddr_len, int peer)
1748 struct net_device *dev;
1749 struct sock *sk = sock->sk;
1750 struct packet_sock *po = pkt_sk(sk);
1751 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1756 sll->sll_family = AF_PACKET;
1757 sll->sll_ifindex = po->ifindex;
1758 sll->sll_protocol = po->num;
1759 sll->sll_pkttype = 0;
1761 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1763 sll->sll_hatype = dev->type;
1764 sll->sll_halen = dev->addr_len;
1765 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1767 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
/* Returned length covers only the bytes of sll_addr actually used. */
1771 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
/*
 * packet_dev_mc - apply one membership entry to a device.
 * @what: +1 to add the membership, -1 to remove it (passed through to
 * dev_set_promiscuity()/dev_set_allmulti() as a refcount delta).
 * For MULTICAST/UNICAST entries the stored address length must match the
 * device's addr_len. Returns 0 or a negative errno from the dev_* helpers.
 */
1776 static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
1780 case PACKET_MR_MULTICAST:
1781 if (i->alen != dev->addr_len)
1784 return dev_mc_add(dev, i->addr);
1786 return dev_mc_del(dev, i->addr);
1788 case PACKET_MR_PROMISC:
1789 return dev_set_promiscuity(dev, what);
1791 case PACKET_MR_ALLMULTI:
1792 return dev_set_allmulti(dev, what);
1794 case PACKET_MR_UNICAST:
1795 if (i->alen != dev->addr_len)
1798 return dev_uc_add(dev, i->addr);
1800 return dev_uc_del(dev, i->addr);
/*
 * packet_dev_mclist - walk a socket's membership list and apply @what
 * (add/remove delta) to every entry bound to @dev. Used from the netdev
 * notifier when a device goes away.
 */
1808 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1810 for ( ; i; i = i->next) {
1811 if (i->ifindex == dev->ifindex)
1812 packet_dev_mc(dev, i, what);
/*
 * packet_mc_add - PACKET_ADD_MEMBERSHIP: record a membership on the socket
 * and program it into the device. If an identical entry already exists the
 * freshly allocated element is freed (its refcount is bumped instead, in
 * lines elided from this extract); on device-programming failure the new
 * element is unlinked again.
 * NOTE(review): locking, validation error returns and the count/free
 * statements fall in the gaps of this extract.
 */
1816 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1818 struct packet_sock *po = pkt_sk(sk);
1819 struct packet_mclist *ml, *i;
1820 struct net_device *dev;
1826 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1831 if (mreq->mr_alen > dev->addr_len)
1835 i = kmalloc(sizeof(*i), GFP_KERNEL);
/* Look for an existing identical membership before linking the new one. */
1840 for (ml = po->mclist; ml; ml = ml->next) {
1841 if (ml->ifindex == mreq->mr_ifindex &&
1842 ml->type == mreq->mr_type &&
1843 ml->alen == mreq->mr_alen &&
1844 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1846 /* Free the new element ... */
1852 i->type = mreq->mr_type;
1853 i->ifindex = mreq->mr_ifindex;
1854 i->alen = mreq->mr_alen;
1855 memcpy(i->addr, mreq->mr_address, i->alen);
1857 i->next = po->mclist;
1859 err = packet_dev_mc(dev, i, 1);
/* Device refused the membership: unlink the element we just added. */
1861 po->mclist = i->next;
/*
 * packet_mc_drop - PACKET_DROP_MEMBERSHIP: find the matching membership,
 * drop one reference, and when the count hits zero unlink it and deprogram
 * it from the device (if the device still exists). Returns -EADDRNOTAVAIL
 * when no matching entry is found.
 */
1870 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1872 struct packet_mclist *ml, **mlp;
1876 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1877 if (ml->ifindex == mreq->mr_ifindex &&
1878 ml->type == mreq->mr_type &&
1879 ml->alen == mreq->mr_alen &&
1880 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1881 if (--ml->count == 0) {
1882 struct net_device *dev;
1884 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1886 packet_dev_mc(dev, ml, -1);
1894 return -EADDRNOTAVAIL;
/*
 * packet_flush_mclist - release every membership held by the socket
 * (called on socket teardown): unlink each entry, deprogram it from its
 * device if that device still exists, and free it (free elided here).
 */
1897 static void packet_flush_mclist(struct sock *sk)
1899 struct packet_sock *po = pkt_sk(sk);
1900 struct packet_mclist *ml;
1906 while ((ml = po->mclist) != NULL) {
1907 struct net_device *dev;
1909 po->mclist = ml->next;
1910 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1912 packet_dev_mc(dev, ml, -1);
/*
 * packet_setsockopt - setsockopt() handler for SOL_PACKET options.
 * Handles memberships, RX/TX ring setup, protocol-version and ring-layout
 * tuning, and the boolean per-socket flags (auxdata, origdev, vnet_hdr).
 * NOTE(review): extract is gappy — many error returns, `break`s and the
 * closing of each case are elided between the visible lines.
 */
1919 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1921 struct sock *sk = sock->sk;
1922 struct packet_sock *po = pkt_sk(sk);
1925 if (level != SOL_PACKET)
1926 return -ENOPROTOOPT;
1929 case PACKET_ADD_MEMBERSHIP:
1930 case PACKET_DROP_MEMBERSHIP:
1932 struct packet_mreq_max mreq;
/* Accept both the short packet_mreq and the extended packet_mreq_max. */
1934 memset(&mreq, 0, sizeof(mreq));
1935 if (len < sizeof(struct packet_mreq))
1937 if (len > sizeof(mreq))
1939 if (copy_from_user(&mreq, optval, len))
1941 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1943 if (optname == PACKET_ADD_MEMBERSHIP)
1944 ret = packet_mc_add(sk, &mreq);
1946 ret = packet_mc_drop(sk, &mreq);
1950 case PACKET_RX_RING:
1951 case PACKET_TX_RING:
1953 struct tpacket_req req;
1955 if (optlen < sizeof(req))
/* Rings and PACKET_VNET_HDR are mutually exclusive. */
1957 if (pkt_sk(sk)->has_vnet_hdr)
1959 if (copy_from_user(&req, optval, sizeof(req)))
1961 return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
1963 case PACKET_COPY_THRESH:
1967 if (optlen != sizeof(val))
1969 if (copy_from_user(&val, optval, sizeof(val)))
1972 pkt_sk(sk)->copy_thresh = val;
/* The following layout options are rejected while a ring is mapped. */
1975 case PACKET_VERSION:
1979 if (optlen != sizeof(val))
1981 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1983 if (copy_from_user(&val, optval, sizeof(val)))
1988 po->tp_version = val;
1994 case PACKET_RESERVE:
1998 if (optlen != sizeof(val))
2000 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
2002 if (copy_from_user(&val, optval, sizeof(val)))
2004 po->tp_reserve = val;
2011 if (optlen != sizeof(val))
2013 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
2015 if (copy_from_user(&val, optval, sizeof(val)))
2017 po->tp_loss = !!val;
2020 case PACKET_AUXDATA:
2024 if (optlen < sizeof(val))
2026 if (copy_from_user(&val, optval, sizeof(val)))
2029 po->auxdata = !!val;
2032 case PACKET_ORIGDEV:
2036 if (optlen < sizeof(val))
2038 if (copy_from_user(&val, optval, sizeof(val)))
2041 po->origdev = !!val;
2044 case PACKET_VNET_HDR:
/* vnet_hdr only makes sense on SOCK_RAW and never together with rings. */
2048 if (sock->type != SOCK_RAW)
2050 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
2052 if (optlen < sizeof(val))
2054 if (copy_from_user(&val, optval, sizeof(val)))
2057 po->has_vnet_hdr = !!val;
2060 case PACKET_TIMESTAMP:
2064 if (optlen != sizeof(val))
2066 if (copy_from_user(&val, optval, sizeof(val)))
2069 po->tp_tstamp = val;
2073 return -ENOPROTOOPT;
/*
 * packet_getsockopt - getsockopt() handler for SOL_PACKET options.
 * Copies the requested value (clamped to the caller's buffer length) back
 * to user space via the common put_user/copy_to_user tail.
 */
2077 static int packet_getsockopt(struct socket *sock, int level, int optname,
2078 char __user *optval, int __user *optlen)
2082 struct sock *sk = sock->sk;
2083 struct packet_sock *po = pkt_sk(sk);
2085 struct tpacket_stats st;
2087 if (level != SOL_PACKET)
2088 return -ENOPROTOOPT;
2090 if (get_user(len, optlen))
2097 case PACKET_STATISTICS:
2098 if (len > sizeof(struct tpacket_stats))
2099 len = sizeof(struct tpacket_stats);
/* Snapshot and reset counters atomically w.r.t. the receive path. */
2100 spin_lock_bh(&sk->sk_receive_queue.lock);
/* NOTE(review): the copy of po->stats into st is elided in this extract;
 * the memset clears po->stats using sizeof(st) — sizes presumably match,
 * verify against the full source. */
2102 memset(&po->stats, 0, sizeof(st));
2103 spin_unlock_bh(&sk->sk_receive_queue.lock);
2104 st.tp_packets += st.tp_drops;
2108 case PACKET_AUXDATA:
2109 if (len > sizeof(int))
2115 case PACKET_ORIGDEV:
2116 if (len > sizeof(int))
2122 case PACKET_VNET_HDR:
2123 if (len > sizeof(int))
2125 val = po->has_vnet_hdr;
2129 case PACKET_VERSION:
2130 if (len > sizeof(int))
2132 val = po->tp_version;
/* PACKET_HDRLEN: caller passes a version in, gets that header size back. */
2136 if (len > sizeof(int))
2138 if (copy_from_user(&val, optval, len))
2142 val = sizeof(struct tpacket_hdr);
2145 val = sizeof(struct tpacket2_hdr);
2152 case PACKET_RESERVE:
2153 if (len > sizeof(unsigned int))
2154 len = sizeof(unsigned int);
2155 val = po->tp_reserve;
2159 if (len > sizeof(unsigned int))
2160 len = sizeof(unsigned int);
2164 case PACKET_TIMESTAMP:
2165 if (len > sizeof(int))
2167 val = po->tp_tstamp;
2171 return -ENOPROTOOPT;
/* Common tail: report the (possibly clamped) length, then the data. */
2174 if (put_user(len, optlen))
2176 if (copy_to_user(optval, data, len))
/*
 * packet_notifier - netdevice notifier for AF_PACKET.
 * On NETDEV_UNREGISTER/DOWN of a bound device: drop memberships, detach
 * the protocol hook and signal ENETDOWN to the socket. On device up,
 * re-attach the hook for sockets that were bound but not running.
 * NOTE(review): case labels for the down/up events fall in the elided
 * lines between the visible statements.
 */
2182 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
2185 struct hlist_node *node;
2186 struct net_device *dev = data;
2187 struct net *net = dev_net(dev);
2190 sk_for_each_rcu(sk, node, &net->packet.sklist) {
2191 struct packet_sock *po = pkt_sk(sk);
2194 case NETDEV_UNREGISTER:
2196 packet_dev_mclist(dev, po->mclist, -1);
2200 if (dev->ifindex == po->ifindex) {
2201 spin_lock(&po->bind_lock);
2203 __dev_remove_pack(&po->prot_hook);
2206 sk->sk_err = ENETDOWN;
2207 if (!sock_flag(sk, SOCK_DEAD))
2208 sk->sk_error_report(sk);
/* Only a full unregister severs the device pointer; DOWN keeps it. */
2210 if (msg == NETDEV_UNREGISTER) {
2212 po->prot_hook.dev = NULL;
2214 spin_unlock(&po->bind_lock);
2218 if (dev->ifindex == po->ifindex) {
2219 spin_lock(&po->bind_lock);
2220 if (po->num && !po->running) {
2221 dev_add_pack(&po->prot_hook);
2225 spin_unlock(&po->bind_lock);
/*
 * packet_ioctl - ioctl() handler: queue-size queries (SIOCOUTQ/SIOCINQ),
 * socket timestamps, and pass-through of interface ioctls to the inet
 * dgram implementation. Unknown commands return -ENOIOCTLCMD.
 */
2235 static int packet_ioctl(struct socket *sock, unsigned int cmd,
2238 struct sock *sk = sock->sk;
/* SIOCOUTQ: bytes queued for transmit. */
2243 int amount = sk_wmem_alloc_get(sk);
2245 return put_user(amount, (int __user *)arg);
/* SIOCINQ: length of the next packet waiting in the receive queue. */
2249 struct sk_buff *skb;
2252 spin_lock_bh(&sk->sk_receive_queue.lock);
2253 skb = skb_peek(&sk->sk_receive_queue);
2256 spin_unlock_bh(&sk->sk_receive_queue.lock);
2257 return put_user(amount, (int __user *)arg);
2260 return sock_get_timestamp(sk, (struct timeval __user *)arg);
2262 return sock_get_timestampns(sk, (struct timespec __user *)arg);
2272 case SIOCGIFBRDADDR:
2273 case SIOCSIFBRDADDR:
2274 case SIOCGIFNETMASK:
2275 case SIOCSIFNETMASK:
2276 case SIOCGIFDSTADDR:
2277 case SIOCSIFDSTADDR:
2279 return inet_dgram_ops.ioctl(sock, cmd, arg);
2283 return -ENOIOCTLCMD;
/*
 * packet_poll - poll() handler: start from datagram_poll(), then add
 * readiness bits for the mmapped rings — readable when the RX ring has a
 * user-owned frame behind the head, writable when the TX ring's current
 * frame is available.
 */
2288 static unsigned int packet_poll(struct file *file, struct socket *sock,
2291 struct sock *sk = sock->sk;
2292 struct packet_sock *po = pkt_sk(sk);
2293 unsigned int mask = datagram_poll(file, sock, wait);
2295 spin_lock_bh(&sk->sk_receive_queue.lock);
2296 if (po->rx_ring.pg_vec) {
2297 if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
2298 mask |= POLLIN | POLLRDNORM;
2300 spin_unlock_bh(&sk->sk_receive_queue.lock);
2301 spin_lock_bh(&sk->sk_write_queue.lock);
2302 if (po->tx_ring.pg_vec) {
2303 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
2304 mask |= POLLOUT | POLLWRNORM;
2306 spin_unlock_bh(&sk->sk_write_queue.lock);
2311 /* Dirty? Well, I still did not learn better way to account
 * for the memory mapped into user space: just count open/close of the
 * ring VMAs on the socket. */
/*
 * packet_mm_open - vm_ops->open for ring mappings: bump the socket's
 * mapped-VMA counter so packet_set_ring() can refuse to resize a ring
 * that is still mapped.
 */
2315 static void packet_mm_open(struct vm_area_struct *vma)
2317 struct file *file = vma->vm_file;
2318 struct socket *sock = file->private_data;
2319 struct sock *sk = sock->sk;
2322 atomic_inc(&pkt_sk(sk)->mapped);
/*
 * packet_mm_close - vm_ops->close counterpart of packet_mm_open():
 * drop the socket's mapped-VMA counter.
 */
2325 static void packet_mm_close(struct vm_area_struct *vma)
2327 struct file *file = vma->vm_file;
2328 struct socket *sock = file->private_data;
2329 struct sock *sk = sock->sk;
2332 atomic_dec(&pkt_sk(sk)->mapped);
/* VMA callbacks for mmapped packet rings (mapping refcounting only). */
2335 static const struct vm_operations_struct packet_mmap_ops = {
2336 .open = packet_mm_open,
2337 .close = packet_mm_close,
/*
 * free_pg_vec - release a ring's block vector: each buffer was obtained
 * either from vmalloc or __get_free_pages(order), so free with the
 * matching routine and clear the slot.
 */
2340 static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
2345 for (i = 0; i < len; i++) {
2346 if (likely(pg_vec[i].buffer)) {
2347 if (is_vmalloc_addr(pg_vec[i].buffer))
2348 vfree(pg_vec[i].buffer);
2350 free_pages((unsigned long)pg_vec[i].buffer,
2352 pg_vec[i].buffer = NULL;
/*
 * alloc_one_pg_vec_page - allocate one zeroed ring block of 2^order pages.
 * Strategy: cheap non-retrying page allocation first, then vzalloc, then
 * a final page allocation that is allowed to retry/reclaim. Returns NULL
 * only when all three fail.
 */
2358 static inline char *alloc_one_pg_vec_page(unsigned long order)
2360 char *buffer = NULL;
2361 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
2362 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
2364 buffer = (char *) __get_free_pages(gfp_flags, order);
2370 * __get_free_pages failed, fall back to vmalloc
2372 buffer = vzalloc((1 << order) * PAGE_SIZE);
2378 * vmalloc failed, lets dig into swap here
2380 gfp_flags &= ~__GFP_NORETRY;
2381 buffer = (char *)__get_free_pages(gfp_flags, order);
2386 * complete and utter failure
/*
 * alloc_pg_vec - allocate the per-block buffer array for a ring:
 * tp_block_nr blocks of 2^order pages each. On any block failure the
 * already-allocated blocks and the vector are freed and NULL is returned.
 */
2391 static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
2393 unsigned int block_nr = req->tp_block_nr;
2397 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
2398 if (unlikely(!pg_vec))
2401 for (i = 0; i < block_nr; i++) {
2402 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
2403 if (unlikely(!pg_vec[i].buffer))
2404 goto out_free_pgvec;
2411 free_pg_vec(pg_vec, order, block_nr);
/*
 * packet_set_ring - create, replace or (with tp_block_nr == 0) tear down
 * the RX or TX mmap ring. Sequence: validate geometry and allocate the
 * new page vector; detach the protocol hook; swap the vectors in under
 * the queue lock (refusing while the ring is mapped, unless closing);
 * purge stale queued skbs; re-attach the hook; free the old vector.
 * NOTE(review): extract is gappy — several error returns, the closing
 * handling and lock releases are elided between the visible lines.
 */
2416 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
2417 int closing, int tx_ring)
2419 struct pgv *pg_vec = NULL;
2420 struct packet_sock *po = pkt_sk(sk);
2421 int was_running, order = 0;
2422 struct packet_ring_buffer *rb;
2423 struct sk_buff_head *rb_queue;
2427 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
2428 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
/* A mapped or transmit-busy ring must not be reconfigured. */
2432 if (atomic_read(&po->mapped))
2434 if (atomic_read(&rb->pending))
2438 if (req->tp_block_nr) {
2439 /* Sanity tests and some calculations */
2441 if (unlikely(rb->pg_vec))
2444 switch (po->tp_version) {
2446 po->tp_hdrlen = TPACKET_HDRLEN;
2449 po->tp_hdrlen = TPACKET2_HDRLEN;
/* Geometry checks: page-aligned blocks, aligned frames that fit,
 * and tp_frame_nr consistent with blocks * frames-per-block. */
2454 if (unlikely((int)req->tp_block_size <= 0))
2456 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
2458 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
2461 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
2464 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
2465 if (unlikely(rb->frames_per_block <= 0))
2467 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
2472 order = get_order(req->tp_block_size);
2473 pg_vec = alloc_pg_vec(req, order);
2474 if (unlikely(!pg_vec))
/* tp_block_nr == 0 (teardown) must come with tp_frame_nr == 0 too. */
2480 if (unlikely(req->tp_frame_nr))
2486 /* Detach socket from network */
2487 spin_lock(&po->bind_lock);
2488 was_running = po->running;
2491 __dev_remove_pack(&po->prot_hook);
2496 spin_unlock(&po->bind_lock);
2501 mutex_lock(&po->pg_vec_lock);
2502 if (closing || atomic_read(&po->mapped) == 0) {
/* XC: swap a and b, yielding the old value of a. */
2504 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
2505 spin_lock_bh(&rb_queue->lock);
2506 pg_vec = XC(rb->pg_vec, pg_vec);
2507 rb->frame_max = (req->tp_frame_nr - 1);
2509 rb->frame_size = req->tp_frame_size;
2510 spin_unlock_bh(&rb_queue->lock);
2512 order = XC(rb->pg_vec_order, order);
2513 req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);
2515 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
/* Receive path switches to the ring handler whenever an RX ring exists. */
2516 po->prot_hook.func = (po->rx_ring.pg_vec) ?
2517 tpacket_rcv : packet_rcv;
2518 skb_queue_purge(rb_queue);
2520 if (atomic_read(&po->mapped))
2521 pr_err("packet_mmap: vma is busy: %d\n",
2522 atomic_read(&po->mapped));
2524 mutex_unlock(&po->pg_vec_lock);
2526 spin_lock(&po->bind_lock);
2527 if (was_running && !po->running) {
2531 dev_add_pack(&po->prot_hook);
2533 spin_unlock(&po->bind_lock);
/* pg_vec now holds the superseded vector (or the unused new one). */
2538 free_pg_vec(pg_vec, order, req->tp_block_nr);
/*
 * packet_mmap - map the RX ring followed by the TX ring into one VMA.
 * The VMA length must equal the combined size of all configured ring
 * blocks; each block's pages are inserted in order with vm_insert_page().
 * On success the mapped counter is bumped and packet_mmap_ops installed.
 */
2543 static int packet_mmap(struct file *file, struct socket *sock,
2544 struct vm_area_struct *vma)
2546 struct sock *sk = sock->sk;
2547 struct packet_sock *po = pkt_sk(sk);
2548 unsigned long size, expected_size;
2549 struct packet_ring_buffer *rb;
2550 unsigned long start;
2557 mutex_lock(&po->pg_vec_lock);
/* Sum the sizes of both rings (rx_ring and tx_ring are adjacent fields). */
2560 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2562 expected_size += rb->pg_vec_len
2568 if (expected_size == 0)
2571 size = vma->vm_end - vma->vm_start;
2572 if (size != expected_size)
2575 start = vma->vm_start;
2576 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2577 if (rb->pg_vec == NULL)
2580 for (i = 0; i < rb->pg_vec_len; i++) {
2582 void *kaddr = rb->pg_vec[i].buffer;
2585 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
2586 page = pgv_to_page(kaddr);
2587 err = vm_insert_page(vma, start, page);
2596 atomic_inc(&po->mapped);
2597 vma->vm_ops = &packet_mmap_ops;
2601 mutex_unlock(&po->pg_vec_lock);
/* proto_ops for legacy SOCK_PACKET sockets: no mmap, no sockopts,
 * spkt-specific bind/getname/sendmsg; shared recvmsg/ioctl/release. */
2605 static const struct proto_ops packet_ops_spkt = {
2606 .family = PF_PACKET,
2607 .owner = THIS_MODULE,
2608 .release = packet_release,
2609 .bind = packet_bind_spkt,
2610 .connect = sock_no_connect,
2611 .socketpair = sock_no_socketpair,
2612 .accept = sock_no_accept,
2613 .getname = packet_getname_spkt,
2614 .poll = datagram_poll,
2615 .ioctl = packet_ioctl,
2616 .listen = sock_no_listen,
2617 .shutdown = sock_no_shutdown,
2618 .setsockopt = sock_no_setsockopt,
2619 .getsockopt = sock_no_getsockopt,
2620 .sendmsg = packet_sendmsg_spkt,
2621 .recvmsg = packet_recvmsg,
2622 .mmap = sock_no_mmap,
2623 .sendpage = sock_no_sendpage,
/* proto_ops for SOCK_RAW/SOCK_DGRAM packet sockets: full feature set
 * including sockopts, ring-aware poll and mmap. */
2626 static const struct proto_ops packet_ops = {
2627 .family = PF_PACKET,
2628 .owner = THIS_MODULE,
2629 .release = packet_release,
2630 .bind = packet_bind,
2631 .connect = sock_no_connect,
2632 .socketpair = sock_no_socketpair,
2633 .accept = sock_no_accept,
2634 .getname = packet_getname,
2635 .poll = packet_poll,
2636 .ioctl = packet_ioctl,
2637 .listen = sock_no_listen,
2638 .shutdown = sock_no_shutdown,
2639 .setsockopt = packet_setsockopt,
2640 .getsockopt = packet_getsockopt,
2641 .sendmsg = packet_sendmsg,
2642 .recvmsg = packet_recvmsg,
2643 .mmap = packet_mmap,
2644 .sendpage = sock_no_sendpage,
/* Address-family registration entry: routes socket(PF_PACKET, ...) to
 * packet_create(). */
2647 static const struct net_proto_family packet_family_ops = {
2648 .family = PF_PACKET,
2649 .create = packet_create,
2650 .owner = THIS_MODULE,
/* Netdevice event subscription; see packet_notifier() above. */
2653 static struct notifier_block packet_netdev_notifier = {
2654 .notifier_call = packet_notifier,
2657 #ifdef CONFIG_PROC_FS
/* seq_file start: position into the per-net packet socket list under RCU
 * (matching stop presumably drops the RCU read lock — elided here). */
2659 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
2662 struct net *net = seq_file_net(seq);
2665 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
/* seq_file next: advance to the following socket in the RCU hlist. */
2668 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2670 struct net *net = seq_file_net(seq);
2671 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
/* seq_file stop (body elided in this extract). */
2674 static void packet_seq_stop(struct seq_file *seq, void *v)
/* seq_file show: header row for the start token, otherwise one line of
 * per-socket state for /proc/net/packet. */
2680 static int packet_seq_show(struct seq_file *seq, void *v)
2682 if (v == SEQ_START_TOKEN)
2683 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
2685 struct sock *s = sk_entry(v);
2686 const struct packet_sock *po = pkt_sk(s);
2689 "%p %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
2691 atomic_read(&s->sk_refcnt),
2696 atomic_read(&s->sk_rmem_alloc),
/* Iterator callbacks for the /proc/net/packet sequence file. */
2704 static const struct seq_operations packet_seq_ops = {
2705 .start = packet_seq_start,
2706 .next = packet_seq_next,
2707 .stop = packet_seq_stop,
2708 .show = packet_seq_show,
/* open() for /proc/net/packet: per-netns seq_file with private state. */
2711 static int packet_seq_open(struct inode *inode, struct file *file)
2713 return seq_open_net(inode, file, &packet_seq_ops,
2714 sizeof(struct seq_net_private));
/* file_operations for /proc/net/packet. */
2717 static const struct file_operations packet_seq_fops = {
2718 .owner = THIS_MODULE,
2719 .open = packet_seq_open,
2721 .llseek = seq_lseek,
2722 .release = seq_release_net,
/* Per-netns init: set up the socket list and create /proc/net/packet. */
2727 static int __net_init packet_net_init(struct net *net)
2729 spin_lock_init(&net->packet.sklist_lock);
2730 INIT_HLIST_HEAD(&net->packet.sklist);
2732 if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
/* Per-netns teardown: remove the /proc/net/packet entry. */
2738 static void __net_exit packet_net_exit(struct net *net)
2740 proc_net_remove(net, "packet");
/* Network-namespace lifecycle hooks for AF_PACKET. */
2743 static struct pernet_operations packet_net_ops = {
2744 .init = packet_net_init,
2745 .exit = packet_net_exit,
/* Module exit: unregister in the reverse order of packet_init(). */
2749 static void __exit packet_exit(void)
2751 unregister_netdevice_notifier(&packet_netdev_notifier);
2752 unregister_pernet_subsys(&packet_net_ops);
2753 sock_unregister(PF_PACKET);
2754 proto_unregister(&packet_proto);
/* Module init: register the proto, address family, pernet ops and netdev
 * notifier. NOTE(review): the early-return on proto_register failure is
 * elided in this extract. */
2757 static int __init packet_init(void)
2759 int rc = proto_register(&packet_proto, 0);
2764 sock_register(&packet_family_ops);
2765 register_pernet_subsys(&packet_net_ops);
2766 register_netdevice_notifier(&packet_netdev_notifier);
/* Module entry points, license and PF_PACKET autoload alias. */
2771 module_init(packet_init);
2772 module_exit(packet_exit);
2773 MODULE_LICENSE("GPL");
2774 MODULE_ALIAS_NETPROTO(PF_PACKET);