udp: preserve head state for IP_CMSG_PASSSEC
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 1d6219bf2d6b48abaa73ad7f178049d95124cc90..d243772f6efc91eb2e975fec8f6ff3414b1c52cf 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -577,7 +577,7 @@ struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
 
        sk = __udp4_lib_lookup(net, saddr, sport, daddr, dport,
                               dif, &udp_table, NULL);
-       if (sk && !atomic_inc_not_zero(&sk->sk_refcnt))
+       if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
                sk = NULL;
        return sk;
 }
@@ -1163,23 +1163,60 @@ out:
        return ret;
 }
 
+#define UDP_SKB_IS_STATELESS 0x80000000
+
+static void udp_set_dev_scratch(struct sk_buff *skb)
+{
+       struct udp_dev_scratch *scratch = udp_skb_scratch(skb);
+
+       BUILD_BUG_ON(sizeof(struct udp_dev_scratch) > sizeof(long));
+       scratch->_tsize_state = skb->truesize;
+#if BITS_PER_LONG == 64
+       scratch->len = skb->len;
+       scratch->csum_unnecessary = !!skb_csum_unnecessary(skb);
+       scratch->is_linear = !skb_is_nonlinear(skb);
+#endif
+       if (likely(!skb->_skb_refdst))
+               scratch->_tsize_state |= UDP_SKB_IS_STATELESS;
+}
+
+static int udp_skb_truesize(struct sk_buff *skb)
+{
+       return udp_skb_scratch(skb)->_tsize_state & ~UDP_SKB_IS_STATELESS;
+}
+
+static bool udp_skb_has_head_state(struct sk_buff *skb)
+{
+       return !(udp_skb_scratch(skb)->_tsize_state & UDP_SKB_IS_STATELESS);
+}
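
These three helpers pack everything recvmsg() will need into skb->dev_scratch, which is a single unsigned long (hence the BUILD_BUG_ON). The layout they assume looks roughly like the sketch below; in this series the real definition sits in include/net/udp.h, and the extra fields exist only on 64-bit, where dev_scratch has room beyond truesize:

struct udp_dev_scratch {
        u32 _tsize_state;       /* skb->truesize, plus UDP_SKB_IS_STATELESS */
#if BITS_PER_LONG == 64
        /* skb->len fits in 16 bits here because the UDP header has already
         * been validated and pulled, keeping the length below 64K
         */
        u16 len;
        bool is_linear;
        bool csum_unnecessary;
#endif
};

static inline struct udp_dev_scratch *udp_skb_scratch(struct sk_buff *skb)
{
        return (struct udp_dev_scratch *)&skb->dev_scratch;
}

Reading the cached copies via udp_skb_truesize(), udp_skb_len() and udp_skb_csum_unnecessary() avoids pulling in cold skb cache lines while the receive queue lock is held.
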
+
 /* fully reclaim rmem/fwd memory allocated for skb */
-static void udp_rmem_release(struct sock *sk, int size, int partial)
+static void udp_rmem_release(struct sock *sk, int size, int partial,
+                            bool rx_queue_lock_held)
 {
        struct udp_sock *up = udp_sk(sk);
+       struct sk_buff_head *sk_queue;
        int amt;
 
        if (likely(partial)) {
                up->forward_deficit += size;
                size = up->forward_deficit;
                if (size < (sk->sk_rcvbuf >> 2) &&
-                   !skb_queue_empty(&sk->sk_receive_queue))
+                   !skb_queue_empty(&up->reader_queue))
                        return;
        } else {
                size += up->forward_deficit;
        }
        up->forward_deficit = 0;
 
+       /* acquire the sk_receive_queue for fwd allocated memory scheduling,
+        * if the caller doesn't hold it already
+        */
+       sk_queue = &sk->sk_receive_queue;
+       if (!rx_queue_lock_held)
+               spin_lock(&sk_queue->lock);
+
+
        sk->sk_forward_alloc += size;
        amt = (sk->sk_forward_alloc - partial) & ~(SK_MEM_QUANTUM - 1);
        sk->sk_forward_alloc -= amt;
@@ -1188,19 +1225,33 @@ static void udp_rmem_release(struct sock *sk, int size, int partial)
                __sk_mem_reduce_allocated(sk, amt >> SK_MEM_QUANTUM_SHIFT);
 
        atomic_sub(size, &sk->sk_rmem_alloc);
+
+       /* this can save us from acquiring the rx queue lock on next receive */
+       skb_queue_splice_tail_init(sk_queue, &up->reader_queue);
+
+       if (!rx_queue_lock_held)
+               spin_unlock(&sk_queue->lock);
 }
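
A note on the arithmetic just above: memory is returned to the protocol accounting only in whole SK_MEM_QUANTUM (page-sized) chunks, rounded down, while the sub-quantum remainder plus the 'partial' slack stays cached in sk_forward_alloc for the next packet. A minimal sketch with illustrative numbers (the helper name is made up for the example):

/* Bytes handed back to __sk_mem_reduce_allocated() by the code above.
 * Example, assuming 4K pages: forward_alloc = 10000, partial = 1
 *   -> (10000 - 1) & ~4095 == 8192 bytes released,
 *      1808 bytes stay cached on the socket.
 */
static int udp_reclaimable_bytes(int forward_alloc, int partial)
{
        return (forward_alloc - partial) & ~(SK_MEM_QUANTUM - 1);
}
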
 
-/* Note: called with sk_receive_queue.lock held.
+/* Note: called with reader_queue.lock held.
  * Instead of using skb->truesize here, find a copy of it in skb->dev_scratch
  * This avoids a cache line miss while receive_queue lock is held.
  * Look at __udp_enqueue_schedule_skb() to find where this copy is done.
  */
 void udp_skb_destructor(struct sock *sk, struct sk_buff *skb)
 {
-       udp_rmem_release(sk, skb->dev_scratch, 1);
+       prefetch(&skb->data);
+       udp_rmem_release(sk, udp_skb_truesize(skb), 1, false);
 }
 EXPORT_SYMBOL(udp_skb_destructor);
 
+/* as above, but the caller holds the rx queue lock, too */
+static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb)
+{
+       prefetch(&skb->data);
+       udp_rmem_release(sk, udp_skb_truesize(skb), 1, true);
+}
+
 /* Idea of busylocks is to let producers grab an extra spinlock
  * to relieve pressure on the receive_queue spinlock shared by consumer.
  * Under flood, this means that only one producer can be in line
@@ -1252,10 +1303,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
                busy = busylock_acquire(sk);
        }
        size = skb->truesize;
-       /* Copy skb->truesize into skb->dev_scratch to avoid a cache line miss
-        * in udp_skb_destructor()
-        */
-       skb->dev_scratch = size;
+       udp_set_dev_scratch(skb);
 
        /* we drop only if the receive buf is full and the receive
         * queue contains some other skb
@@ -1306,14 +1354,16 @@ EXPORT_SYMBOL_GPL(__udp_enqueue_schedule_skb);
 void udp_destruct_sock(struct sock *sk)
 {
        /* completely reclaim the forward allocated memory */
+       struct udp_sock *up = udp_sk(sk);
        unsigned int total = 0;
        struct sk_buff *skb;
 
-       while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+       skb_queue_splice_tail_init(&sk->sk_receive_queue, &up->reader_queue);
+       while ((skb = __skb_dequeue(&up->reader_queue)) != NULL) {
                total += skb->truesize;
                kfree_skb(skb);
        }
-       udp_rmem_release(sk, total, 0);
+       udp_rmem_release(sk, total, 0, true);
 
        inet_sock_destruct(sk);
 }
@@ -1321,6 +1371,7 @@ EXPORT_SYMBOL_GPL(udp_destruct_sock);
 
 int udp_init_sock(struct sock *sk)
 {
+       skb_queue_head_init(&udp_sk(sk)->reader_queue);
        sk->sk_destruct = udp_destruct_sock;
        return 0;
 }
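
udp_init_sock() can initialise up->reader_queue because a companion change to include/linux/udp.h adds a second queue to struct udp_sock. A sketch of the assumed layout (exact field placement and alignment annotations may differ):

struct udp_sock {
        struct inet_sock inet;  /* must stay the first member */
        /* ... existing fields unchanged ... */

        /* Packets are spliced here, in batches, from sk_receive_queue and
         * then consumed with the socket owned, so recvmsg() mostly stays
         * off the producer-contended sk_receive_queue.lock.
         */
        struct sk_buff_head     reader_queue ____cacheline_aligned_in_smp;
};
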
@@ -1334,10 +1385,43 @@ void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len)
                sk_peek_offset_bwd(sk, len);
                unlock_sock_fast(sk, slow);
        }
-       consume_skb(skb);
+
+       /* In the more common cases we cleared the head states previously,
+        * see __udp_queue_rcv_skb().
+        */
+       if (unlikely(udp_skb_has_head_state(skb)))
+               skb_release_head_state(skb);
+       consume_stateless_skb(skb);
 }
 EXPORT_SYMBOL_GPL(skb_consume_udp);
 
+static struct sk_buff *__first_packet_length(struct sock *sk,
+                                            struct sk_buff_head *rcvq,
+                                            int *total)
+{
+       struct sk_buff *skb;
+
+       while ((skb = skb_peek(rcvq)) != NULL) {
+               if (udp_lib_checksum_complete(skb)) {
+                       __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS,
+                                       IS_UDPLITE(sk));
+                       __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS,
+                                       IS_UDPLITE(sk));
+                       atomic_inc(&sk->sk_drops);
+                       __skb_unlink(skb, rcvq);
+                       *total += skb->truesize;
+                       kfree_skb(skb);
+               } else {
+                       /* the csum related bits may have changed; refresh
+                        * the scratch area
+                        */
+                       udp_set_dev_scratch(skb);
+                       break;
+               }
+       }
+       return skb;
+}
+
 /**
  *     first_packet_length     - return length of first packet in receive queue
  *     @sk: socket
@@ -1347,26 +1431,24 @@ EXPORT_SYMBOL_GPL(skb_consume_udp);
  */
 static int first_packet_length(struct sock *sk)
 {
-       struct sk_buff_head *rcvq = &sk->sk_receive_queue;
+       struct sk_buff_head *rcvq = &udp_sk(sk)->reader_queue;
+       struct sk_buff_head *sk_queue = &sk->sk_receive_queue;
        struct sk_buff *skb;
        int total = 0;
        int res;
 
        spin_lock_bh(&rcvq->lock);
-       while ((skb = skb_peek(rcvq)) != NULL &&
-               udp_lib_checksum_complete(skb)) {
-               __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS,
-                               IS_UDPLITE(sk));
-               __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS,
-                               IS_UDPLITE(sk));
-               atomic_inc(&sk->sk_drops);
-               __skb_unlink(skb, rcvq);
-               total += skb->truesize;
-               kfree_skb(skb);
+       skb = __first_packet_length(sk, rcvq, &total);
+       if (!skb && !skb_queue_empty(sk_queue)) {
+               spin_lock(&sk_queue->lock);
+               skb_queue_splice_tail_init(sk_queue, rcvq);
+               spin_unlock(&sk_queue->lock);
+
+               skb = __first_packet_length(sk, rcvq, &total);
        }
        res = skb ? skb->len : -1;
        if (total)
-               udp_rmem_release(sk, total, 1);
+               udp_rmem_release(sk, total, 1, false);
        spin_unlock_bh(&rcvq->lock);
        return res;
 }
@@ -1400,6 +1482,77 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 }
 EXPORT_SYMBOL(udp_ioctl);
 
+struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
+                              int noblock, int *peeked, int *off, int *err)
+{
+       struct sk_buff_head *sk_queue = &sk->sk_receive_queue;
+       struct sk_buff_head *queue;
+       struct sk_buff *last;
+       long timeo;
+       int error;
+
+       queue = &udp_sk(sk)->reader_queue;
+       flags |= noblock ? MSG_DONTWAIT : 0;
+       timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+       do {
+               struct sk_buff *skb;
+
+               error = sock_error(sk);
+               if (error)
+                       break;
+
+               error = -EAGAIN;
+               *peeked = 0;
+               do {
+                       spin_lock_bh(&queue->lock);
+                       skb = __skb_try_recv_from_queue(sk, queue, flags,
+                                                       udp_skb_destructor,
+                                                       peeked, off, err,
+                                                       &last);
+                       if (skb) {
+                               spin_unlock_bh(&queue->lock);
+                               return skb;
+                       }
+
+                       if (skb_queue_empty(sk_queue)) {
+                               spin_unlock_bh(&queue->lock);
+                               goto busy_check;
+                       }
+
+                       /* refill the reader queue and walk it again;
+                        * keep both queues locked to avoid re-acquiring
+                        * the sk_receive_queue lock if fwd memory scheduling
+                        * is needed.
+                        */
+                       spin_lock(&sk_queue->lock);
+                       skb_queue_splice_tail_init(sk_queue, queue);
+
+                       skb = __skb_try_recv_from_queue(sk, queue, flags,
+                                                       udp_skb_dtor_locked,
+                                                       peeked, off, err,
+                                                       &last);
+                       spin_unlock(&sk_queue->lock);
+                       spin_unlock_bh(&queue->lock);
+                       if (skb)
+                               return skb;
+
+busy_check:
+                       if (!sk_can_busy_loop(sk))
+                               break;
+
+                       sk_busy_loop(sk, flags & MSG_DONTWAIT);
+               } while (!skb_queue_empty(sk_queue));
+
+               /* sk_queue is empty, reader_queue may contain peeked packets */
+       } while (timeo &&
+                !__skb_wait_for_more_packets(sk, &error, &timeo,
+                                             (struct sk_buff *)sk_queue));
+
+       *err = error;
+       return NULL;
+}
+EXPORT_SYMBOL_GPL(__skb_recv_udp);
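
From a caller's point of view the two-queue machinery is invisible: __skb_recv_udp() behaves like the old __skb_recv_datagram() call, returning one datagram or NULL with *err set. A hedged usage sketch (the wrapper name is illustrative; the real udp_recvmsg() call site differs in details):

/* Illustrative only: dequeue a single datagram, honouring MSG_PEEK and
 * the peek offset, MSG_DONTWAIT and the socket receive timeout.
 */
static struct sk_buff *udp_dequeue_one(struct sock *sk, unsigned int flags,
                                       int noblock, int *off, int *err)
{
        int peeked;

        return __skb_recv_udp(sk, flags, noblock, &peeked, off, err);
}
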
+
 /*
  *     This should be easy, if there is something there we
  *     return it, otherwise we block.
@@ -1426,7 +1579,7 @@ try_again:
        if (!skb)
                return err;
 
-       ulen = skb->len;
+       ulen = udp_skb_len(skb);
        copied = len;
        if (copied > ulen - off)
                copied = ulen - off;
@@ -1441,14 +1594,18 @@ try_again:
 
        if (copied < ulen || peeking ||
            (is_udplite && UDP_SKB_CB(skb)->partial_cov)) {
-               checksum_valid = !udp_lib_checksum_complete(skb);
+               checksum_valid = udp_skb_csum_unnecessary(skb) ||
+                               !__udp_lib_checksum_complete(skb);
                if (!checksum_valid)
                        goto csum_copy_err;
        }
 
-       if (checksum_valid || skb_csum_unnecessary(skb))
-               err = skb_copy_datagram_msg(skb, off, msg, copied);
-       else {
+       if (checksum_valid || udp_skb_csum_unnecessary(skb)) {
+               if (udp_skb_is_linear(skb))
+                       err = copy_linear_skb(skb, copied, off, &msg->msg_iter);
+               else
+                       err = skb_copy_datagram_msg(skb, off, msg, copied);
+       } else {
                err = skb_copy_and_csum_datagram_msg(skb, off, msg);
 
                if (err == -EINVAL)
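
The new linear branch above leans on a small helper that copies straight out of skb->data instead of walking the fragment list; a sketch close to the copy_linear_skb() this series adds to include/net/udp.h (treat the exact error handling as an assumption):

static inline int copy_linear_skb(struct sk_buff *skb, int len, int off,
                                  struct iov_iter *to)
{
        int n;

        n = copy_to_iter(skb->data + off, len, to);
        if (n == len)
                return 0;

        /* short copy to userspace: rewind the iterator and let the
         * caller report the fault
         */
        iov_iter_revert(to, n);
        return -EFAULT;
}
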
@@ -1490,7 +1647,8 @@ try_again:
        return err;
 
 csum_copy_err:
-       if (!__sk_queue_drop_skb(sk, skb, flags, udp_skb_destructor)) {
+       if (!__sk_queue_drop_skb(sk, &udp_sk(sk)->reader_queue, skb, flags,
+                                udp_skb_destructor)) {
                UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
                UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
        }
@@ -1624,6 +1782,13 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
                sk_mark_napi_id_once(sk, skb);
        }
 
+       /* At recvmsg() time we may access skb->dst or skb->sp depending on
+        * the IP options and the cmsg flags; otherwise we can clear all
+        * pending head states while they are hot in the cache.
+        */
+       if (likely(IPCB(skb)->opt.optlen == 0 && !skb->sp))
+               skb_release_head_state(skb);
+
        rc = __udp_enqueue_schedule_skb(sk, skb);
        if (rc < 0) {
                int is_udplite = IS_UDPLITE(sk);
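
"Head state" here is everything skb_release_head_state() tears down: the dst reference, the xfrm secpath, conntrack state and any skb destructor. Dropping it at enqueue time, while those fields are still hot in the cache, is the optimisation; the guard on opt.optlen and skb->sp keeps the state that IP options cmsgs and IP_CMSG_PASSSEC still need at recvmsg() time, which is the point of the commit named in the page title. A simplified sketch of what gets released (config-dependent parts omitted; see net/core/skbuff.c for the real thing):

void skb_release_head_state(struct sk_buff *skb)
{
        skb_dst_drop(skb);              /* route/dst reference */
        secpath_reset(skb);             /* xfrm security path */
        if (skb->destructor)
                skb->destructor(skb);   /* e.g. socket accounting */
        /* conntrack and bridge-netfilter references are put here as
         * well when those options are enabled
         */
}
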
@@ -1738,6 +1903,7 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
                }
        }
 
+       prefetch(&sk->sk_rmem_alloc);
        if (rcu_access_pointer(sk->sk_filter) &&
            udp_lib_checksum_complete(skb))
                        goto csum_error;
@@ -1766,9 +1932,10 @@ static void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
 {
        struct dst_entry *old;
 
-       dst_hold(dst);
-       old = xchg(&sk->sk_rx_dst, dst);
-       dst_release(old);
+       if (dst_hold_safe(dst)) {
+               old = xchg(&sk->sk_rx_dst, dst);
+               dst_release(old);
+       }
 }
 
 /*
@@ -2082,7 +2249,7 @@ void udp_v4_early_demux(struct sk_buff *skb)
                                             uh->source, iph->saddr, dif);
        }
 
-       if (!sk || !atomic_inc_not_zero_hint(&sk->sk_refcnt, 2))
+       if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt))
                return;
 
        skb->sk = sk;
@@ -2092,13 +2259,11 @@ void udp_v4_early_demux(struct sk_buff *skb)
        if (dst)
                dst = dst_check(dst, 0);
        if (dst) {
-               /* DST_NOCACHE can not be used without taking a reference */
-               if (dst->flags & DST_NOCACHE) {
-                       if (likely(atomic_inc_not_zero(&dst->__refcnt)))
-                               skb_dst_set(skb, dst);
-               } else {
-                       skb_dst_set_noref(skb, dst);
-               }
+               /* set noref for now.
+                * any place which wants to hold dst has to call
+                * dst_hold_safe()
+                */
+               skb_dst_set_noref(skb, dst);
        }
 }
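
With early demux now always attaching the dst noref, anything that wants to keep the route beyond the current receive path has to take its own reference, exactly as udp_sk_rx_dst_set() above does. A minimal sketch of the pattern (the helper name is illustrative):

/* Illustrative only: promote a noref dst from the skb into a long-lived
 * per-socket cache. dst_hold_safe() fails if the dst is already on its
 * way out, in which case we simply skip caching it.
 */
static void cache_rx_dst(struct sock *sk, struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);

        if (dst && dst_hold_safe(dst))
                dst_release(xchg(&sk->sk_rx_dst, dst));
}
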
 
@@ -2325,6 +2490,9 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
        unsigned int mask = datagram_poll(file, sock, wait);
        struct sock *sk = sock->sk;
 
+       if (!skb_queue_empty(&udp_sk(sk)->reader_queue))
+               mask |= POLLIN | POLLRDNORM;
+
        sock_rps_record_flow(sk);
 
        /* Check for false positives due to checksum errors */
@@ -2530,7 +2698,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,
                0, 0L, 0,
                from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
                0, sock_i_ino(sp),
-               atomic_read(&sp->sk_refcnt), sp,
+               refcount_read(&sp->sk_refcnt), sp,
                atomic_read(&sp->sk_drops));
 }