tcp: limit GSO packets to half cwnd

[karo-tx-linux.git] / net / ipv4 / tcp_output.c
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c

index 8d4eac79370057907e7f6031ba1fedbf120227dd..eb73a1dccf56b823a45c0ca034e40dc50fc48068 100644 (file)
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -333,10 +333,19 @@ static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
  static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
  {
         struct tcp_sock *tp = tcp_sk(sk);
+       bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
+                      tcp_ca_needs_ecn(sk);
+
+       if (!use_ecn) {
+               const struct dst_entry *dst = __sk_dst_get(sk);
+
+               if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
+                       use_ecn = true;
+       }
  
         tp->ecn_flags = 0;
-       if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
-           tcp_ca_needs_ecn(sk)) {
+
+       if (use_ecn) {
                 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
                 tp->ecn_flags = TCP_ECN_OK;
                 if (tcp_ca_needs_ecn(sk))
@@ -839,26 +848,38 @@ void tcp_wfree(struct sk_buff *skb)
  {
         struct sock *sk = skb->sk;
         struct tcp_sock *tp = tcp_sk(sk);
+       int wmem;
+
+       /* Keep one reference on sk_wmem_alloc.
+        * Will be released by sk_free() from here or tcp_tasklet_func()
+        */
+       wmem = atomic_sub_return(skb->truesize - 1, &sk->sk_wmem_alloc);
+
+       /* If this softirq is serviced by ksoftirqd, we are likely under stress.
+        * Wait until our queues (qdisc + devices) are drained.
+        * This gives:
+        * - fewer callbacks to tcp_write_xmit(), reducing stress (batches)
+        * - a chance for an incoming ACK (maybe processed by another cpu)
+        *   to migrate this flow (skb->ooo_okay will eventually be set)
+        */
+       if (wmem >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
+               goto out;
  
         if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) &&
             !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) {
                 unsigned long flags;
                 struct tsq_tasklet *tsq;
  
-               /* Keep a ref on socket.
-                * This last ref will be released in tcp_tasklet_func()
-                */
-               atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc);
-
                 /* queue this socket to tasklet queue */
                 local_irq_save(flags);
-               tsq = &__get_cpu_var(tsq_tasklet);
+               tsq = this_cpu_ptr(&tsq_tasklet);
                 list_add(&tp->tsq_node, &tsq->head);
                 tasklet_schedule(&tsq->tasklet);
                 local_irq_restore(flags);
-       } else {
-               sock_wfree(skb);
+               return;
         }
+out:
+       sk_free(sk);
  }
  
  /* This routine actually transmits TCP packets queued in by
@@ -914,9 +935,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
                 tcp_ca_event(sk, CA_EVENT_TX_START);
  
         /* if no packet is in qdisc/device queue, then allow XPS to select
-        * another queue.
+        * another queue. We can be called from tcp_tsq_handler()
+        * which holds one reference to sk_wmem_alloc.
+        *
+        * TODO: Ideally, in-flight pure ACK packets should not matter here.
+        * One way to get this would be to set skb->truesize = 2 on them.
          */
-       skb->ooo_okay = sk_wmem_alloc_get(sk) == 0;
+       skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1);
  
         skb_push(skb, tcp_header_size);
         skb_reset_transport_header(skb);
@@ -1537,7 +1562,7 @@ static unsigned int tcp_mss_split_point(const struct sock *sk,
  static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
                                          const struct sk_buff *skb)
  {
-       u32 in_flight, cwnd;
+       u32 in_flight, cwnd, halfcwnd;
  
         /* Don't be strict about the congestion window for the final FIN.  */
         if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
@@ -1546,10 +1571,14 @@ static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
  
         in_flight = tcp_packets_in_flight(tp);
         cwnd = tp->snd_cwnd;
-       if (in_flight < cwnd)
-               return (cwnd - in_flight);
+       if (in_flight >= cwnd)
+               return 0;
  
-       return 0;
+       /* For better scheduling, ensure we have at least
+        * 2 GSO packets in flight.
+        */
+       halfcwnd = max(cwnd >> 1, 1U);
+       return min(halfcwnd, cwnd - in_flight);
  }
  
  /* Initialize TSO state of a skb.
@@ -2110,7 +2139,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
  static bool skb_still_in_host_queue(const struct sock *sk,
                                     const struct sk_buff *skb)
  {
-       if (unlikely(skb_fclone_busy(skb))) {
+       if (unlikely(skb_fclone_busy(sk, skb))) {
                 NET_INC_STATS_BH(sock_net(sk),
                                  LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
                 return true;