tcp: remove poll() flakes when receiving RST
net/ipv4/tcp_input.c
index c43119726a62e494063fd940001b483215d0fe26..37e2aa925f62395cfb48145cd3a76b6afebb64b1 100644
@@ -126,7 +126,8 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
 #define REXMIT_LOST    1 /* retransmit packets marked lost */
 #define REXMIT_NEW     2 /* FRTO-style transmit of unsent/new packets */
 
-static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb)
+static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb,
+                            unsigned int len)
 {
        static bool __once __read_mostly;
 
@@ -137,8 +138,9 @@ static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb)
 
                rcu_read_lock();
                dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif);
-               pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n",
-                       dev ? dev->name : "Unknown driver");
+               if (!dev || len >= dev->mtu)
+                       pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n",
+                               dev ? dev->name : "Unknown driver");
                rcu_read_unlock();
        }
 }
@@ -161,8 +163,10 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
        if (len >= icsk->icsk_ack.rcv_mss) {
                icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
                                               tcp_sk(sk)->advmss);
-               if (unlikely(icsk->icsk_ack.rcv_mss != len))
-                       tcp_gro_dev_warn(sk, skb);
+               /* Account for possibly-removed options */
+               if (unlikely(len > icsk->icsk_ack.rcv_mss +
+                                  MAX_TCP_OPTION_SPACE))
+                       tcp_gro_dev_warn(sk, skb, len);
        } else {
                /* Otherwise, we make more careful check taking into account,
                 * that SACKs block is variable.
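Note on the two hunks above: the "suspect GRO" warning is tightened so it only fires when the aggregated segment is at least one device MTU long, and the caller now tolerates up to MAX_TCP_OPTION_SPACE (40 bytes in the kernel) of slack so that options the peer stops sending mid-flow do not trigger it. A minimal userspace sketch of the same threshold test (the value 40 mirrors the kernel constant; the function itself is only illustrative, not kernel code):

#include <stdbool.h>
#include <stdio.h>

#define MAX_TCP_OPTION_SPACE 40 /* mirrors the kernel constant: max TCP option bytes */

/*
 * Illustrative restatement of the check in tcp_measure_rcv_mss()/
 * tcp_gro_dev_warn(): warn only if the received aggregate exceeds the
 * cached rcv_mss by more than the option space AND spans at least one MTU.
 */
static bool suspect_gro(unsigned int len, unsigned int rcv_mss,
                        unsigned int dev_mtu)
{
        if (len <= rcv_mss + MAX_TCP_OPTION_SPACE)
                return false;   /* within option-removal slack: fine */
        return len >= dev_mtu;  /* sub-MTU aggregates are ignored */
}

int main(void)
{
        /* hypothetical numbers: advmss 1460, driver hands up 1448-byte skbs */
        printf("%d\n", suspect_gro(1448, 1460, 1500)); /* 0: no warning */
        printf("%d\n", suspect_gro(2896, 1460, 1500)); /* 1: suspect GRO */
        return 0;
}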
@@ -874,22 +878,11 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
                                  const int ts)
 {
        struct tcp_sock *tp = tcp_sk(sk);
-       if (metric > tp->reordering) {
-               int mib_idx;
+       int mib_idx;
 
+       if (metric > tp->reordering) {
                tp->reordering = min(sysctl_tcp_max_reordering, metric);
 
-               /* This exciting event is worth to be remembered. 8) */
-               if (ts)
-                       mib_idx = LINUX_MIB_TCPTSREORDER;
-               else if (tcp_is_reno(tp))
-                       mib_idx = LINUX_MIB_TCPRENOREORDER;
-               else if (tcp_is_fack(tp))
-                       mib_idx = LINUX_MIB_TCPFACKREORDER;
-               else
-                       mib_idx = LINUX_MIB_TCPSACKREORDER;
-
-               NET_INC_STATS(sock_net(sk), mib_idx);
 #if FASTRETRANS_DEBUG > 1
                pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
                         tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
@@ -902,6 +895,18 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
        }
 
        tp->rack.reord = 1;
+
+       /* This exciting event is worth to be remembered. 8) */
+       if (ts)
+               mib_idx = LINUX_MIB_TCPTSREORDER;
+       else if (tcp_is_reno(tp))
+               mib_idx = LINUX_MIB_TCPRENOREORDER;
+       else if (tcp_is_fack(tp))
+               mib_idx = LINUX_MIB_TCPFACKREORDER;
+       else
+               mib_idx = LINUX_MIB_TCPSACKREORDER;
+
+       NET_INC_STATS(sock_net(sk), mib_idx);
 }
 
 /* This must be called before lost_out is incremented */
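The reordering hunks above move the SNMP accounting out of the "metric > tp->reordering" branch so every detected reordering event is counted, not only those that grow the reordering metric, and tp->rack.reord is set unconditionally. The counter still follows the same precedence (timestamp, then Reno, FACK, SACK); a compact userspace restatement, purely illustrative:

#include <stdio.h>

/* Illustrative precedence of the reordering counters bumped above. */
enum reorder_mib { TS_REORDER, RENO_REORDER, FACK_REORDER, SACK_REORDER };

static enum reorder_mib pick_reorder_mib(int ts, int is_reno, int is_fack)
{
        if (ts)
                return TS_REORDER;      /* detected via timestamps */
        if (is_reno)
                return RENO_REORDER;    /* no SACK: Reno-style detection */
        if (is_fack)
                return FACK_REORDER;    /* FACK-based detection */
        return SACK_REORDER;            /* plain SACK-based detection */
}

int main(void)
{
        printf("%d\n", pick_reorder_mib(0, 0, 1)); /* -> FACK_REORDER */
        return 0;
}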
@@ -1930,6 +1935,7 @@ void tcp_enter_loss(struct sock *sk)
        struct tcp_sock *tp = tcp_sk(sk);
        struct net *net = sock_net(sk);
        struct sk_buff *skb;
+       bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
        bool is_reneg;                  /* is receiver reneging on SACKs? */
        bool mark_lost;
 
@@ -1989,15 +1995,18 @@ void tcp_enter_loss(struct sock *sk)
        tp->high_seq = tp->snd_nxt;
        tcp_ecn_queue_cwr(tp);
 
-       /* F-RTO RFC5682 sec 3.1 step 1 mandates to disable F-RTO
-        * if a previous recovery is underway, otherwise it may incorrectly
-        * call a timeout spurious if some previously retransmitted packets
-        * are s/acked (sec 3.2). We do not apply that retriction since
-        * retransmitted skbs are permanently tagged with TCPCB_EVER_RETRANS
-        * so FLAG_ORIG_SACK_ACKED is always correct. But we do disable F-RTO
-        * on PTMU discovery to avoid sending new data.
+       /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
+        * loss recovery is underway except recurring timeout(s) on
+        * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
+        *
+        * In theory F-RTO can be used repeatedly during loss recovery.
+        * In practice this interacts badly with broken middle-boxes that
+        * falsely raise the receive window, which results in repeated
+        * timeouts and stop-and-go behavior.
         */
-       tp->frto = sysctl_tcp_frto && !inet_csk(sk)->icsk_mtup.probe_size;
+       tp->frto = sysctl_tcp_frto &&
+                  (new_recovery || icsk->icsk_retransmits) &&
+                  !inet_csk(sk)->icsk_mtup.probe_size;
 }
 
 /* If ACK arrived pointing to a remembered SACK, it means that our
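The rewritten comment and condition in tcp_enter_loss() enable F-RTO only when entering a fresh loss recovery or retrying an RTO on the same SND.UNA, and never while a path-MTU probe is outstanding. A hedged userspace sketch of the resulting predicate (field names mimic the kernel ones, the function is illustrative only):

#include <stdbool.h>
#include <stdio.h>

/*
 * Illustrative restatement of the F-RTO gating added above: allow F-RTO
 * only if the sysctl permits it, the RTO starts a new recovery or repeats
 * on the same SND.UNA, and no path-MTU probe is in flight.
 */
static bool frto_enabled(bool sysctl_frto, bool new_recovery,
                         int retransmits, int mtu_probe_size)
{
        return sysctl_frto &&
               (new_recovery || retransmits) &&
               !mtu_probe_size;
}

int main(void)
{
        /* RTO fired while an older recovery is still open and no prior
         * retransmits on this SND.UNA: F-RTO stays off. */
        printf("%d\n", frto_enabled(true, false, 0, 0));
        /* Fresh recovery: F-RTO allowed. */
        printf("%d\n", frto_enabled(true, true, 0, 0));
        return 0;
}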
@@ -3759,11 +3768,12 @@ void tcp_parse_options(const struct sk_buff *skb,
                                    !estab && sysctl_tcp_window_scaling) {
                                        __u8 snd_wscale = *(__u8 *)ptr;
                                        opt_rx->wscale_ok = 1;
-                                       if (snd_wscale > 14) {
-                                               net_info_ratelimited("%s: Illegal window scaling value %d >14 received\n",
+                                       if (snd_wscale > TCP_MAX_WSCALE) {
+                                               net_info_ratelimited("%s: Illegal window scaling value %d > %u received\n",
                                                                     __func__,
-                                                                    snd_wscale);
-                                               snd_wscale = 14;
+                                                                    snd_wscale,
+                                                                    TCP_MAX_WSCALE);
+                                               snd_wscale = TCP_MAX_WSCALE;
                                        }
                                        opt_rx->snd_wscale = snd_wscale;
                                }
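The hunk above replaces the hard-coded 14 with TCP_MAX_WSCALE (still 14, the largest shift count RFC 7323 allows). A small userspace sketch of the same clamp applied to a received window-scale option (illustrative only):

#include <stdio.h>

#define TCP_MAX_WSCALE 14 /* RFC 7323: shift counts above 14 must be clamped */

/* Clamp a received window-scale shift the way tcp_parse_options() does. */
static unsigned char clamp_wscale(unsigned char snd_wscale)
{
        if (snd_wscale > TCP_MAX_WSCALE) {
                fprintf(stderr,
                        "illegal window scaling value %u > %u received\n",
                        snd_wscale, TCP_MAX_WSCALE);
                snd_wscale = TCP_MAX_WSCALE;
        }
        return snd_wscale;
}

int main(void)
{
        printf("%d\n", clamp_wscale(7));   /* passes through */
        printf("%d\n", clamp_wscale(200)); /* clamped to 14 */
        return 0;
}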
@@ -3998,10 +4008,10 @@ void tcp_reset(struct sock *sk)
        /* This barrier is coupled with smp_rmb() in tcp_poll() */
        smp_wmb();
 
+       tcp_done(sk);
+
        if (!sock_flag(sk, SOCK_DEAD))
                sk->sk_error_report(sk);
-
-       tcp_done(sk);
 }
 
 /*
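This is the hunk the commit subject refers to: tcp_done() now runs before sk_error_report(), so by the time the wakeup reaches a poller the socket state and sk_err are already final, and poll() can no longer return a transient mask without POLLERR/POLLHUP when a RST arrives. A self-contained userspace demo of what a poller should observe on RST (loopback socket pair, RST forced with SO_LINGER zero; this shows expected behaviour, not kernel code, and error handling is omitted for brevity):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        struct sockaddr_in addr;
        socklen_t alen = sizeof(addr);
        struct linger lg = { .l_onoff = 1, .l_linger = 0 };
        struct pollfd pfd;
        int lfd, cfd, sfd;

        /* Listener on an ephemeral loopback port. */
        lfd = socket(AF_INET, SOCK_STREAM, 0);
        memset(&addr, 0, sizeof(addr));
        addr.sin_family = AF_INET;
        addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
        bind(lfd, (struct sockaddr *)&addr, sizeof(addr));
        getsockname(lfd, (struct sockaddr *)&addr, &alen);
        listen(lfd, 1);

        /* Connect a client and accept the peer side. */
        cfd = socket(AF_INET, SOCK_STREAM, 0);
        connect(cfd, (struct sockaddr *)&addr, sizeof(addr));
        sfd = accept(lfd, NULL, NULL);

        /* SO_LINGER with a zero timeout makes close() send a RST. */
        setsockopt(sfd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
        close(sfd);

        /* Poll the client; once the RST has been processed the mask
         * should contain POLLERR and POLLHUP (POLLIN may be set too). */
        pfd.fd = cfd;
        pfd.events = POLLIN;
        poll(&pfd, 1, 1000);
        printf("revents: IN=%d ERR=%d HUP=%d\n",
               !!(pfd.revents & POLLIN),
               !!(pfd.revents & POLLERR),
               !!(pfd.revents & POLLHUP));

        close(cfd);
        close(lfd);
        return 0;
}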
@@ -6324,36 +6334,14 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
                goto drop_and_free;
 
        if (isn && tmp_opt.tstamp_ok)
-               af_ops->init_seq(skb, &tcp_rsk(req)->ts_off);
+               af_ops->init_seq_tsoff(skb, &tcp_rsk(req)->ts_off);
 
        if (!want_cookie && !isn) {
-               /* VJ's idea. We save last timestamp seen
-                * from the destination in peer table, when entering
-                * state TIME-WAIT, and check against it before
-                * accepting new connection request.
-                *
-                * If "isn" is not zero, this request hit alive
-                * timewait bucket, so that all the necessary checks
-                * are made in the function processing timewait state.
-                */
-               if (net->ipv4.tcp_death_row.sysctl_tw_recycle) {
-                       bool strict;
-
-                       dst = af_ops->route_req(sk, &fl, req, &strict);
-
-                       if (dst && strict &&
-                           !tcp_peer_is_proven(req, dst, true,
-                                               tmp_opt.saw_tstamp)) {
-                               NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
-                               goto drop_and_release;
-                       }
-               }
                /* Kill the following clause, if you dislike this way. */
-               else if (!net->ipv4.sysctl_tcp_syncookies &&
-                        (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
-                         (net->ipv4.sysctl_max_syn_backlog >> 2)) &&
-                        !tcp_peer_is_proven(req, dst, false,
-                                            tmp_opt.saw_tstamp)) {
+               if (!net->ipv4.sysctl_tcp_syncookies &&
+                   (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
+                    (net->ipv4.sysctl_max_syn_backlog >> 2)) &&
+                   !tcp_peer_is_proven(req, dst)) {
                        /* Without syncookies last quarter of
                         * backlog is filled with destinations,
                         * proven to be alive.
@@ -6366,10 +6354,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
                        goto drop_and_release;
                }
 
-               isn = af_ops->init_seq(skb, &tcp_rsk(req)->ts_off);
+               isn = af_ops->init_seq_tsoff(skb, &tcp_rsk(req)->ts_off);
        }
        if (!dst) {
-               dst = af_ops->route_req(sk, &fl, req, NULL);
+               dst = af_ops->route_req(sk, &fl, req);
                if (!dst)
                        goto drop_and_free;
        }
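The final hunks drop the tcp_tw_recycle handling removed elsewhere in this series, so tcp_peer_is_proven() no longer needs the PAWS/timestamp arguments, route_req() loses its strict flag, and init_seq() becomes init_seq_tsoff(). What remains is the backlog-pressure test: with syncookies off and the SYN backlog more than three-quarters full, only destinations previously proven alive are admitted. An illustrative userspace restatement of that predicate (the function and parameter names are made up for the sketch):

#include <stdbool.h>
#include <stdio.h>

/*
 * Illustrative restatement of the check kept in tcp_conn_request():
 * when syncookies are disabled and less than a quarter of the SYN
 * backlog remains, drop requests from unproven peers.
 */
static bool drop_unproven_syn(bool syncookies, int max_backlog,
                              int queue_len, bool peer_proven)
{
        return !syncookies &&
               (max_backlog - queue_len < (max_backlog >> 2)) &&
               !peer_proven;
}

int main(void)
{
        /* 1024-entry backlog, 900 already queued, peer unknown: drop. */
        printf("%d\n", drop_unproven_syn(false, 1024, 900, false));
        /* Same pressure, but the peer was recently proven alive: accept. */
        printf("%d\n", drop_unproven_syn(false, 1024, 900, true));
        return 0;
}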