tcp: RFC7413 option support for Fast Open client
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index a2a796c5536b032264e2a71f596f673e8307f25c..e662d85d1635d0269b669bb0f726760be3bae0d2 100644
@@ -518,17 +518,26 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
 
        if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
                struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
+               u8 *p = (u8 *)ptr;
+               u32 len; /* Fast Open option length */
+
+               if (foc->exp) {
+                       len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
+                       *ptr = htonl((TCPOPT_EXP << 24) | (len << 16) |
+                                    TCPOPT_FASTOPEN_MAGIC);
+                       p += TCPOLEN_EXP_FASTOPEN_BASE;
+               } else {
+                       len = TCPOLEN_FASTOPEN_BASE + foc->len;
+                       *p++ = TCPOPT_FASTOPEN;
+                       *p++ = len;
+               }
 
-               *ptr++ = htonl((TCPOPT_EXP << 24) |
-                              ((TCPOLEN_EXP_FASTOPEN_BASE + foc->len) << 16) |
-                              TCPOPT_FASTOPEN_MAGIC);
-
-               memcpy(ptr, foc->val, foc->len);
-               if ((foc->len & 3) == 2) {
-                       u8 *align = ((u8 *)ptr) + foc->len;
-                       align[0] = align[1] = TCPOPT_NOP;
+               memcpy(p, foc->val, foc->len);
+               if ((len & 3) == 2) {
+                       p[foc->len] = TCPOPT_NOP;
+                       p[foc->len + 1] = TCPOPT_NOP;
                }
-               ptr += (foc->len + 3) >> 2;
+               ptr += (len + 3) >> 2;
        }
 }
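
The two branches above emit the same cookie in different framings: the RFC 7413 option is just kind 34 plus a length, while the pre-standard encoding wraps the cookie in the experimental option (kind 254) with the 0xF989 magic. A minimal userspace sketch of both layouts, with the option constants mirroring include/net/tcp.h:

    #include <stdint.h>
    #include <string.h>
    #include <arpa/inet.h>

    #define TCPOPT_NOP                  1
    #define TCPOPT_FASTOPEN            34      /* RFC 7413 option kind */
    #define TCPOPT_EXP                254      /* experimental option kind */
    #define TCPOPT_FASTOPEN_MAGIC  0xF989
    #define TCPOLEN_FASTOPEN_BASE       2      /* kind + len */
    #define TCPOLEN_EXP_FASTOPEN_BASE   4      /* kind + len + 16-bit magic */

    /* Write a Fast Open cookie option at p; returns the bytes consumed,
     * rounded up to a 32-bit boundary. Cookie lengths are even, so
     * len % 4 is only ever 0 or 2. */
    static size_t write_fastopen_option(uint8_t *p, const uint8_t *cookie,
                                        uint8_t clen, int exp)
    {
        uint32_t len;

        if (exp) {
            len = TCPOLEN_EXP_FASTOPEN_BASE + clen;
            uint32_t word = htonl((TCPOPT_EXP << 24) | (len << 16) |
                                  TCPOPT_FASTOPEN_MAGIC);
            memcpy(p, &word, sizeof(word));
            memcpy(p + TCPOLEN_EXP_FASTOPEN_BASE, cookie, clen);
        } else {
            len = TCPOLEN_FASTOPEN_BASE + clen;
            p[0] = TCPOPT_FASTOPEN;
            p[1] = len;
            memcpy(p + TCPOLEN_FASTOPEN_BASE, cookie, clen);
        }
        if ((len & 3) == 2) {                  /* pad the option list */
            p[len] = TCPOPT_NOP;
            p[len + 1] = TCPOPT_NOP;
        }
        return (len + 3) & ~3u;
    }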
 
@@ -565,7 +574,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
        opts->mss = tcp_advertise_mss(sk);
        remaining -= TCPOLEN_MSS_ALIGNED;
 
-       if (likely(sysctl_tcp_timestamps && *md5 == NULL)) {
+       if (likely(sysctl_tcp_timestamps && !*md5)) {
                opts->options |= OPTION_TS;
                opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
                opts->tsecr = tp->rx_opt.ts_recent;
@@ -583,13 +592,17 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
        }
 
        if (fastopen && fastopen->cookie.len >= 0) {
-               u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len;
+               u32 need = fastopen->cookie.len;
+
+               need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE :
+                                              TCPOLEN_FASTOPEN_BASE;
                need = (need + 3) & ~3U;  /* Align to 32 bits */
                if (remaining >= need) {
                        opts->options |= OPTION_FAST_OPEN_COOKIE;
                        opts->fastopen_cookie = &fastopen->cookie;
                        remaining -= need;
                        tp->syn_fastopen = 1;
+                       tp->syn_fastopen_exp = fastopen->cookie.exp ? 1 : 0;
                }
        }
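
For the common 8-byte cookie, the two encodings end up costing the same amount of option space once rounded: 2 + 8 = 10 bytes becomes 12, and 4 + 8 = 12 stays 12. A small sketch of the arithmetic (the same computation reappears in tcp_synack_options below):

    #include <stdio.h>
    #include <stdint.h>

    #define TCPOLEN_FASTOPEN_BASE      2   /* kind + len */
    #define TCPOLEN_EXP_FASTOPEN_BASE  4   /* kind + len + 16-bit magic */

    int main(void)
    {
        uint32_t cookie_len = 8;           /* the common cookie size */

        /* (need + 3) & ~3U rounds up to a multiple of 4, since TCP
         * option space is spent in 32-bit words. */
        uint32_t rfc = (TCPOLEN_FASTOPEN_BASE + cookie_len + 3) & ~3U;
        uint32_t exp = (TCPOLEN_EXP_FASTOPEN_BASE + cookie_len + 3) & ~3U;

        printf("rfc=%u exp=%u\n", rfc, exp);   /* prints: rfc=12 exp=12 */
        return 0;
    }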
 
@@ -601,15 +614,14 @@ static unsigned int tcp_synack_options(struct sock *sk,
                                   struct request_sock *req,
                                   unsigned int mss, struct sk_buff *skb,
                                   struct tcp_out_options *opts,
-                                  struct tcp_md5sig_key **md5,
+                                  const struct tcp_md5sig_key *md5,
                                   struct tcp_fastopen_cookie *foc)
 {
        struct inet_request_sock *ireq = inet_rsk(req);
        unsigned int remaining = MAX_TCP_OPTION_SPACE;
 
 #ifdef CONFIG_TCP_MD5SIG
-       *md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req);
-       if (*md5) {
+       if (md5) {
                opts->options |= OPTION_MD5;
                remaining -= TCPOLEN_MD5SIG_ALIGNED;
 
@@ -620,8 +632,6 @@ static unsigned int tcp_synack_options(struct sock *sk,
                 */
                ireq->tstamp_ok &= !ireq->sack_ok;
        }
-#else
-       *md5 = NULL;
 #endif
 
        /* We always send an MSS option. */
@@ -645,7 +655,10 @@ static unsigned int tcp_synack_options(struct sock *sk,
                        remaining -= TCPOLEN_SACKPERM_ALIGNED;
        }
        if (foc != NULL && foc->len >= 0) {
-               u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
+               u32 need = foc->len;
+
+               need += foc->exp ? TCPOLEN_EXP_FASTOPEN_BASE :
+                                  TCPOLEN_FASTOPEN_BASE;
                need = (need + 3) & ~3U;  /* Align to 32 bits */
                if (remaining >= need) {
                        opts->options |= OPTION_FAST_OPEN_COOKIE;
@@ -989,7 +1002,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
        if (md5) {
                sk_nocaps_add(sk, NETIF_F_GSO_MASK);
                tp->af_specific->calc_md5_hash(opts.hash_location,
-                                              md5, sk, NULL, skb);
+                                              md5, sk, skb);
        }
 #endif
 
@@ -1151,7 +1164,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
 
        /* Get a new skb... force flag on. */
        buff = sk_stream_alloc_skb(sk, nsize, gfp);
-       if (buff == NULL)
+       if (!buff)
                return -ENOMEM; /* We'll just try again later. */
 
        sk->sk_wmem_queued += buff->truesize;
@@ -1354,6 +1367,8 @@ void tcp_mtup_init(struct sock *sk)
                               icsk->icsk_af_ops->net_header_len;
        icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
        icsk->icsk_mtup.probe_size = 0;
+       if (icsk->icsk_mtup.enabled)
+               icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
 }
 EXPORT_SYMBOL(tcp_mtup_init);
 
@@ -1708,7 +1723,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
                return tcp_fragment(sk, skb, len, mss_now, gfp);
 
        buff = sk_stream_alloc_skb(sk, 0, gfp);
-       if (unlikely(buff == NULL))
+       if (unlikely(!buff))
                return -ENOMEM;
 
        sk->sk_wmem_queued += buff->truesize;
@@ -1752,20 +1767,23 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
                                 bool *is_cwnd_limited, u32 max_segs)
 {
-       struct tcp_sock *tp = tcp_sk(sk);
        const struct inet_connection_sock *icsk = inet_csk(sk);
-       u32 send_win, cong_win, limit, in_flight;
+       u32 age, send_win, cong_win, limit, in_flight;
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct skb_mstamp now;
+       struct sk_buff *head;
        int win_divisor;
 
        if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
                goto send_now;
 
-       if (icsk->icsk_ca_state != TCP_CA_Open)
+       if (!((1 << icsk->icsk_ca_state) & (TCPF_CA_Open | TCPF_CA_CWR)))
                goto send_now;
 
-       /* Defer for less than two clock ticks. */
-       if (tp->tso_deferred &&
-           (((u32)jiffies << 1) >> 1) - (tp->tso_deferred >> 1) > 1)
+       /* Avoid bursty behavior by allowing deferral only when the
+        * last write was recent.
+        */
+       if ((s32)(tcp_time_stamp - tp->lsndtime) > 0)
                goto send_now;
 
        in_flight = tcp_packets_in_flight(tp);
@@ -1807,11 +1825,14 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
                        goto send_now;
        }
 
-       /* Ok, it looks like it is advisable to defer.
-        * Do not rearm the timer if already set to not break TCP ACK clocking.
-        */
-       if (!tp->tso_deferred)
-               tp->tso_deferred = 1 | (jiffies << 1);
+       head = tcp_write_queue_head(sk);
+       skb_mstamp_get(&now);
+       age = skb_mstamp_us_delta(&now, &head->skb_mstamp);
+       /* If next ACK is likely to come too late (half srtt), do not defer */
+       if (age < (tp->srtt_us >> 4))
+               goto send_now;
+
+       /* Ok, it looks like it is advisable to defer. */
 
        if (cong_win < send_win && cong_win < skb->len)
                *is_cwnd_limited = true;
@@ -1819,10 +1840,34 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
        return true;
 
 send_now:
-       tp->tso_deferred = 0;
        return false;
 }
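
The age test above replaces the old jiffies-based tso_deferred bookkeeping. tp->srtt_us holds eight times the smoothed RTT in microseconds, so srtt_us >> 4 is half the RTT: if the oldest in-flight skb was sent less than half an RTT ago, its ACK is still more than half an RTT away, and deferring on it would stall the ACK clock. A sketch of the check (function and parameter names here are hypothetical):

    #include <stdbool.h>
    #include <stdint.h>

    /* srtt_us_shifted3 mirrors tp->srtt_us: 8 * srtt, in microseconds. */
    static bool next_ack_too_late(uint32_t age_us, uint32_t srtt_us_shifted3)
    {
        /* e.g. srtt = 40 ms: srtt_us_shifted3 = 320000 and the
         * threshold is 320000 >> 4 = 20000 us, i.e. srtt / 2 */
        return age_us < (srtt_us_shifted3 >> 4);
    }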
 
+static inline void tcp_mtu_check_reprobe(struct sock *sk)
+{
+       struct inet_connection_sock *icsk = inet_csk(sk);
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct net *net = sock_net(sk);
+       u32 interval;
+       s32 delta;
+
+       interval = net->ipv4.sysctl_tcp_probe_interval;
+       delta = tcp_time_stamp - icsk->icsk_mtup.probe_timestamp;
+       if (unlikely(delta >= interval * HZ)) {
+               int mss = tcp_current_mss(sk);
+
+               /* Update current search range */
+               icsk->icsk_mtup.probe_size = 0;
+               icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp +
+                       sizeof(struct tcphdr) +
+                       icsk->icsk_af_ops->net_header_len;
+               icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
+
+               /* Update probe time stamp */
+               icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
+       }
+}
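
A worked example of the test above, assuming the defaults introduced with this probing rework (net.ipv4.tcp_probe_interval = 600 seconds) and HZ = 1000: delta is measured in jiffies, so the search range is reset once 600 * 1000 jiffies, i.e. ten minutes, have passed since probe_timestamp. A sketch with hypothetical names:

    #include <stdbool.h>
    #include <stdint.h>

    static bool reprobe_due(uint32_t now_jiffies, uint32_t probe_timestamp,
                            uint32_t interval_secs, uint32_t hz)
    {
        /* signed delta tolerates jiffies wraparound, as in the kernel */
        int32_t delta = (int32_t)(now_jiffies - probe_timestamp);

        return delta >= (int32_t)(interval_secs * hz);
    }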
+
 /* Create a new MTU probe if we are ready.
  * MTU probe is regularly attempting to increase the path MTU by
  * deliberately sending larger packets.  This discovers routing
@@ -1837,11 +1882,13 @@ static int tcp_mtu_probe(struct sock *sk)
        struct tcp_sock *tp = tcp_sk(sk);
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct sk_buff *skb, *nskb, *next;
+       struct net *net = sock_net(sk);
        int len;
        int probe_size;
        int size_needed;
        int copy;
        int mss_now;
+       int interval;
 
        /* Not currently probing/verifying,
         * not in recovery,
@@ -1854,12 +1901,25 @@ static int tcp_mtu_probe(struct sock *sk)
            tp->rx_opt.num_sacks || tp->rx_opt.dsack)
                return -1;
 
-       /* Very simple search strategy: just double the MSS. */
+       /* Use binary search for probe_size between tcp_base_mss and
+        * the current mss_clamp. If (search_high - search_low) is
+        * smaller than a threshold, back off from probing.
+        */
        mss_now = tcp_current_mss(sk);
-       probe_size = 2 * tp->mss_cache;
+       probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high +
+                                   icsk->icsk_mtup.search_low) >> 1);
        size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
-       if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
-               /* TODO: set timer for probe_converge_event */
+       interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;
+       /* When probing has converged (or the probe would overshoot
+        * search_high), stick with the current search range; it is only
+        * reset to its original bounds once the reprobe timer expires.
+        */
+       if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) ||
+               interval < net->ipv4.sysctl_tcp_probe_threshold) {
+               /* Check whether enough time has elapsed for
+                * another round of probing.
+                */
+               tcp_mtu_check_reprobe(sk);
                return -1;
        }
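
The midpoint step converges quickly. A toy model of the search, assuming a hypothetical true path MTU of 1400, an initial range of [1024, 1500], and the default net.ipv4.tcp_probe_threshold of 8:

    #include <stdio.h>

    int main(void)
    {
        int low = 1024, high = 1500, path_mtu = 1400, threshold = 8;

        while (high - low >= threshold) {
            int probe = (high + low) >> 1;   /* midpoint MTU */

            if (probe <= path_mtu)
                low = probe;    /* probe delivered: raise the floor */
            else
                high = probe;   /* probe lost: lower the ceiling */
            printf("probe %4d -> range [%d, %d]\n", probe, low, high);
        }
        /* converges to [1395, 1402] after six probes */
        return 0;
    }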
 
@@ -1881,7 +1941,8 @@ static int tcp_mtu_probe(struct sock *sk)
        }
 
        /* We're allowed to probe.  Build it now. */
-       if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL)
+       nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC);
+       if (!nskb)
                return -1;
        sk->sk_wmem_queued += nskb->truesize;
        sk_mem_charge(sk, nskb->truesize);
@@ -2179,7 +2240,7 @@ void tcp_send_loss_probe(struct sock *sk)
        int mss = tcp_current_mss(sk);
        int err = -1;
 
-       if (tcp_send_head(sk) != NULL) {
+       if (tcp_send_head(sk)) {
                err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
                goto rearm_timer;
        }
@@ -2689,7 +2750,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
                if (skb == tcp_send_head(sk))
                        break;
                /* we could do better than to assign each time */
-               if (hole == NULL)
+               if (!hole)
                        tp->retransmit_skb_hint = skb;
 
                /* Assume this retransmit will generate
@@ -2713,7 +2774,7 @@ begin_fwd:
                        if (!tcp_can_forward_retransmit(sk))
                                break;
                        /* Backtrack if necessary to non-L'ed skb */
-                       if (hole != NULL) {
+                       if (hole) {
                                skb = hole;
                                hole = NULL;
                        }
@@ -2721,7 +2782,7 @@ begin_fwd:
                        goto begin_fwd;
 
                } else if (!(sacked & TCPCB_LOST)) {
-                       if (hole == NULL && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
+                       if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
                                hole = skb;
                        continue;
 
@@ -2766,22 +2827,18 @@ void tcp_send_fin(struct sock *sk)
         */
        mss_now = tcp_current_mss(sk);
 
-       if (tcp_send_head(sk) != NULL) {
+       if (tcp_send_head(sk)) {
                TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
                TCP_SKB_CB(skb)->end_seq++;
                tp->write_seq++;
        } else {
                /* Socket is locked, keep trying until memory is available. */
                for (;;) {
-                       skb = alloc_skb_fclone(MAX_TCP_HEADER,
-                                              sk->sk_allocation);
+                       skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
                        if (skb)
                                break;
                        yield();
                }
-
-               /* Reserve space for headers and prepare control bits. */
-               skb_reserve(skb, MAX_TCP_HEADER);
                /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
                tcp_init_nondata_skb(skb, tp->write_seq,
                                     TCPHDR_ACK | TCPHDR_FIN);
@@ -2828,14 +2885,14 @@ int tcp_send_synack(struct sock *sk)
        struct sk_buff *skb;
 
        skb = tcp_write_queue_head(sk);
-       if (skb == NULL || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
+       if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
                pr_debug("%s: wrong queue state\n", __func__);
                return -EFAULT;
        }
        if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
                if (skb_cloned(skb)) {
                        struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
-                       if (nskb == NULL)
+                       if (!nskb)
                                return -ENOMEM;
                        tcp_unlink_write_queue(skb, sk);
                        __skb_header_release(nskb);
@@ -2870,7 +2927,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcphdr *th;
        struct sk_buff *skb;
-       struct tcp_md5sig_key *md5;
+       struct tcp_md5sig_key *md5 = NULL;
        int tcp_header_size;
        int mss;
 
@@ -2883,7 +2940,6 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
        skb_reserve(skb, MAX_TCP_HEADER);
 
        skb_dst_set(skb, dst);
-       security_skb_owned_by(skb, sk);
 
        mss = dst_metric_advmss(dst);
        if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
@@ -2896,7 +2952,12 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
        else
 #endif
        skb_mstamp_get(&skb->skb_mstamp);
-       tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5,
+
+#ifdef CONFIG_TCP_MD5SIG
+       rcu_read_lock();
+       md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
+#endif
+       tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
                                             foc) + sizeof(*th);
 
        skb_push(skb, tcp_header_size);
@@ -2927,10 +2988,10 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 
 #ifdef CONFIG_TCP_MD5SIG
        /* Okay, we have all we need - do the md5 hash if needed */
-       if (md5) {
+       if (md5)
                tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
-                                              md5, NULL, req, skb);
-       }
+                                              md5, req_to_sk(req), skb);
+       rcu_read_unlock();
 #endif
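
The MD5 lookup now happens in the caller, and the key it returns is only guaranteed to stay alive inside the RCU read-side critical section, so the hash must be computed before rcu_read_unlock(). The shape of the pattern, with illustrative no-op stubs standing in for the kernel primitives:

    struct md5_key;        /* stand-in for struct tcp_md5sig_key */

    /* Stubs for illustration only; not the kernel API. */
    static void rcu_read_lock_stub(void) { }
    static void rcu_read_unlock_stub(void) { }
    static struct md5_key *md5_lookup_stub(void) { return (struct md5_key *)0; }
    static void calc_hash_stub(struct md5_key *key) { (void)key; }

    static void build_synack_hash(void)
    {
        struct md5_key *key;

        rcu_read_lock_stub();
        key = md5_lookup_stub();   /* valid only inside the critical section */
        if (key)
            calc_hash_stub(key);   /* must run before the unlock */
        rcu_read_unlock_stub();
    }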
 
        return skb;
@@ -2970,7 +3031,7 @@ static void tcp_connect_init(struct sock *sk)
                (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
 
 #ifdef CONFIG_TCP_MD5SIG
-       if (tp->af_specific->md5_lookup(sk, sk) != NULL)
+       if (tp->af_specific->md5_lookup(sk, sk))
                tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
 #endif
 
@@ -3256,7 +3317,7 @@ void tcp_send_ack(struct sock *sk)
         * sock.
         */
        buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
-       if (buff == NULL) {
+       if (!buff) {
                inet_csk_schedule_ack(sk);
                inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
                inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
@@ -3300,7 +3361,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
 
        /* We don't queue it, tcp_transmit_skb() sets ownership. */
        skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
-       if (skb == NULL)
+       if (!skb)
                return -1;
 
        /* Reserve space for headers and set control bits. */
@@ -3331,8 +3392,8 @@ int tcp_write_wakeup(struct sock *sk)
        if (sk->sk_state == TCP_CLOSE)
                return -1;
 
-       if ((skb = tcp_send_head(sk)) != NULL &&
-           before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
+       skb = tcp_send_head(sk);
+       if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
                int err;
                unsigned int mss = tcp_current_mss(sk);
                unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;