net/ipv4/tcp_ipv4.c (tcp: extract code to compute SYNACK RTT)
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Add support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
45  *                                      coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53 #define pr_fmt(fmt) "TCP: " fmt
54
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/netdma.h>
76 #include <net/secure_seq.h>
77 #include <net/tcp_memcontrol.h>
78
79 #include <linux/inet.h>
80 #include <linux/ipv6.h>
81 #include <linux/stddef.h>
82 #include <linux/proc_fs.h>
83 #include <linux/seq_file.h>
84
85 #include <linux/crypto.h>
86 #include <linux/scatterlist.h>
87
88 int sysctl_tcp_tw_reuse __read_mostly;
89 int sysctl_tcp_low_latency __read_mostly;
90 EXPORT_SYMBOL(sysctl_tcp_low_latency);
91
92
93 #ifdef CONFIG_TCP_MD5SIG
94 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
95                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
96 #endif
97
98 struct inet_hashinfo tcp_hashinfo;
99 EXPORT_SYMBOL(tcp_hashinfo);
100
101 static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
102 {
103         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
104                                           ip_hdr(skb)->saddr,
105                                           tcp_hdr(skb)->dest,
106                                           tcp_hdr(skb)->source);
107 }
108
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 {
111         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
112         struct tcp_sock *tp = tcp_sk(sk);
113
114         /* With PAWS, it is safe from the viewpoint
115            of data integrity. Even without PAWS it is safe provided sequence
116            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
117
118            Actually, the idea is close to VJ's, only the timestamp cache is
119            held not per host but per port pair, and the TW bucket is used as
120            the state holder.
121
122            If the TW bucket has already been destroyed we fall back to VJ's scheme
123            and use the initial timestamp retrieved from the peer table.
124          */
125         if (tcptw->tw_ts_recent_stamp &&
126             (twp == NULL || (sysctl_tcp_tw_reuse &&
127                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
128                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
129                 if (tp->write_seq == 0)
130                         tp->write_seq = 1;
131                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
132                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
133                 sock_hold(sktw);
134                 return 1;
135         }
136
137         return 0;
138 }
139 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
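/*
 * Illustrative userspace sketch, not part of this file: sysctl_tcp_tw_reuse
 * above is the net.ipv4.tcp_tw_reuse knob. Assuming procfs is mounted and the
 * caller has the needed privileges, a minimal way to enable it is to write
 * "1" to the corresponding /proc/sys entry; the helper name is made up.
 */
#include <fcntl.h>
#include <unistd.h>

static int example_enable_tw_reuse(void)
{
	int fd = open("/proc/sys/net/ipv4/tcp_tw_reuse", O_WRONLY);

	if (fd < 0)
		return -1;
	/* With "1", tcp_twsk_unique() above may reuse a TIME-WAIT port pair
	 * when the timestamp last seen from the peer is old enough. */
	if (write(fd, "1", 1) != 1) {
		close(fd);
		return -1;
	}
	return close(fd);
}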
140
141 static int tcp_repair_connect(struct sock *sk)
142 {
143         tcp_connect_init(sk);
144         tcp_finish_connect(sk, NULL);
145
146         return 0;
147 }
148
149 /* This will initiate an outgoing connection. */
150 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
151 {
152         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
153         struct inet_sock *inet = inet_sk(sk);
154         struct tcp_sock *tp = tcp_sk(sk);
155         __be16 orig_sport, orig_dport;
156         __be32 daddr, nexthop;
157         struct flowi4 *fl4;
158         struct rtable *rt;
159         int err;
160         struct ip_options_rcu *inet_opt;
161
162         if (addr_len < sizeof(struct sockaddr_in))
163                 return -EINVAL;
164
165         if (usin->sin_family != AF_INET)
166                 return -EAFNOSUPPORT;
167
168         nexthop = daddr = usin->sin_addr.s_addr;
169         inet_opt = rcu_dereference_protected(inet->inet_opt,
170                                              sock_owned_by_user(sk));
171         if (inet_opt && inet_opt->opt.srr) {
172                 if (!daddr)
173                         return -EINVAL;
174                 nexthop = inet_opt->opt.faddr;
175         }
176
177         orig_sport = inet->inet_sport;
178         orig_dport = usin->sin_port;
179         fl4 = &inet->cork.fl.u.ip4;
180         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
181                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
182                               IPPROTO_TCP,
183                               orig_sport, orig_dport, sk, true);
184         if (IS_ERR(rt)) {
185                 err = PTR_ERR(rt);
186                 if (err == -ENETUNREACH)
187                         IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
188                 return err;
189         }
190
191         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
192                 ip_rt_put(rt);
193                 return -ENETUNREACH;
194         }
195
196         if (!inet_opt || !inet_opt->opt.srr)
197                 daddr = fl4->daddr;
198
199         if (!inet->inet_saddr)
200                 inet->inet_saddr = fl4->saddr;
201         inet->inet_rcv_saddr = inet->inet_saddr;
202
203         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
204                 /* Reset inherited state */
205                 tp->rx_opt.ts_recent       = 0;
206                 tp->rx_opt.ts_recent_stamp = 0;
207                 if (likely(!tp->repair))
208                         tp->write_seq      = 0;
209         }
210
211         if (tcp_death_row.sysctl_tw_recycle &&
212             !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
213                 tcp_fetch_timewait_stamp(sk, &rt->dst);
214
215         inet->inet_dport = usin->sin_port;
216         inet->inet_daddr = daddr;
217
218         inet_csk(sk)->icsk_ext_hdr_len = 0;
219         if (inet_opt)
220                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
221
222         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
223
224         /* Socket identity is still unknown (sport may be zero).
225          * However we set the state to SYN-SENT and, without releasing the
226          * socket lock, select a source port, enter ourselves into the hash
227          * tables and complete initialization after this.
228          */
229         tcp_set_state(sk, TCP_SYN_SENT);
230         err = inet_hash_connect(&tcp_death_row, sk);
231         if (err)
232                 goto failure;
233
234         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
235                                inet->inet_sport, inet->inet_dport, sk);
236         if (IS_ERR(rt)) {
237                 err = PTR_ERR(rt);
238                 rt = NULL;
239                 goto failure;
240         }
241         /* OK, now commit destination to socket.  */
242         sk->sk_gso_type = SKB_GSO_TCPV4;
243         sk_setup_caps(sk, &rt->dst);
244
245         if (!tp->write_seq && likely(!tp->repair))
246                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
247                                                            inet->inet_daddr,
248                                                            inet->inet_sport,
249                                                            usin->sin_port);
250
251         inet->inet_id = tp->write_seq ^ jiffies;
252
253         if (likely(!tp->repair))
254                 err = tcp_connect(sk);
255         else
256                 err = tcp_repair_connect(sk);
257
258         rt = NULL;
259         if (err)
260                 goto failure;
261
262         return 0;
263
264 failure:
265         /*
266          * This unhashes the socket and releases the local port,
267          * if necessary.
268          */
269         tcp_set_state(sk, TCP_CLOSE);
270         ip_rt_put(rt);
271         sk->sk_route_caps = 0;
272         inet->inet_dport = 0;
273         return err;
274 }
275 EXPORT_SYMBOL(tcp_v4_connect);
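/*
 * Illustrative userspace sketch, not part of this file: a connect() on an
 * AF_INET stream socket is what ends up in tcp_v4_connect() above, which
 * picks a route, a source port and the initial sequence number before the
 * SYN goes out. The helper name, address and port are placeholders.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static int example_tcp_connect(void)
{
	struct sockaddr_in dst;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return -1;

	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;
	dst.sin_port   = htons(80);			/* placeholder port */
	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);	/* placeholder addr */

	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}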
276
277 /*
278  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
279  * It can be called through tcp_release_cb() if socket was owned by user
280  * at the time tcp_v4_err() was called to handle ICMP message.
281  */
282 static void tcp_v4_mtu_reduced(struct sock *sk)
283 {
284         struct dst_entry *dst;
285         struct inet_sock *inet = inet_sk(sk);
286         u32 mtu = tcp_sk(sk)->mtu_info;
287
288         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
289          * sent out by Linux are always < 576 bytes, so they should go through
290          * unfragmented).
291          */
292         if (sk->sk_state == TCP_LISTEN)
293                 return;
294
295         dst = inet_csk_update_pmtu(sk, mtu);
296         if (!dst)
297                 return;
298
299         /* Something is about to go wrong... Remember the soft error
300          * in case this connection is not able to recover.
301          */
302         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
303                 sk->sk_err_soft = EMSGSIZE;
304
305         mtu = dst_mtu(dst);
306
307         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
308             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
309                 tcp_sync_mss(sk, mtu);
310
311                 /* Resend the TCP packet because it's
312                  * clear that the old packet has been
313                  * dropped. This is the new "fast" path mtu
314                  * discovery.
315                  */
316                 tcp_simple_retransmit(sk);
317         } /* else let the usual retransmit timer handle it */
318 }
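/*
 * Illustrative userspace sketch, not part of this file: the inet->pmtudisc
 * test above reflects the IP_MTU_DISCOVER socket option. A TCP socket can
 * request full path MTU discovery (DF set on outgoing segments) like this;
 * IP_PMTUDISC_DONT or IP_PMTUDISC_WANT are set the same way. The helper
 * name is made up.
 */
#include <netinet/in.h>
#include <sys/socket.h>

static int example_set_pmtu_discovery(int fd)
{
	int val = IP_PMTUDISC_DO;	/* always set DF, never fragment locally */

	return setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val));
}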
319
320 static void do_redirect(struct sk_buff *skb, struct sock *sk)
321 {
322         struct dst_entry *dst = __sk_dst_check(sk, 0);
323
324         if (dst)
325                 dst->ops->redirect(dst, sk, skb);
326 }
327
328 /*
329  * This routine is called by the ICMP module when it gets some
330  * sort of error condition.  If err < 0 then the socket should
331  * be closed and the error returned to the user.  If err > 0
332  * it's just the icmp type << 8 | icmp code.  After adjustment
333  * header points to the first 8 bytes of the tcp header.  We need
334  * to find the appropriate port.
335  *
336  * The locking strategy used here is very "optimistic". When
337  * someone else accesses the socket the ICMP is just dropped
338  * and for some paths there is no check at all.
339  * A more general error queue to queue errors for later handling
340  * is probably better.
341  *
342  */
343
344 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
345 {
346         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
347         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
348         struct inet_connection_sock *icsk;
349         struct tcp_sock *tp;
350         struct inet_sock *inet;
351         const int type = icmp_hdr(icmp_skb)->type;
352         const int code = icmp_hdr(icmp_skb)->code;
353         struct sock *sk;
354         struct sk_buff *skb;
355         struct request_sock *req;
356         __u32 seq;
357         __u32 remaining;
358         int err;
359         struct net *net = dev_net(icmp_skb->dev);
360
361         if (icmp_skb->len < (iph->ihl << 2) + 8) {
362                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
363                 return;
364         }
365
366         sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
367                         iph->saddr, th->source, inet_iif(icmp_skb));
368         if (!sk) {
369                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
370                 return;
371         }
372         if (sk->sk_state == TCP_TIME_WAIT) {
373                 inet_twsk_put(inet_twsk(sk));
374                 return;
375         }
376
377         bh_lock_sock(sk);
378         /* If too many ICMPs get dropped on busy
379          * servers this needs to be solved differently.
380          * We do take care of the PMTU discovery (RFC1191) special case:
381          * we can receive locally generated ICMP messages while the socket is held.
382          */
383         if (sock_owned_by_user(sk) &&
384             type != ICMP_DEST_UNREACH &&
385             code != ICMP_FRAG_NEEDED)
386                 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
387
388         if (sk->sk_state == TCP_CLOSE)
389                 goto out;
390
391         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
392                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
393                 goto out;
394         }
395
396         icsk = inet_csk(sk);
397         tp = tcp_sk(sk);
398         req = tp->fastopen_rsk;
399         seq = ntohl(th->seq);
400         if (sk->sk_state != TCP_LISTEN &&
401             !between(seq, tp->snd_una, tp->snd_nxt) &&
402             (req == NULL || seq != tcp_rsk(req)->snt_isn)) {
403                 /* For a Fast Open socket, allow seq to be snt_isn. */
404                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
405                 goto out;
406         }
407
408         switch (type) {
409         case ICMP_REDIRECT:
410                 do_redirect(icmp_skb, sk);
411                 goto out;
412         case ICMP_SOURCE_QUENCH:
413                 /* Just silently ignore these. */
414                 goto out;
415         case ICMP_PARAMETERPROB:
416                 err = EPROTO;
417                 break;
418         case ICMP_DEST_UNREACH:
419                 if (code > NR_ICMP_UNREACH)
420                         goto out;
421
422                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
423                         tp->mtu_info = info;
424                         if (!sock_owned_by_user(sk)) {
425                                 tcp_v4_mtu_reduced(sk);
426                         } else {
427                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
428                                         sock_hold(sk);
429                         }
430                         goto out;
431                 }
432
433                 err = icmp_err_convert[code].errno;
434                 /* check if icmp_skb allows revert of backoff
435                  * (see draft-zimmermann-tcp-lcd) */
436                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
437                         break;
438                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
439                     !icsk->icsk_backoff)
440                         break;
441
442                 /* XXX (TFO) - revisit the following logic for TFO */
443
444                 if (sock_owned_by_user(sk))
445                         break;
446
447                 icsk->icsk_backoff--;
448                 inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
449                         TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
450                 tcp_bound_rto(sk);
451
452                 skb = tcp_write_queue_head(sk);
453                 BUG_ON(!skb);
454
455                 remaining = icsk->icsk_rto - min(icsk->icsk_rto,
456                                 tcp_time_stamp - TCP_SKB_CB(skb)->when);
457
458                 if (remaining) {
459                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
460                                                   remaining, TCP_RTO_MAX);
461                 } else {
462                         /* RTO revert clocked out retransmission.
463                          * Will retransmit now */
464                         tcp_retransmit_timer(sk);
465                 }
466
467                 break;
468         case ICMP_TIME_EXCEEDED:
469                 err = EHOSTUNREACH;
470                 break;
471         default:
472                 goto out;
473         }
474
475         /* XXX (TFO) - if it's a TFO socket and has been accepted, rather
476          * than following the TCP_SYN_RECV case and closing the socket,
477          * we ignore the ICMP error and keep trying like a fully established
478          * socket. Is this the right thing to do?
479          */
480         if (req && req->sk == NULL)
481                 goto out;
482
483         switch (sk->sk_state) {
484                 struct request_sock *req, **prev;
485         case TCP_LISTEN:
486                 if (sock_owned_by_user(sk))
487                         goto out;
488
489                 req = inet_csk_search_req(sk, &prev, th->dest,
490                                           iph->daddr, iph->saddr);
491                 if (!req)
492                         goto out;
493
494                 /* ICMPs are not backlogged, hence we cannot get
495                    an established socket here.
496                  */
497                 WARN_ON(req->sk);
498
499                 if (seq != tcp_rsk(req)->snt_isn) {
500                         NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
501                         goto out;
502                 }
503
504                 /*
505                  * Still in SYN_RECV, just remove it silently.
506                  * There is no good way to pass the error to the newly
507                  * created socket, and POSIX does not want network
508                  * errors returned from accept().
509                  */
510                 inet_csk_reqsk_queue_drop(sk, req, prev);
511                 goto out;
512
513         case TCP_SYN_SENT:
514         case TCP_SYN_RECV:  /* Cannot happen normally.
515                                It can, for example, if SYNs crossed,
516                                or with Fast Open.
517                              */
518                 if (!sock_owned_by_user(sk)) {
519                         sk->sk_err = err;
520
521                         sk->sk_error_report(sk);
522
523                         tcp_done(sk);
524                 } else {
525                         sk->sk_err_soft = err;
526                 }
527                 goto out;
528         }
529
530         /* If we've already connected we will keep trying
531          * until we time out, or the user gives up.
532          *
533          * RFC 1122 4.2.3.9 allows us to consider as hard errors
534          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
535          * but it is obsoleted by pmtu discovery).
536          *
537          * Note that in the modern internet, where routing is unreliable
538          * and broken firewalls sit in each dark corner, sending random
539          * errors ordered by their masters, even these two messages finally lose
540          * their original sense (even Linux sends invalid PORT_UNREACHs).
541          *
542          * Now we are in compliance with RFCs.
543          *                                                      --ANK (980905)
544          */
545
546         inet = inet_sk(sk);
547         if (!sock_owned_by_user(sk) && inet->recverr) {
548                 sk->sk_err = err;
549                 sk->sk_error_report(sk);
550         } else  { /* Only an error on timeout */
551                 sk->sk_err_soft = err;
552         }
553
554 out:
555         bh_unlock_sock(sk);
556         sock_put(sk);
557 }
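/*
 * Illustrative userspace sketch, not part of this file: the inet->recverr
 * test above is controlled by the IP_RECVERR socket option. With it set,
 * ICMP-derived errors become hard errors reported to the application right
 * away (as an errno on the next send()/recv()) instead of only surfacing as
 * a soft error after a timeout. The helper name is made up.
 */
#include <netinet/in.h>
#include <sys/socket.h>

static int example_enable_recverr(int fd)
{
	int on = 1;

	return setsockopt(fd, IPPROTO_IP, IP_RECVERR, &on, sizeof(on));
}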
558
559 static void __tcp_v4_send_check(struct sk_buff *skb,
560                                 __be32 saddr, __be32 daddr)
561 {
562         struct tcphdr *th = tcp_hdr(skb);
563
564         if (skb->ip_summed == CHECKSUM_PARTIAL) {
565                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
566                 skb->csum_start = skb_transport_header(skb) - skb->head;
567                 skb->csum_offset = offsetof(struct tcphdr, check);
568         } else {
569                 th->check = tcp_v4_check(skb->len, saddr, daddr,
570                                          csum_partial(th,
571                                                       th->doff << 2,
572                                                       skb->csum));
573         }
574 }
575
576 /* This routine computes an IPv4 TCP checksum. */
577 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
578 {
579         const struct inet_sock *inet = inet_sk(sk);
580
581         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
582 }
583 EXPORT_SYMBOL(tcp_v4_send_check);
584
585 int tcp_v4_gso_send_check(struct sk_buff *skb)
586 {
587         const struct iphdr *iph;
588         struct tcphdr *th;
589
590         if (!pskb_may_pull(skb, sizeof(*th)))
591                 return -EINVAL;
592
593         iph = ip_hdr(skb);
594         th = tcp_hdr(skb);
595
596         th->check = 0;
597         skb->ip_summed = CHECKSUM_PARTIAL;
598         __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
599         return 0;
600 }
601
602 /*
603  *      This routine will send an RST to the other tcp.
604  *
605  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
606  *                    for a reset.
607  *      Answer: if a packet caused a RST, it is not for a socket
608  *              existing in our system; if it is matched to a socket,
609  *              it is just a duplicate segment or a bug in the other side's TCP.
610  *              So we build the reply based only on the parameters that
611  *              arrived with the segment.
612  *      Exception: precedence violation. We do not implement it in any case.
613  */
614
615 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
616 {
617         const struct tcphdr *th = tcp_hdr(skb);
618         struct {
619                 struct tcphdr th;
620 #ifdef CONFIG_TCP_MD5SIG
621                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
622 #endif
623         } rep;
624         struct ip_reply_arg arg;
625 #ifdef CONFIG_TCP_MD5SIG
626         struct tcp_md5sig_key *key;
627         const __u8 *hash_location = NULL;
628         unsigned char newhash[16];
629         int genhash;
630         struct sock *sk1 = NULL;
631 #endif
632         struct net *net;
633
634         /* Never send a reset in response to a reset. */
635         if (th->rst)
636                 return;
637
638         if (skb_rtable(skb)->rt_type != RTN_LOCAL)
639                 return;
640
641         /* Swap the send and the receive. */
642         memset(&rep, 0, sizeof(rep));
643         rep.th.dest   = th->source;
644         rep.th.source = th->dest;
645         rep.th.doff   = sizeof(struct tcphdr) / 4;
646         rep.th.rst    = 1;
647
648         if (th->ack) {
649                 rep.th.seq = th->ack_seq;
650         } else {
651                 rep.th.ack = 1;
652                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
653                                        skb->len - (th->doff << 2));
654         }
655
656         memset(&arg, 0, sizeof(arg));
657         arg.iov[0].iov_base = (unsigned char *)&rep;
658         arg.iov[0].iov_len  = sizeof(rep.th);
659
660 #ifdef CONFIG_TCP_MD5SIG
661         hash_location = tcp_parse_md5sig_option(th);
662         if (!sk && hash_location) {
663                 /*
664                  * The active side is lost. Try to find the listening socket through
665                  * the source port, and then find the md5 key through the listening socket.
666                  * We do not lose security here:
667                  * the incoming packet is checked against the md5 hash of the found key;
668                  * no RST is generated if the md5 hash doesn't match.
669                  */
670                 sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
671                                              &tcp_hashinfo, ip_hdr(skb)->daddr,
672                                              ntohs(th->source), inet_iif(skb));
673                 /* don't send rst if it can't find key */
674                 if (!sk1)
675                         return;
676                 rcu_read_lock();
677                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
678                                         &ip_hdr(skb)->saddr, AF_INET);
679                 if (!key)
680                         goto release_sk1;
681
682                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
683                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
684                         goto release_sk1;
685         } else {
686                 key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
687                                              &ip_hdr(skb)->saddr,
688                                              AF_INET) : NULL;
689         }
690
691         if (key) {
692                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
693                                    (TCPOPT_NOP << 16) |
694                                    (TCPOPT_MD5SIG << 8) |
695                                    TCPOLEN_MD5SIG);
696                 /* Update length and the length the header thinks exists */
697                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
698                 rep.th.doff = arg.iov[0].iov_len / 4;
699
700                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
701                                      key, ip_hdr(skb)->saddr,
702                                      ip_hdr(skb)->daddr, &rep.th);
703         }
704 #endif
705         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
706                                       ip_hdr(skb)->saddr, /* XXX */
707                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
708         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
709         arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
710         /* When the socket is gone, all binding information is lost.
711          * Routing might fail in this case. Use iif for oif to
712          * make sure we can deliver it.
713          */
714         arg.bound_dev_if = sk ? sk->sk_bound_dev_if : inet_iif(skb);
715
716         net = dev_net(skb_dst(skb)->dev);
717         arg.tos = ip_hdr(skb)->tos;
718         ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
719                               ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
720
721         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
722         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
723
724 #ifdef CONFIG_TCP_MD5SIG
725 release_sk1:
726         if (sk1) {
727                 rcu_read_unlock();
728                 sock_put(sk1);
729         }
730 #endif
731 }
732
733 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
734    outside socket context, is certainly ugly. What can I do?
735  */
736
737 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
738                             u32 win, u32 ts, int oif,
739                             struct tcp_md5sig_key *key,
740                             int reply_flags, u8 tos)
741 {
742         const struct tcphdr *th = tcp_hdr(skb);
743         struct {
744                 struct tcphdr th;
745                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
746 #ifdef CONFIG_TCP_MD5SIG
747                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
748 #endif
749                         ];
750         } rep;
751         struct ip_reply_arg arg;
752         struct net *net = dev_net(skb_dst(skb)->dev);
753
754         memset(&rep.th, 0, sizeof(struct tcphdr));
755         memset(&arg, 0, sizeof(arg));
756
757         arg.iov[0].iov_base = (unsigned char *)&rep;
758         arg.iov[0].iov_len  = sizeof(rep.th);
759         if (ts) {
760                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
761                                    (TCPOPT_TIMESTAMP << 8) |
762                                    TCPOLEN_TIMESTAMP);
763                 rep.opt[1] = htonl(tcp_time_stamp);
764                 rep.opt[2] = htonl(ts);
765                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
766         }
767
768         /* Swap the send and the receive. */
769         rep.th.dest    = th->source;
770         rep.th.source  = th->dest;
771         rep.th.doff    = arg.iov[0].iov_len / 4;
772         rep.th.seq     = htonl(seq);
773         rep.th.ack_seq = htonl(ack);
774         rep.th.ack     = 1;
775         rep.th.window  = htons(win);
776
777 #ifdef CONFIG_TCP_MD5SIG
778         if (key) {
779                 int offset = (ts) ? 3 : 0;
780
781                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
782                                           (TCPOPT_NOP << 16) |
783                                           (TCPOPT_MD5SIG << 8) |
784                                           TCPOLEN_MD5SIG);
785                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
786                 rep.th.doff = arg.iov[0].iov_len/4;
787
788                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
789                                     key, ip_hdr(skb)->saddr,
790                                     ip_hdr(skb)->daddr, &rep.th);
791         }
792 #endif
793         arg.flags = reply_flags;
794         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
795                                       ip_hdr(skb)->saddr, /* XXX */
796                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
797         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
798         if (oif)
799                 arg.bound_dev_if = oif;
800         arg.tos = tos;
801         ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
802                               ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
803
804         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
805 }
806
807 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
808 {
809         struct inet_timewait_sock *tw = inet_twsk(sk);
810         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
811
812         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
813                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
814                         tcptw->tw_ts_recent,
815                         tw->tw_bound_dev_if,
816                         tcp_twsk_md5_key(tcptw),
817                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
818                         tw->tw_tos
819                         );
820
821         inet_twsk_put(tw);
822 }
823
824 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
825                                   struct request_sock *req)
826 {
827         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
828          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
829          */
830         tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
831                         tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
832                         tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
833                         req->ts_recent,
834                         0,
835                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
836                                           AF_INET),
837                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
838                         ip_hdr(skb)->tos);
839 }
840
841 /*
842  *      Send a SYN-ACK after having received a SYN.
843  *      This still operates on a request_sock only, not on a big
844  *      socket.
845  */
846 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
847                               struct request_sock *req,
848                               struct request_values *rvp,
849                               u16 queue_mapping,
850                               bool nocache)
851 {
852         const struct inet_request_sock *ireq = inet_rsk(req);
853         struct flowi4 fl4;
854         int err = -1;
855         struct sk_buff * skb;
856
857         /* First, grab a route. */
858         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
859                 return -1;
860
861         skb = tcp_make_synack(sk, dst, req, rvp, NULL);
862
863         if (skb) {
864                 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
865
866                 skb_set_queue_mapping(skb, queue_mapping);
867                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
868                                             ireq->rmt_addr,
869                                             ireq->opt);
870                 err = net_xmit_eval(err);
871         }
872
873         return err;
874 }
875
876 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
877                               struct request_values *rvp)
878 {
879         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
880         return tcp_v4_send_synack(sk, NULL, req, rvp, 0, false);
881 }
882
883 /*
884  *      IPv4 request_sock destructor.
885  */
886 static void tcp_v4_reqsk_destructor(struct request_sock *req)
887 {
888         kfree(inet_rsk(req)->opt);
889 }
890
891 /*
892  * Return true if a syncookie should be sent
893  */
894 bool tcp_syn_flood_action(struct sock *sk,
895                          const struct sk_buff *skb,
896                          const char *proto)
897 {
898         const char *msg = "Dropping request";
899         bool want_cookie = false;
900         struct listen_sock *lopt;
901
902
903
904 #ifdef CONFIG_SYN_COOKIES
905         if (sysctl_tcp_syncookies) {
906                 msg = "Sending cookies";
907                 want_cookie = true;
908                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
909         } else
910 #endif
911                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
912
913         lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
914         if (!lopt->synflood_warned) {
915                 lopt->synflood_warned = 1;
916                 pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
917                         proto, ntohs(tcp_hdr(skb)->dest), msg);
918         }
919         return want_cookie;
920 }
921 EXPORT_SYMBOL(tcp_syn_flood_action);
922
923 /*
924  * Save and compile IPv4 options into the request_sock if needed.
925  */
926 static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
927                                                   struct sk_buff *skb)
928 {
929         const struct ip_options *opt = &(IPCB(skb)->opt);
930         struct ip_options_rcu *dopt = NULL;
931
932         if (opt && opt->optlen) {
933                 int opt_size = sizeof(*dopt) + opt->optlen;
934
935                 dopt = kmalloc(opt_size, GFP_ATOMIC);
936                 if (dopt) {
937                         if (ip_options_echo(&dopt->opt, skb)) {
938                                 kfree(dopt);
939                                 dopt = NULL;
940                         }
941                 }
942         }
943         return dopt;
944 }
945
946 #ifdef CONFIG_TCP_MD5SIG
947 /*
948  * RFC2385 MD5 checksumming requires a mapping of
949  * IP address->MD5 Key.
950  * We need to maintain these in the sk structure.
951  */
952
953 /* Find the Key structure for an address.  */
954 struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
955                                          const union tcp_md5_addr *addr,
956                                          int family)
957 {
958         struct tcp_sock *tp = tcp_sk(sk);
959         struct tcp_md5sig_key *key;
960         struct hlist_node *pos;
961         unsigned int size = sizeof(struct in_addr);
962         struct tcp_md5sig_info *md5sig;
963
964         /* caller either holds rcu_read_lock() or socket lock */
965         md5sig = rcu_dereference_check(tp->md5sig_info,
966                                        sock_owned_by_user(sk) ||
967                                        lockdep_is_held(&sk->sk_lock.slock));
968         if (!md5sig)
969                 return NULL;
970 #if IS_ENABLED(CONFIG_IPV6)
971         if (family == AF_INET6)
972                 size = sizeof(struct in6_addr);
973 #endif
974         hlist_for_each_entry_rcu(key, pos, &md5sig->head, node) {
975                 if (key->family != family)
976                         continue;
977                 if (!memcmp(&key->addr, addr, size))
978                         return key;
979         }
980         return NULL;
981 }
982 EXPORT_SYMBOL(tcp_md5_do_lookup);
983
984 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
985                                          struct sock *addr_sk)
986 {
987         union tcp_md5_addr *addr;
988
989         addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
990         return tcp_md5_do_lookup(sk, addr, AF_INET);
991 }
992 EXPORT_SYMBOL(tcp_v4_md5_lookup);
993
994 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
995                                                       struct request_sock *req)
996 {
997         union tcp_md5_addr *addr;
998
999         addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
1000         return tcp_md5_do_lookup(sk, addr, AF_INET);
1001 }
1002
1003 /* This can be called on a newly created socket, from other files */
1004 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1005                    int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
1006 {
1007         /* Add Key to the list */
1008         struct tcp_md5sig_key *key;
1009         struct tcp_sock *tp = tcp_sk(sk);
1010         struct tcp_md5sig_info *md5sig;
1011
1012         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&addr, AF_INET);
1013         if (key) {
1014                 /* Pre-existing entry - just update that one. */
1015                 memcpy(key->key, newkey, newkeylen);
1016                 key->keylen = newkeylen;
1017                 return 0;
1018         }
1019
1020         md5sig = rcu_dereference_protected(tp->md5sig_info,
1021                                            sock_owned_by_user(sk));
1022         if (!md5sig) {
1023                 md5sig = kmalloc(sizeof(*md5sig), gfp);
1024                 if (!md5sig)
1025                         return -ENOMEM;
1026
1027                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1028                 INIT_HLIST_HEAD(&md5sig->head);
1029                 rcu_assign_pointer(tp->md5sig_info, md5sig);
1030         }
1031
1032         key = sock_kmalloc(sk, sizeof(*key), gfp);
1033         if (!key)
1034                 return -ENOMEM;
1035         if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) {
1036                 sock_kfree_s(sk, key, sizeof(*key));
1037                 return -ENOMEM;
1038         }
1039
1040         memcpy(key->key, newkey, newkeylen);
1041         key->keylen = newkeylen;
1042         key->family = family;
1043         memcpy(&key->addr, addr,
1044                (family == AF_INET6) ? sizeof(struct in6_addr) :
1045                                       sizeof(struct in_addr));
1046         hlist_add_head_rcu(&key->node, &md5sig->head);
1047         return 0;
1048 }
1049 EXPORT_SYMBOL(tcp_md5_do_add);
1050
1051 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
1052 {
1053         struct tcp_sock *tp = tcp_sk(sk);
1054         struct tcp_md5sig_key *key;
1055         struct tcp_md5sig_info *md5sig;
1056
1057         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&addr, AF_INET);
1058         if (!key)
1059                 return -ENOENT;
1060         hlist_del_rcu(&key->node);
1061         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1062         kfree_rcu(key, rcu);
1063         md5sig = rcu_dereference_protected(tp->md5sig_info,
1064                                            sock_owned_by_user(sk));
1065         if (hlist_empty(&md5sig->head))
1066                 tcp_free_md5sig_pool();
1067         return 0;
1068 }
1069 EXPORT_SYMBOL(tcp_md5_do_del);
1070
1071 void tcp_clear_md5_list(struct sock *sk)
1072 {
1073         struct tcp_sock *tp = tcp_sk(sk);
1074         struct tcp_md5sig_key *key;
1075         struct hlist_node *pos, *n;
1076         struct tcp_md5sig_info *md5sig;
1077
1078         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1079
1080         if (!hlist_empty(&md5sig->head))
1081                 tcp_free_md5sig_pool();
1082         hlist_for_each_entry_safe(key, pos, n, &md5sig->head, node) {
1083                 hlist_del_rcu(&key->node);
1084                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1085                 kfree_rcu(key, rcu);
1086         }
1087 }
1088
1089 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1090                                  int optlen)
1091 {
1092         struct tcp_md5sig cmd;
1093         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1094
1095         if (optlen < sizeof(cmd))
1096                 return -EINVAL;
1097
1098         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1099                 return -EFAULT;
1100
1101         if (sin->sin_family != AF_INET)
1102                 return -EINVAL;
1103
1104         if (!cmd.tcpm_key || !cmd.tcpm_keylen)
1105                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1106                                       AF_INET);
1107
1108         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1109                 return -EINVAL;
1110
1111         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1112                               AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1113                               GFP_KERNEL);
1114 }
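/*
 * Illustrative userspace sketch, not part of this file: the handler above is
 * the kernel side of the TCP_MD5SIG socket option (RFC 2385). A minimal way
 * to key a socket for one peer could look like the following; the kernel then
 * stores the key via tcp_md5_do_add() above. The helper name, peer address
 * and secret are placeholders.
 */
#include <linux/tcp.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>
#include <sys/socket.h>

static int example_set_tcp_md5_key(int fd)
{
	struct tcp_md5sig md5;
	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
	static const char secret[] = "placeholder-secret";

	memset(&md5, 0, sizeof(md5));
	peer->sin_family = AF_INET;
	inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);	/* placeholder peer */

	md5.tcpm_keylen = sizeof(secret) - 1;
	memcpy(md5.tcpm_key, secret, md5.tcpm_keylen);

	/* Segments to/from the peer must now carry a matching MD5 option. */
	return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
}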
1115
1116 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1117                                         __be32 daddr, __be32 saddr, int nbytes)
1118 {
1119         struct tcp4_pseudohdr *bp;
1120         struct scatterlist sg;
1121
1122         bp = &hp->md5_blk.ip4;
1123
1124         /*
1125          * 1. the TCP pseudo-header (in the order: source IP address,
1126          * destination IP address, zero-padded protocol number, and
1127          * segment length)
1128          */
1129         bp->saddr = saddr;
1130         bp->daddr = daddr;
1131         bp->pad = 0;
1132         bp->protocol = IPPROTO_TCP;
1133         bp->len = cpu_to_be16(nbytes);
1134
1135         sg_init_one(&sg, bp, sizeof(*bp));
1136         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1137 }
1138
1139 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1140                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1141 {
1142         struct tcp_md5sig_pool *hp;
1143         struct hash_desc *desc;
1144
1145         hp = tcp_get_md5sig_pool();
1146         if (!hp)
1147                 goto clear_hash_noput;
1148         desc = &hp->md5_desc;
1149
1150         if (crypto_hash_init(desc))
1151                 goto clear_hash;
1152         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1153                 goto clear_hash;
1154         if (tcp_md5_hash_header(hp, th))
1155                 goto clear_hash;
1156         if (tcp_md5_hash_key(hp, key))
1157                 goto clear_hash;
1158         if (crypto_hash_final(desc, md5_hash))
1159                 goto clear_hash;
1160
1161         tcp_put_md5sig_pool();
1162         return 0;
1163
1164 clear_hash:
1165         tcp_put_md5sig_pool();
1166 clear_hash_noput:
1167         memset(md5_hash, 0, 16);
1168         return 1;
1169 }
1170
1171 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1172                         const struct sock *sk, const struct request_sock *req,
1173                         const struct sk_buff *skb)
1174 {
1175         struct tcp_md5sig_pool *hp;
1176         struct hash_desc *desc;
1177         const struct tcphdr *th = tcp_hdr(skb);
1178         __be32 saddr, daddr;
1179
1180         if (sk) {
1181                 saddr = inet_sk(sk)->inet_saddr;
1182                 daddr = inet_sk(sk)->inet_daddr;
1183         } else if (req) {
1184                 saddr = inet_rsk(req)->loc_addr;
1185                 daddr = inet_rsk(req)->rmt_addr;
1186         } else {
1187                 const struct iphdr *iph = ip_hdr(skb);
1188                 saddr = iph->saddr;
1189                 daddr = iph->daddr;
1190         }
1191
1192         hp = tcp_get_md5sig_pool();
1193         if (!hp)
1194                 goto clear_hash_noput;
1195         desc = &hp->md5_desc;
1196
1197         if (crypto_hash_init(desc))
1198                 goto clear_hash;
1199
1200         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1201                 goto clear_hash;
1202         if (tcp_md5_hash_header(hp, th))
1203                 goto clear_hash;
1204         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1205                 goto clear_hash;
1206         if (tcp_md5_hash_key(hp, key))
1207                 goto clear_hash;
1208         if (crypto_hash_final(desc, md5_hash))
1209                 goto clear_hash;
1210
1211         tcp_put_md5sig_pool();
1212         return 0;
1213
1214 clear_hash:
1215         tcp_put_md5sig_pool();
1216 clear_hash_noput:
1217         memset(md5_hash, 0, 16);
1218         return 1;
1219 }
1220 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1221
1222 static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1223 {
1224         /*
1225          * This gets called for each TCP segment that arrives
1226          * so we want to be efficient.
1227          * We have 3 drop cases:
1228          * o No MD5 hash and one expected.
1229          * o MD5 hash and we're not expecting one.
1230          * o MD5 hash and it's wrong.
1231          */
1232         const __u8 *hash_location = NULL;
1233         struct tcp_md5sig_key *hash_expected;
1234         const struct iphdr *iph = ip_hdr(skb);
1235         const struct tcphdr *th = tcp_hdr(skb);
1236         int genhash;
1237         unsigned char newhash[16];
1238
1239         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1240                                           AF_INET);
1241         hash_location = tcp_parse_md5sig_option(th);
1242
1243         /* We've parsed the options - do we have a hash? */
1244         if (!hash_expected && !hash_location)
1245                 return false;
1246
1247         if (hash_expected && !hash_location) {
1248                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1249                 return true;
1250         }
1251
1252         if (!hash_expected && hash_location) {
1253                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1254                 return true;
1255         }
1256
1257         /* Okay, so this is hash_expected and hash_location -
1258          * so we need to calculate the checksum.
1259          */
1260         genhash = tcp_v4_md5_hash_skb(newhash,
1261                                       hash_expected,
1262                                       NULL, NULL, skb);
1263
1264         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1265                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1266                                      &iph->saddr, ntohs(th->source),
1267                                      &iph->daddr, ntohs(th->dest),
1268                                      genhash ? " tcp_v4_calc_md5_hash failed"
1269                                      : "");
1270                 return true;
1271         }
1272         return false;
1273 }
1274
1275 #endif
1276
1277 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1278         .family         =       PF_INET,
1279         .obj_size       =       sizeof(struct tcp_request_sock),
1280         .rtx_syn_ack    =       tcp_v4_rtx_synack,
1281         .send_ack       =       tcp_v4_reqsk_send_ack,
1282         .destructor     =       tcp_v4_reqsk_destructor,
1283         .send_reset     =       tcp_v4_send_reset,
1284         .syn_ack_timeout =      tcp_syn_ack_timeout,
1285 };
1286
1287 #ifdef CONFIG_TCP_MD5SIG
1288 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1289         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1290         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1291 };
1292 #endif
1293
1294 static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
1295                                struct request_sock *req,
1296                                struct tcp_fastopen_cookie *foc,
1297                                struct tcp_fastopen_cookie *valid_foc)
1298 {
1299         bool skip_cookie = false;
1300         struct fastopen_queue *fastopenq;
1301
1302         if (likely(!fastopen_cookie_present(foc))) {
1303                 /* See include/net/tcp.h for the meaning of these knobs */
1304                 if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) ||
1305                     ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) &&
1306                     (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1)))
1307                         skip_cookie = true; /* no cookie to validate */
1308                 else
1309                         return false;
1310         }
1311         fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
1312         /* A FO option is present; bump the counter. */
1313         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE);
1314
1315         /* Make sure the listener has enabled fastopen, and we don't
1316          * exceed the max # of pending TFO requests allowed before trying
1317          * to validate the cookie, in order to avoid burning CPU cycles
1318          * unnecessarily.
1319          *
1320          * XXX (TFO) - The implication of checking the max_qlen before
1321          * processing a cookie request is that clients can't differentiate
1322          * between qlen overflow causing Fast Open to be disabled
1323          * temporarily vs a server not supporting Fast Open at all.
1324          */
1325         if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 ||
1326             fastopenq == NULL || fastopenq->max_qlen == 0)
1327                 return false;
1328
1329         if (fastopenq->qlen >= fastopenq->max_qlen) {
1330                 struct request_sock *req1;
1331                 spin_lock(&fastopenq->lock);
1332                 req1 = fastopenq->rskq_rst_head;
1333                 if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
1334                         spin_unlock(&fastopenq->lock);
1335                         NET_INC_STATS_BH(sock_net(sk),
1336                             LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
1337                         /* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL*/
1338                         foc->len = -1;
1339                         return false;
1340                 }
1341                 fastopenq->rskq_rst_head = req1->dl_next;
1342                 fastopenq->qlen--;
1343                 spin_unlock(&fastopenq->lock);
1344                 reqsk_free(req1);
1345         }
1346         if (skip_cookie) {
1347                 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1348                 return true;
1349         }
1350         if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
1351                 if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
1352                         tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1353                         if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
1354                             memcmp(&foc->val[0], &valid_foc->val[0],
1355                             TCP_FASTOPEN_COOKIE_SIZE) != 0)
1356                                 return false;
1357                         valid_foc->len = -1;
1358                 }
1359                 /* Acknowledge the data received from the peer. */
1360                 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1361                 return true;
1362         } else if (foc->len == 0) { /* Client requesting a cookie */
1363                 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1364                 NET_INC_STATS_BH(sock_net(sk),
1365                     LINUX_MIB_TCPFASTOPENCOOKIEREQD);
1366         } else {
1367                 /* Client sent a cookie with the wrong size. Treat it
1368                  * the same as invalid and return a valid one.
1369                  */
1370                 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1371         }
1372         return false;
1373 }
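/*
 * Illustrative userspace sketch, not part of this file: the cookie checks
 * above only matter once a listener has enabled Fast Open. On kernels of
 * this vintage a server does that with the TCP_FASTOPEN socket option (the
 * value becomes fastopenq->max_qlen) before listen(), and the
 * TFO_SERVER_ENABLE bit of the tcp_fastopen sysctl must be set. The helper
 * name, port and queue length are placeholders. A client, correspondingly,
 * sends data in the SYN with sendto(..., MSG_FASTOPEN, ...).
 */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <string.h>
#include <sys/socket.h>

#ifndef TCP_FASTOPEN
#define TCP_FASTOPEN 23		/* may be missing from older libc headers */
#endif

static int example_tfo_listener(void)
{
	struct sockaddr_in addr;
	int qlen = 16;			/* placeholder TFO queue length */
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return -1;

	memset(&addr, 0, sizeof(addr));
	addr.sin_family      = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port        = htons(8080);	/* placeholder port */

	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
	    setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)) < 0 ||
	    listen(fd, 128) < 0)
		return -1;
	return fd;
}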
1374
1375 static int tcp_v4_conn_req_fastopen(struct sock *sk,
1376                                     struct sk_buff *skb,
1377                                     struct sk_buff *skb_synack,
1378                                     struct request_sock *req,
1379                                     struct request_values *rvp)
1380 {
1381         struct tcp_sock *tp = tcp_sk(sk);
1382         struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
1383         const struct inet_request_sock *ireq = inet_rsk(req);
1384         struct sock *child;
1385
1386         req->retrans = 0;
1387         req->sk = NULL;
1388
1389         child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
1390         if (child == NULL) {
1391                 NET_INC_STATS_BH(sock_net(sk),
1392                                  LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1393                 kfree_skb(skb_synack);
1394                 return -1;
1395         }
1396         ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1397                         ireq->rmt_addr, ireq->opt);
1398         /* XXX (TFO) - is it ok to ignore error and continue? */
1399
1400         spin_lock(&queue->fastopenq->lock);
1401         queue->fastopenq->qlen++;
1402         spin_unlock(&queue->fastopenq->lock);
1403
1404         /* Initialize the child socket. Have to fix some values to take
1405          * into account that the child is a Fast Open socket and is created
1406          * only out of the bits carried in the SYN packet.
1407          */
1408         tp = tcp_sk(child);
1409
1410         tp->fastopen_rsk = req;
1411         /* Do a hold on the listener sk so that if the listener is being
1412          * closed, the child that has been accepted can live on and still
1413          * access listen_lock.
1414          */
1415         sock_hold(sk);
1416         tcp_rsk(req)->listener = sk;
1417
1418         /* RFC1323: The window in SYN & SYN/ACK segments is never
1419          * scaled. So correct it appropriately.
1420          */
1421         tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
1422
1423         /* Activate the retrans timer so that SYNACK can be retransmitted.
1424          * The request socket is not added to the SYN table of the parent
1425          * because it's been added to the accept queue directly.
1426          */
1427         inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
1428             TCP_TIMEOUT_INIT, TCP_RTO_MAX);
1429
1430         /* Add the child socket directly into the accept queue */
1431         inet_csk_reqsk_queue_add(sk, req, child);
1432
1433         /* Now finish processing the fastopen child socket. */
1434         inet_csk(child)->icsk_af_ops->rebuild_header(child);
1435         tcp_init_congestion_control(child);
1436         tcp_mtup_init(child);
1437         tcp_init_buffer_space(child);
1438         tcp_init_metrics(child);
1439
1440         /* Queue the data carried in the SYN packet. We need to first
1441          * bump skb's refcnt because the caller will attempt to free it.
1442          *
1443          * XXX (TFO) - we honor a zero-payload TFO request for now.
1444          * (Any reason not to?)
1445          */
1446         if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) {
1447                 /* Don't queue the skb if there is no payload in SYN.
1448                  * XXX (TFO) - How about SYN+FIN?
1449                  */
1450                 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1451         } else {
1452                 skb = skb_get(skb);
1453                 skb_dst_drop(skb);
1454                 __skb_pull(skb, tcp_hdr(skb)->doff * 4);
1455                 skb_set_owner_r(skb, child);
1456                 __skb_queue_tail(&child->sk_receive_queue, skb);
1457                 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1458         }
1459         sk->sk_data_ready(sk, 0);
1460         bh_unlock_sock(child);
1461         sock_put(child);
1462         WARN_ON(req->sk == NULL);
1463         return 0;
1464 }
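/* Illustrative userspace sketch (not part of this file): the Fast Open path
 * implemented by tcp_fastopen_check()/tcp_v4_conn_req_fastopen() above is
 * only exercised when the listener has opted in, typically with something
 * like the following, where qlen is a hypothetical per-listener limit on
 * pending Fast Open requests (it ends up in fastopenq->max_qlen):
 *
 *	#include <netinet/tcp.h>
 *
 *	int qlen = 16;
 *	setsockopt(listen_fd, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen));
 *
 * Server-side TFO must also be enabled through the net.ipv4.tcp_fastopen
 * sysctl (the TFO_SERVER_ENABLE bit).
 */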
1465
1466 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1467 {
1468         struct tcp_extend_values tmp_ext;
1469         struct tcp_options_received tmp_opt;
1470         const u8 *hash_location;
1471         struct request_sock *req;
1472         struct inet_request_sock *ireq;
1473         struct tcp_sock *tp = tcp_sk(sk);
1474         struct dst_entry *dst = NULL;
1475         __be32 saddr = ip_hdr(skb)->saddr;
1476         __be32 daddr = ip_hdr(skb)->daddr;
1477         __u32 isn = TCP_SKB_CB(skb)->when;
1478         bool want_cookie = false;
1479         struct flowi4 fl4;
1480         struct tcp_fastopen_cookie foc = { .len = -1 };
1481         struct tcp_fastopen_cookie valid_foc = { .len = -1 };
1482         struct sk_buff *skb_synack;
1483         int do_fastopen;
1484
1485         /* Never answer SYNs sent to broadcast or multicast addresses */
1486         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1487                 goto drop;
1488
1489         /* TW buckets are converted to open requests without
1490          * limitation; they conserve resources and the peer is
1491          * evidently a real one.
1492          */
1493         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1494                 want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
1495                 if (!want_cookie)
1496                         goto drop;
1497         }
1498
1499         /* Accept backlog is full. If we have already queued enough
1500          * warm entries in the SYN queue, drop this request. That is better
1501          * than clogging the SYN queue with open requests whose timeouts
1502          * grow exponentially.
1503          */
1504         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1505                 goto drop;
1506
1507         req = inet_reqsk_alloc(&tcp_request_sock_ops);
1508         if (!req)
1509                 goto drop;
1510
1511 #ifdef CONFIG_TCP_MD5SIG
1512         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1513 #endif
1514
1515         tcp_clear_options(&tmp_opt);
1516         tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1517         tmp_opt.user_mss  = tp->rx_opt.user_mss;
1518         tcp_parse_options(skb, &tmp_opt, &hash_location, 0,
1519             want_cookie ? NULL : &foc);
1520
1521         if (tmp_opt.cookie_plus > 0 &&
1522             tmp_opt.saw_tstamp &&
1523             !tp->rx_opt.cookie_out_never &&
1524             (sysctl_tcp_cookie_size > 0 ||
1525              (tp->cookie_values != NULL &&
1526               tp->cookie_values->cookie_desired > 0))) {
1527                 u8 *c;
1528                 u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1529                 int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1530
1531                 if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1532                         goto drop_and_release;
1533
1534                 /* Secret recipe starts with IP addresses */
1535                 *mess++ ^= (__force u32)daddr;
1536                 *mess++ ^= (__force u32)saddr;
1537
1538                 /* plus variable length Initiator Cookie */
1539                 c = (u8 *)mess;
1540                 while (l-- > 0)
1541                         *c++ ^= *hash_location++;
1542
1543                 want_cookie = false;    /* not our kind of cookie */
1544                 tmp_ext.cookie_out_never = 0; /* false */
1545                 tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1546         } else if (!tp->rx_opt.cookie_in_always) {
1547                 /* redundant indications, but ensure initialization. */
1548                 tmp_ext.cookie_out_never = 1; /* true */
1549                 tmp_ext.cookie_plus = 0;
1550         } else {
1551                 goto drop_and_release;
1552         }
1553         tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1554
1555         if (want_cookie && !tmp_opt.saw_tstamp)
1556                 tcp_clear_options(&tmp_opt);
1557
1558         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1559         tcp_openreq_init(req, &tmp_opt, skb);
1560
1561         ireq = inet_rsk(req);
1562         ireq->loc_addr = daddr;
1563         ireq->rmt_addr = saddr;
1564         ireq->no_srccheck = inet_sk(sk)->transparent;
1565         ireq->opt = tcp_v4_save_options(sk, skb);
1566
1567         if (security_inet_conn_request(sk, skb, req))
1568                 goto drop_and_free;
1569
1570         if (!want_cookie || tmp_opt.tstamp_ok)
1571                 TCP_ECN_create_request(req, skb);
1572
1573         if (want_cookie) {
1574                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1575                 req->cookie_ts = tmp_opt.tstamp_ok;
1576         } else if (!isn) {
1577                 /* VJ's idea. We save the last timestamp seen from the
1578                  * destination in the peer table when entering TIME-WAIT
1579                  * state, and check against it before accepting a new
1580                  * connection request.
1581                  *
1582                  * If "isn" is not zero, this request hit a live timewait
1583                  * bucket, so all the necessary checks were already made
1584                  * by the code that processes the timewait state.
1585                  */
1586                 if (tmp_opt.saw_tstamp &&
1587                     tcp_death_row.sysctl_tw_recycle &&
1588                     (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1589                     fl4.daddr == saddr) {
1590                         if (!tcp_peer_is_proven(req, dst, true)) {
1591                                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1592                                 goto drop_and_release;
1593                         }
1594                 }
1595                 /* Kill the following clause if you dislike this approach. */
1596                 else if (!sysctl_tcp_syncookies &&
1597                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1598                           (sysctl_max_syn_backlog >> 2)) &&
1599                          !tcp_peer_is_proven(req, dst, false)) {
1600                         /* Without syncookies, the last quarter of the
1601                          * backlog is reserved for destinations proven
1602                          * to be alive.
1603                          * That way we keep communicating only with
1604                          * destinations we already remembered at the
1605                          * moment the synflood started.
1606                          */
1607                         LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
1608                                        &saddr, ntohs(tcp_hdr(skb)->source));
1609                         goto drop_and_release;
1610                 }
1611
1612                 isn = tcp_v4_init_sequence(skb);
1613         }
1614         tcp_rsk(req)->snt_isn = isn;
1615         tcp_rsk(req)->snt_synack = tcp_time_stamp;
1616
1617         if (dst == NULL) {
1618                 dst = inet_csk_route_req(sk, &fl4, req);
1619                 if (dst == NULL)
1620                         goto drop_and_free;
1621         }
1622         do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc);
1623
1624         /* We don't call tcp_v4_send_synack() directly because we need
1625          * to make sure a child socket can be created successfully before
1626          * sending back synack!
1627          *
1628          * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack()
1629          * (or better yet, call tcp_send_synack() in the child context
1630          * directly, but will have to fix bunch of other code first)
1631          * after syn_recv_sock() except one will need to first fix the
1632          * latter to remove its dependency on the current implementation
1633          * of tcp_v4_send_synack()->tcp_select_initial_window().
1634          */
1635         skb_synack = tcp_make_synack(sk, dst, req,
1636             (struct request_values *)&tmp_ext,
1637             fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);
1638
1639         if (skb_synack) {
1640                 __tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr);
1641                 skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
1642         } else
1643                 goto drop_and_free;
1644
1645         if (likely(!do_fastopen)) {
1646                 int err;
1647                 err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1648                      ireq->rmt_addr, ireq->opt);
1649                 err = net_xmit_eval(err);
1650                 if (err || want_cookie)
1651                         goto drop_and_free;
1652
1653                 tcp_rsk(req)->listener = NULL;
1654                 /* Add the request_sock to the SYN table */
1655                 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1656                 if (fastopen_cookie_present(&foc) && foc.len != 0)
1657                         NET_INC_STATS_BH(sock_net(sk),
1658                             LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1659         } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req,
1660             (struct request_values *)&tmp_ext))
1661                 goto drop_and_free;
1662
1663         return 0;
1664
1665 drop_and_release:
1666         dst_release(dst);
1667 drop_and_free:
1668         reqsk_free(req);
1669 drop:
1670         return 0;
1671 }
1672 EXPORT_SYMBOL(tcp_v4_conn_request);
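/* The behaviour above is governed by a handful of sysctls:
 * net.ipv4.tcp_syncookies drives tcp_syn_flood_action()/want_cookie,
 * net.ipv4.tcp_max_syn_backlog is sysctl_max_syn_backlog in the
 * "last quarter of the backlog" check, and net.ipv4.tcp_tw_recycle is
 * tcp_death_row.sysctl_tw_recycle in the per-peer timestamp (PAWS) check.
 */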
1673
1674
1675 /*
1676  * The three way handshake has completed - we got a valid synack -
1677  * now create the new socket.
1678  */
1679 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1680                                   struct request_sock *req,
1681                                   struct dst_entry *dst)
1682 {
1683         struct inet_request_sock *ireq;
1684         struct inet_sock *newinet;
1685         struct tcp_sock *newtp;
1686         struct sock *newsk;
1687 #ifdef CONFIG_TCP_MD5SIG
1688         struct tcp_md5sig_key *key;
1689 #endif
1690         struct ip_options_rcu *inet_opt;
1691
1692         if (sk_acceptq_is_full(sk))
1693                 goto exit_overflow;
1694
1695         newsk = tcp_create_openreq_child(sk, req, skb);
1696         if (!newsk)
1697                 goto exit_nonewsk;
1698
1699         newsk->sk_gso_type = SKB_GSO_TCPV4;
1700         inet_sk_rx_dst_set(newsk, skb);
1701
1702         newtp                 = tcp_sk(newsk);
1703         newinet               = inet_sk(newsk);
1704         ireq                  = inet_rsk(req);
1705         newinet->inet_daddr   = ireq->rmt_addr;
1706         newinet->inet_rcv_saddr = ireq->loc_addr;
1707         newinet->inet_saddr           = ireq->loc_addr;
1708         inet_opt              = ireq->opt;
1709         rcu_assign_pointer(newinet->inet_opt, inet_opt);
1710         ireq->opt             = NULL;
1711         newinet->mc_index     = inet_iif(skb);
1712         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1713         newinet->rcv_tos      = ip_hdr(skb)->tos;
1714         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1715         if (inet_opt)
1716                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1717         newinet->inet_id = newtp->write_seq ^ jiffies;
1718
1719         if (!dst) {
1720                 dst = inet_csk_route_child_sock(sk, newsk, req);
1721                 if (!dst)
1722                         goto put_and_exit;
1723         } else {
1724                 /* syncookie case : see end of cookie_v4_check() */
1725         }
1726         sk_setup_caps(newsk, dst);
1727
1728         tcp_mtup_init(newsk);
1729         tcp_sync_mss(newsk, dst_mtu(dst));
1730         newtp->advmss = dst_metric_advmss(dst);
1731         if (tcp_sk(sk)->rx_opt.user_mss &&
1732             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1733                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1734
1735         tcp_initialize_rcv_mss(newsk);
1736         tcp_synack_rtt_meas(newsk, req);
1737         newtp->total_retrans = req->retrans;
1738
1739 #ifdef CONFIG_TCP_MD5SIG
1740         /* Copy over the MD5 key from the original socket */
1741         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1742                                 AF_INET);
1743         if (key != NULL) {
1744                 /*
1745                  * We're using one, so create a matching key
1746                  * on the newsk structure. If we fail to get
1747                  * memory, then we end up not copying the key
1748                  * across. Shucks.
1749                  */
1750                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1751                                AF_INET, key->key, key->keylen, GFP_ATOMIC);
1752                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1753         }
1754 #endif
1755
1756         if (__inet_inherit_port(sk, newsk) < 0)
1757                 goto put_and_exit;
1758         __inet_hash_nolisten(newsk, NULL);
1759
1760         return newsk;
1761
1762 exit_overflow:
1763         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1764 exit_nonewsk:
1765         dst_release(dst);
1766 exit:
1767         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1768         return NULL;
1769 put_and_exit:
1770         tcp_clear_xmit_timers(newsk);
1771         tcp_cleanup_congestion_control(newsk);
1772         bh_unlock_sock(newsk);
1773         sock_put(newsk);
1774         goto exit;
1775 }
1776 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1777
1778 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1779 {
1780         struct tcphdr *th = tcp_hdr(skb);
1781         const struct iphdr *iph = ip_hdr(skb);
1782         struct sock *nsk;
1783         struct request_sock **prev;
1784         /* Find possible connection requests. */
1785         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1786                                                        iph->saddr, iph->daddr);
1787         if (req)
1788                 return tcp_check_req(sk, skb, req, prev, false);
1789
1790         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1791                         th->source, iph->daddr, th->dest, inet_iif(skb));
1792
1793         if (nsk) {
1794                 if (nsk->sk_state != TCP_TIME_WAIT) {
1795                         bh_lock_sock(nsk);
1796                         return nsk;
1797                 }
1798                 inet_twsk_put(inet_twsk(nsk));
1799                 return NULL;
1800         }
1801
1802 #ifdef CONFIG_SYN_COOKIES
1803         if (!th->syn)
1804                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1805 #endif
1806         return sk;
1807 }
1808
1809 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1810 {
1811         const struct iphdr *iph = ip_hdr(skb);
1812
1813         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1814                 if (!tcp_v4_check(skb->len, iph->saddr,
1815                                   iph->daddr, skb->csum)) {
1816                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1817                         return 0;
1818                 }
1819         }
1820
1821         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1822                                        skb->len, IPPROTO_TCP, 0);
1823
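              /* Short packets (<= 76 bytes) are checksummed immediately via
               * __skb_checksum_complete(); deferring the check presumably
               * buys little for such small amounts of data.
               */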
1824         if (skb->len <= 76) {
1825                 return __skb_checksum_complete(skb);
1826         }
1827         return 0;
1828 }
1829
1830
1831 /* The socket must have its spinlock held when we get
1832  * here.
1833  *
1834  * We have a potential double-lock case here, so even when
1835  * doing backlog processing we use the BH locking scheme.
1836  * This is because we cannot sleep with the original spinlock
1837  * held.
1838  */
1839 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1840 {
1841         struct sock *rsk;
1842 #ifdef CONFIG_TCP_MD5SIG
1843         /*
1844          * We really want to reject the packet as early as possible
1845          * if:
1846          *  o We're expecting an MD5'd packet and there is no MD5 TCP option
1847          *  o There is an MD5 option and we're not expecting one
1848          */
1849         if (tcp_v4_inbound_md5_hash(sk, skb))
1850                 goto discard;
1851 #endif
1852
1853         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1854                 struct dst_entry *dst = sk->sk_rx_dst;
1855
1856                 sock_rps_save_rxhash(sk, skb);
1857                 if (dst) {
1858                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1859                             dst->ops->check(dst, 0) == NULL) {
1860                                 dst_release(dst);
1861                                 sk->sk_rx_dst = NULL;
1862                         }
1863                 }
1864                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1865                         rsk = sk;
1866                         goto reset;
1867                 }
1868                 return 0;
1869         }
1870
1871         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1872                 goto csum_err;
1873
1874         if (sk->sk_state == TCP_LISTEN) {
1875                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1876                 if (!nsk)
1877                         goto discard;
1878
1879                 if (nsk != sk) {
1880                         sock_rps_save_rxhash(nsk, skb);
1881                         if (tcp_child_process(sk, nsk, skb)) {
1882                                 rsk = nsk;
1883                                 goto reset;
1884                         }
1885                         return 0;
1886                 }
1887         } else
1888                 sock_rps_save_rxhash(sk, skb);
1889
1890         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1891                 rsk = sk;
1892                 goto reset;
1893         }
1894         return 0;
1895
1896 reset:
1897         tcp_v4_send_reset(rsk, skb);
1898 discard:
1899         kfree_skb(skb);
1900         /* Be careful here. If this function gets more complicated and
1901          * gcc suffers from register pressure on the x86, sk (in %ebx)
1902          * might be destroyed here. This current version compiles correctly,
1903          * but you have been warned.
1904          */
1905         return 0;
1906
1907 csum_err:
1908         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1909         goto discard;
1910 }
1911 EXPORT_SYMBOL(tcp_v4_do_rcv);
1912
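/* Early demux: look up the established socket while the packet is still in
 * the IP layer, so that when the cached sk_rx_dst below is still valid and
 * was learned on the same interface, the per-packet route lookup can be
 * skipped via skb_dst_set_noref().
 */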
1913 void tcp_v4_early_demux(struct sk_buff *skb)
1914 {
1915         struct net *net = dev_net(skb->dev);
1916         const struct iphdr *iph;
1917         const struct tcphdr *th;
1918         struct sock *sk;
1919
1920         if (skb->pkt_type != PACKET_HOST)
1921                 return;
1922
1923         if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr)))
1924                 return;
1925
1926         iph = ip_hdr(skb);
1927         th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb));
1928
1929         if (th->doff < sizeof(struct tcphdr) / 4)
1930                 return;
1931
1932         sk = __inet_lookup_established(net, &tcp_hashinfo,
1933                                        iph->saddr, th->source,
1934                                        iph->daddr, ntohs(th->dest),
1935                                        skb->skb_iif);
1936         if (sk) {
1937                 skb->sk = sk;
1938                 skb->destructor = sock_edemux;
1939                 if (sk->sk_state != TCP_TIME_WAIT) {
1940                         struct dst_entry *dst = sk->sk_rx_dst;
1941
1942                         if (dst)
1943                                 dst = dst_check(dst, 0);
1944                         if (dst &&
1945                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1946                                 skb_dst_set_noref(skb, dst);
1947                 }
1948         }
1949 }
1950
1951 /*
1952  *      From tcp_input.c
1953  */
1954
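/* Main IPv4 receive entry point.  After the header, length and checksum
 * sanity checks, the segment is matched to a socket.  If the socket is not
 * owned by a user context it is processed immediately (possibly via the
 * prequeue); otherwise it is queued on the backlog, bounded by
 * sk_rcvbuf + sk_sndbuf as enforced below.
 */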
1955 int tcp_v4_rcv(struct sk_buff *skb)
1956 {
1957         const struct iphdr *iph;
1958         const struct tcphdr *th;
1959         struct sock *sk;
1960         int ret;
1961         struct net *net = dev_net(skb->dev);
1962
1963         if (skb->pkt_type != PACKET_HOST)
1964                 goto discard_it;
1965
1966         /* Count it even if it's bad */
1967         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1968
1969         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1970                 goto discard_it;
1971
1972         th = tcp_hdr(skb);
1973
1974         if (th->doff < sizeof(struct tcphdr) / 4)
1975                 goto bad_packet;
1976         if (!pskb_may_pull(skb, th->doff * 4))
1977                 goto discard_it;
1978
1979         /* An explanation is required here, I think.
1980          * Packet length and doff are validated by header prediction,
1981          * provided the case of th->doff==0 is eliminated.
1982          * So, we defer the checks. */
1983         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1984                 goto bad_packet;
1985
1986         th = tcp_hdr(skb);
1987         iph = ip_hdr(skb);
1988         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1989         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1990                                     skb->len - th->doff * 4);
1991         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1992         TCP_SKB_CB(skb)->when    = 0;
1993         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1994         TCP_SKB_CB(skb)->sacked  = 0;
1995
1996         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1997         if (!sk)
1998                 goto no_tcp_socket;
1999
2000 process:
2001         if (sk->sk_state == TCP_TIME_WAIT)
2002                 goto do_time_wait;
2003
2004         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2005                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
2006                 goto discard_and_relse;
2007         }
2008
2009         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2010                 goto discard_and_relse;
2011         nf_reset(skb);
2012
2013         if (sk_filter(sk, skb))
2014                 goto discard_and_relse;
2015
2016         skb->dev = NULL;
2017
2018         bh_lock_sock_nested(sk);
2019         ret = 0;
2020         if (!sock_owned_by_user(sk)) {
2021 #ifdef CONFIG_NET_DMA
2022                 struct tcp_sock *tp = tcp_sk(sk);
2023                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
2024                         tp->ucopy.dma_chan = net_dma_find_channel();
2025                 if (tp->ucopy.dma_chan)
2026                         ret = tcp_v4_do_rcv(sk, skb);
2027                 else
2028 #endif
2029                 {
2030                         if (!tcp_prequeue(sk, skb))
2031                                 ret = tcp_v4_do_rcv(sk, skb);
2032                 }
2033         } else if (unlikely(sk_add_backlog(sk, skb,
2034                                            sk->sk_rcvbuf + sk->sk_sndbuf))) {
2035                 bh_unlock_sock(sk);
2036                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
2037                 goto discard_and_relse;
2038         }
2039         bh_unlock_sock(sk);
2040
2041         sock_put(sk);
2042
2043         return ret;
2044
2045 no_tcp_socket:
2046         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2047                 goto discard_it;
2048
2049         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
2050 bad_packet:
2051                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
2052         } else {
2053                 tcp_v4_send_reset(NULL, skb);
2054         }
2055
2056 discard_it:
2057         /* Discard frame. */
2058         kfree_skb(skb);
2059         return 0;
2060
2061 discard_and_relse:
2062         sock_put(sk);
2063         goto discard_it;
2064
2065 do_time_wait:
2066         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2067                 inet_twsk_put(inet_twsk(sk));
2068                 goto discard_it;
2069         }
2070
2071         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
2072                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
2073                 inet_twsk_put(inet_twsk(sk));
2074                 goto discard_it;
2075         }
2076         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2077         case TCP_TW_SYN: {
2078                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2079                                                         &tcp_hashinfo,
2080                                                         iph->daddr, th->dest,
2081                                                         inet_iif(skb));
2082                 if (sk2) {
2083                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
2084                         inet_twsk_put(inet_twsk(sk));
2085                         sk = sk2;
2086                         goto process;
2087                 }
2088                 /* Fall through to ACK */
2089         }
2090         case TCP_TW_ACK:
2091                 tcp_v4_timewait_ack(sk, skb);
2092                 break;
2093         case TCP_TW_RST:
2094                 goto no_tcp_socket;
2095         case TCP_TW_SUCCESS:;
2096         }
2097         goto discard_it;
2098 }
2099
2100 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2101         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2102         .twsk_unique    = tcp_twsk_unique,
2103         .twsk_destructor= tcp_twsk_destructor,
2104 };
2105
2106 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2107 {
2108         struct dst_entry *dst = skb_dst(skb);
2109
2110         dst_hold(dst);
2111         sk->sk_rx_dst = dst;
2112         inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2113 }
2114 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2115
2116 const struct inet_connection_sock_af_ops ipv4_specific = {
2117         .queue_xmit        = ip_queue_xmit,
2118         .send_check        = tcp_v4_send_check,
2119         .rebuild_header    = inet_sk_rebuild_header,
2120         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2121         .conn_request      = tcp_v4_conn_request,
2122         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2123         .net_header_len    = sizeof(struct iphdr),
2124         .setsockopt        = ip_setsockopt,
2125         .getsockopt        = ip_getsockopt,
2126         .addr2sockaddr     = inet_csk_addr2sockaddr,
2127         .sockaddr_len      = sizeof(struct sockaddr_in),
2128         .bind_conflict     = inet_csk_bind_conflict,
2129 #ifdef CONFIG_COMPAT
2130         .compat_setsockopt = compat_ip_setsockopt,
2131         .compat_getsockopt = compat_ip_getsockopt,
2132 #endif
2133 };
2134 EXPORT_SYMBOL(ipv4_specific);
2135
2136 #ifdef CONFIG_TCP_MD5SIG
2137 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2138         .md5_lookup             = tcp_v4_md5_lookup,
2139         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2140         .md5_parse              = tcp_v4_parse_md5_keys,
2141 };
2142 #endif
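/* Illustrative userspace sketch (not part of this file): peers install the
 * keys consumed by these hooks with the TCP_MD5SIG socket option, roughly:
 *
 *	struct tcp_md5sig md5 = { 0 };
 *
 *	memcpy(&md5.tcpm_addr, &peer_addr, sizeof(peer_addr));
 *	md5.tcpm_keylen = strlen(secret);
 *	memcpy(md5.tcpm_key, secret, md5.tcpm_keylen);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * peer_addr and secret are placeholders; see the uapi struct tcp_md5sig for
 * the exact field layout.
 */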
2143
2144 /* NOTE: A lot of things are set to zero explicitly by the call to
2145  *       sk_alloc(), so they need not be done here.
2146  */
2147 static int tcp_v4_init_sock(struct sock *sk)
2148 {
2149         struct inet_connection_sock *icsk = inet_csk(sk);
2150
2151         tcp_init_sock(sk);
2152
2153         icsk->icsk_af_ops = &ipv4_specific;
2154
2155 #ifdef CONFIG_TCP_MD5SIG
2156         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2157 #endif
2158
2159         return 0;
2160 }
2161
2162 void tcp_v4_destroy_sock(struct sock *sk)
2163 {
2164         struct tcp_sock *tp = tcp_sk(sk);
2165
2166         tcp_clear_xmit_timers(sk);
2167
2168         tcp_cleanup_congestion_control(sk);
2169
2170         /* Clean up the write buffer. */
2171         tcp_write_queue_purge(sk);
2172
2173         /* Cleans up our, hopefully empty, out_of_order_queue. */
2174         __skb_queue_purge(&tp->out_of_order_queue);
2175
2176 #ifdef CONFIG_TCP_MD5SIG
2177         /* Clean up the MD5 key list, if any */
2178         if (tp->md5sig_info) {
2179                 tcp_clear_md5_list(sk);
2180                 kfree_rcu(tp->md5sig_info, rcu);
2181                 tp->md5sig_info = NULL;
2182         }
2183 #endif
2184
2185 #ifdef CONFIG_NET_DMA
2186         /* Cleans up our sk_async_wait_queue */
2187         __skb_queue_purge(&sk->sk_async_wait_queue);
2188 #endif
2189
2190         /* Clean up the prequeue; it really should be empty */
2191         __skb_queue_purge(&tp->ucopy.prequeue);
2192
2193         /* Clean up a referenced TCP bind bucket. */
2194         if (inet_csk(sk)->icsk_bind_hash)
2195                 inet_put_port(sk);
2196
2197         /*
2198          * If a cached sendmsg page exists, toss it.
2199          */
2200         if (sk->sk_sndmsg_page) {
2201                 __free_page(sk->sk_sndmsg_page);
2202                 sk->sk_sndmsg_page = NULL;
2203         }
2204
2205         /* TCP Cookie Transactions */
2206         if (tp->cookie_values != NULL) {
2207                 kref_put(&tp->cookie_values->kref,
2208                          tcp_cookie_values_release);
2209                 tp->cookie_values = NULL;
2210         }
2211         BUG_ON(tp->fastopen_rsk != NULL);
2212
2213         /* If socket is aborted during connect operation */
2214         tcp_free_fastopen_req(tp);
2215
2216         sk_sockets_allocated_dec(sk);
2217         sock_release_memcg(sk);
2218 }
2219 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2220
2221 #ifdef CONFIG_PROC_FS
2222 /* Proc filesystem TCP sock list dumping. */
2223
2224 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
2225 {
2226         return hlist_nulls_empty(head) ? NULL :
2227                 list_entry(head->first, struct inet_timewait_sock, tw_node);
2228 }
2229
2230 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
2231 {
2232         return !is_a_nulls(tw->tw_node.next) ?
2233                 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2234 }
2235
2236 /*
2237  * Get the next listener socket following cur.  If cur is NULL, get the first
2238  * socket starting from the bucket given in st->bucket; when st->bucket is zero,
2239  * the very first socket in the hash table is returned.
2240  */
2241 static void *listening_get_next(struct seq_file *seq, void *cur)
2242 {
2243         struct inet_connection_sock *icsk;
2244         struct hlist_nulls_node *node;
2245         struct sock *sk = cur;
2246         struct inet_listen_hashbucket *ilb;
2247         struct tcp_iter_state *st = seq->private;
2248         struct net *net = seq_file_net(seq);
2249
2250         if (!sk) {
2251                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2252                 spin_lock_bh(&ilb->lock);
2253                 sk = sk_nulls_head(&ilb->head);
2254                 st->offset = 0;
2255                 goto get_sk;
2256         }
2257         ilb = &tcp_hashinfo.listening_hash[st->bucket];
2258         ++st->num;
2259         ++st->offset;
2260
2261         if (st->state == TCP_SEQ_STATE_OPENREQ) {
2262                 struct request_sock *req = cur;
2263
2264                 icsk = inet_csk(st->syn_wait_sk);
2265                 req = req->dl_next;
2266                 while (1) {
2267                         while (req) {
2268                                 if (req->rsk_ops->family == st->family) {
2269                                         cur = req;
2270                                         goto out;
2271                                 }
2272                                 req = req->dl_next;
2273                         }
2274                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2275                                 break;
2276 get_req:
2277                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2278                 }
2279                 sk        = sk_nulls_next(st->syn_wait_sk);
2280                 st->state = TCP_SEQ_STATE_LISTENING;
2281                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2282         } else {
2283                 icsk = inet_csk(sk);
2284                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2285                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2286                         goto start_req;
2287                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2288                 sk = sk_nulls_next(sk);
2289         }
2290 get_sk:
2291         sk_nulls_for_each_from(sk, node) {
2292                 if (!net_eq(sock_net(sk), net))
2293                         continue;
2294                 if (sk->sk_family == st->family) {
2295                         cur = sk;
2296                         goto out;
2297                 }
2298                 icsk = inet_csk(sk);
2299                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2300                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2301 start_req:
2302                         st->uid         = sock_i_uid(sk);
2303                         st->syn_wait_sk = sk;
2304                         st->state       = TCP_SEQ_STATE_OPENREQ;
2305                         st->sbucket     = 0;
2306                         goto get_req;
2307                 }
2308                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2309         }
2310         spin_unlock_bh(&ilb->lock);
2311         st->offset = 0;
2312         if (++st->bucket < INET_LHTABLE_SIZE) {
2313                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2314                 spin_lock_bh(&ilb->lock);
2315                 sk = sk_nulls_head(&ilb->head);
2316                 goto get_sk;
2317         }
2318         cur = NULL;
2319 out:
2320         return cur;
2321 }
2322
2323 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2324 {
2325         struct tcp_iter_state *st = seq->private;
2326         void *rc;
2327
2328         st->bucket = 0;
2329         st->offset = 0;
2330         rc = listening_get_next(seq, NULL);
2331
2332         while (rc && *pos) {
2333                 rc = listening_get_next(seq, rc);
2334                 --*pos;
2335         }
2336         return rc;
2337 }
2338
2339 static inline bool empty_bucket(struct tcp_iter_state *st)
2340 {
2341         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2342                 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2343 }
2344
2345 /*
2346  * Get the first established socket, starting from the bucket given in
2347  * st->bucket; if st->bucket is zero, the very first socket in the hash is returned.
2348  */
2349 static void *established_get_first(struct seq_file *seq)
2350 {
2351         struct tcp_iter_state *st = seq->private;
2352         struct net *net = seq_file_net(seq);
2353         void *rc = NULL;
2354
2355         st->offset = 0;
2356         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2357                 struct sock *sk;
2358                 struct hlist_nulls_node *node;
2359                 struct inet_timewait_sock *tw;
2360                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2361
2362                 /* Lockless fast path for the common case of empty buckets */
2363                 if (empty_bucket(st))
2364                         continue;
2365
2366                 spin_lock_bh(lock);
2367                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2368                         if (sk->sk_family != st->family ||
2369                             !net_eq(sock_net(sk), net)) {
2370                                 continue;
2371                         }
2372                         rc = sk;
2373                         goto out;
2374                 }
2375                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2376                 inet_twsk_for_each(tw, node,
2377                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2378                         if (tw->tw_family != st->family ||
2379                             !net_eq(twsk_net(tw), net)) {
2380                                 continue;
2381                         }
2382                         rc = tw;
2383                         goto out;
2384                 }
2385                 spin_unlock_bh(lock);
2386                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2387         }
2388 out:
2389         return rc;
2390 }
2391
2392 static void *established_get_next(struct seq_file *seq, void *cur)
2393 {
2394         struct sock *sk = cur;
2395         struct inet_timewait_sock *tw;
2396         struct hlist_nulls_node *node;
2397         struct tcp_iter_state *st = seq->private;
2398         struct net *net = seq_file_net(seq);
2399
2400         ++st->num;
2401         ++st->offset;
2402
2403         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2404                 tw = cur;
2405                 tw = tw_next(tw);
2406 get_tw:
2407                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2408                         tw = tw_next(tw);
2409                 }
2410                 if (tw) {
2411                         cur = tw;
2412                         goto out;
2413                 }
2414                 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2415                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2416
2417                 /* Look for the next non-empty bucket */
2418                 st->offset = 0;
2419                 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2420                                 empty_bucket(st))
2421                         ;
2422                 if (st->bucket > tcp_hashinfo.ehash_mask)
2423                         return NULL;
2424
2425                 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2426                 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2427         } else
2428                 sk = sk_nulls_next(sk);
2429
2430         sk_nulls_for_each_from(sk, node) {
2431                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2432                         goto found;
2433         }
2434
2435         st->state = TCP_SEQ_STATE_TIME_WAIT;
2436         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2437         goto get_tw;
2438 found:
2439         cur = sk;
2440 out:
2441         return cur;
2442 }
2443
2444 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2445 {
2446         struct tcp_iter_state *st = seq->private;
2447         void *rc;
2448
2449         st->bucket = 0;
2450         rc = established_get_first(seq);
2451
2452         while (rc && pos) {
2453                 rc = established_get_next(seq, rc);
2454                 --pos;
2455         }
2456         return rc;
2457 }
2458
2459 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2460 {
2461         void *rc;
2462         struct tcp_iter_state *st = seq->private;
2463
2464         st->state = TCP_SEQ_STATE_LISTENING;
2465         rc        = listening_get_idx(seq, &pos);
2466
2467         if (!rc) {
2468                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2469                 rc        = established_get_idx(seq, pos);
2470         }
2471
2472         return rc;
2473 }
2474
2475 static void *tcp_seek_last_pos(struct seq_file *seq)
2476 {
2477         struct tcp_iter_state *st = seq->private;
2478         int offset = st->offset;
2479         int orig_num = st->num;
2480         void *rc = NULL;
2481
2482         switch (st->state) {
2483         case TCP_SEQ_STATE_OPENREQ:
2484         case TCP_SEQ_STATE_LISTENING:
2485                 if (st->bucket >= INET_LHTABLE_SIZE)
2486                         break;
2487                 st->state = TCP_SEQ_STATE_LISTENING;
2488                 rc = listening_get_next(seq, NULL);
2489                 while (offset-- && rc)
2490                         rc = listening_get_next(seq, rc);
2491                 if (rc)
2492                         break;
2493                 st->bucket = 0;
2494                 /* Fallthrough */
2495         case TCP_SEQ_STATE_ESTABLISHED:
2496         case TCP_SEQ_STATE_TIME_WAIT:
2497                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2498                 if (st->bucket > tcp_hashinfo.ehash_mask)
2499                         break;
2500                 rc = established_get_first(seq);
2501                 while (offset-- && rc)
2502                         rc = established_get_next(seq, rc);
2503         }
2504
2505         st->num = orig_num;
2506
2507         return rc;
2508 }
2509
2510 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2511 {
2512         struct tcp_iter_state *st = seq->private;
2513         void *rc;
2514
2515         if (*pos && *pos == st->last_pos) {
2516                 rc = tcp_seek_last_pos(seq);
2517                 if (rc)
2518                         goto out;
2519         }
2520
2521         st->state = TCP_SEQ_STATE_LISTENING;
2522         st->num = 0;
2523         st->bucket = 0;
2524         st->offset = 0;
2525         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2526
2527 out:
2528         st->last_pos = *pos;
2529         return rc;
2530 }
2531
2532 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2533 {
2534         struct tcp_iter_state *st = seq->private;
2535         void *rc = NULL;
2536
2537         if (v == SEQ_START_TOKEN) {
2538                 rc = tcp_get_idx(seq, 0);
2539                 goto out;
2540         }
2541
2542         switch (st->state) {
2543         case TCP_SEQ_STATE_OPENREQ:
2544         case TCP_SEQ_STATE_LISTENING:
2545                 rc = listening_get_next(seq, v);
2546                 if (!rc) {
2547                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2548                         st->bucket = 0;
2549                         st->offset = 0;
2550                         rc        = established_get_first(seq);
2551                 }
2552                 break;
2553         case TCP_SEQ_STATE_ESTABLISHED:
2554         case TCP_SEQ_STATE_TIME_WAIT:
2555                 rc = established_get_next(seq, v);
2556                 break;
2557         }
2558 out:
2559         ++*pos;
2560         st->last_pos = *pos;
2561         return rc;
2562 }
2563
2564 static void tcp_seq_stop(struct seq_file *seq, void *v)
2565 {
2566         struct tcp_iter_state *st = seq->private;
2567
2568         switch (st->state) {
2569         case TCP_SEQ_STATE_OPENREQ:
2570                 if (v) {
2571                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2572                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2573                 }
2574         case TCP_SEQ_STATE_LISTENING:
2575                 if (v != SEQ_START_TOKEN)
2576                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2577                 break;
2578         case TCP_SEQ_STATE_TIME_WAIT:
2579         case TCP_SEQ_STATE_ESTABLISHED:
2580                 if (v)
2581                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2582                 break;
2583         }
2584 }
2585
2586 int tcp_seq_open(struct inode *inode, struct file *file)
2587 {
2588         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2589         struct tcp_iter_state *s;
2590         int err;
2591
2592         err = seq_open_net(inode, file, &afinfo->seq_ops,
2593                           sizeof(struct tcp_iter_state));
2594         if (err < 0)
2595                 return err;
2596
2597         s = ((struct seq_file *)file->private_data)->private;
2598         s->family               = afinfo->family;
2599         s->last_pos             = 0;
2600         return 0;
2601 }
2602 EXPORT_SYMBOL(tcp_seq_open);
2603
2604 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2605 {
2606         int rc = 0;
2607         struct proc_dir_entry *p;
2608
2609         afinfo->seq_ops.start           = tcp_seq_start;
2610         afinfo->seq_ops.next            = tcp_seq_next;
2611         afinfo->seq_ops.stop            = tcp_seq_stop;
2612
2613         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2614                              afinfo->seq_fops, afinfo);
2615         if (!p)
2616                 rc = -ENOMEM;
2617         return rc;
2618 }
2619 EXPORT_SYMBOL(tcp_proc_register);
2620
2621 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2622 {
2623         proc_net_remove(net, afinfo->name);
2624 }
2625 EXPORT_SYMBOL(tcp_proc_unregister);
2626
2627 static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2628                          struct seq_file *f, int i, kuid_t uid, int *len)
2629 {
2630         const struct inet_request_sock *ireq = inet_rsk(req);
2631         long delta = req->expires - jiffies;
2632
2633         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2634                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2635                 i,
2636                 ireq->loc_addr,
2637                 ntohs(inet_sk(sk)->inet_sport),
2638                 ireq->rmt_addr,
2639                 ntohs(ireq->rmt_port),
2640                 TCP_SYN_RECV,
2641                 0, 0, /* could print option size, but that is af dependent. */
2642                 1,    /* timers active (only the expire timer) */
2643                 jiffies_delta_to_clock_t(delta),
2644                 req->retrans,
2645                 from_kuid_munged(seq_user_ns(f), uid),
2646                 0,  /* non standard timer */
2647                 0, /* open_requests have no inode */
2648                 atomic_read(&sk->sk_refcnt),
2649                 req,
2650                 len);
2651 }
2652
2653 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2654 {
2655         int timer_active;
2656         unsigned long timer_expires;
2657         const struct tcp_sock *tp = tcp_sk(sk);
2658         const struct inet_connection_sock *icsk = inet_csk(sk);
2659         const struct inet_sock *inet = inet_sk(sk);
2660         struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
2661         __be32 dest = inet->inet_daddr;
2662         __be32 src = inet->inet_rcv_saddr;
2663         __u16 destp = ntohs(inet->inet_dport);
2664         __u16 srcp = ntohs(inet->inet_sport);
2665         int rx_queue;
2666
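        /* timer_active codes as reported in /proc/net/tcp: 1 retransmit,
         * 2 another timer pending on sk_timer (e.g. keepalive), 4 zero
         * window probe, 0 none; TIME_WAIT entries report 3 (see
         * get_timewait4_sock() below).
         */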
2667         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2668                 timer_active    = 1;
2669                 timer_expires   = icsk->icsk_timeout;
2670         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2671                 timer_active    = 4;
2672                 timer_expires   = icsk->icsk_timeout;
2673         } else if (timer_pending(&sk->sk_timer)) {
2674                 timer_active    = 2;
2675                 timer_expires   = sk->sk_timer.expires;
2676         } else {
2677                 timer_active    = 0;
2678                 timer_expires = jiffies;
2679         }
2680
2681         if (sk->sk_state == TCP_LISTEN)
2682                 rx_queue = sk->sk_ack_backlog;
2683         else
2684                 /*
2685                  * Because we don't lock the socket, we might find a transient negative value
2686                  */
2687                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2688
2689         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2690                         "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
2691                 i, src, srcp, dest, destp, sk->sk_state,
2692                 tp->write_seq - tp->snd_una,
2693                 rx_queue,
2694                 timer_active,
2695                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2696                 icsk->icsk_retransmits,
2697                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2698                 icsk->icsk_probes_out,
2699                 sock_i_ino(sk),
2700                 atomic_read(&sk->sk_refcnt), sk,
2701                 jiffies_to_clock_t(icsk->icsk_rto),
2702                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2703                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2704                 tp->snd_cwnd,
2705                 sk->sk_state == TCP_LISTEN ?
2706                     (fastopenq ? fastopenq->max_qlen : 0) :
2707                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh),
2708                 len);
2709 }
2710
2711 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2712                                struct seq_file *f, int i, int *len)
2713 {
2714         __be32 dest, src;
2715         __u16 destp, srcp;
2716         long delta = tw->tw_ttd - jiffies;
2717
2718         dest  = tw->tw_daddr;
2719         src   = tw->tw_rcv_saddr;
2720         destp = ntohs(tw->tw_dport);
2721         srcp  = ntohs(tw->tw_sport);
2722
2723         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2724                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2725                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2726                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2727                 atomic_read(&tw->tw_refcnt), tw, len);
2728 }
2729
2730 #define TMPSZ 150
2731
2732 static int tcp4_seq_show(struct seq_file *seq, void *v)
2733 {
2734         struct tcp_iter_state *st;
2735         int len;
2736
2737         if (v == SEQ_START_TOKEN) {
2738                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2739                            "  sl  local_address rem_address   st tx_queue "
2740                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2741                            "inode");
2742                 goto out;
2743         }
2744         st = seq->private;
2745
2746         switch (st->state) {
2747         case TCP_SEQ_STATE_LISTENING:
2748         case TCP_SEQ_STATE_ESTABLISHED:
2749                 get_tcp4_sock(v, seq, st->num, &len);
2750                 break;
2751         case TCP_SEQ_STATE_OPENREQ:
2752                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2753                 break;
2754         case TCP_SEQ_STATE_TIME_WAIT:
2755                 get_timewait4_sock(v, seq, st->num, &len);
2756                 break;
2757         }
2758         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2759 out:
2760         return 0;
2761 }
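/* A resulting /proc/net/tcp line looks like (illustrative values):
 *
 *    0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 ...
 *
 * Addresses are the raw __be32 printed with %08X, so on a little-endian host
 * 127.0.0.1 appears as 0100007F; ports are hexadecimal (0x0016 == 22) and the
 * "st" column is the socket state (0x0A == TCP_LISTEN).
 */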
2762
2763 static const struct file_operations tcp_afinfo_seq_fops = {
2764         .owner   = THIS_MODULE,
2765         .open    = tcp_seq_open,
2766         .read    = seq_read,
2767         .llseek  = seq_lseek,
2768         .release = seq_release_net
2769 };
2770
2771 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2772         .name           = "tcp",
2773         .family         = AF_INET,
2774         .seq_fops       = &tcp_afinfo_seq_fops,
2775         .seq_ops        = {
2776                 .show           = tcp4_seq_show,
2777         },
2778 };
2779
2780 static int __net_init tcp4_proc_init_net(struct net *net)
2781 {
2782         return tcp_proc_register(net, &tcp4_seq_afinfo);
2783 }
2784
2785 static void __net_exit tcp4_proc_exit_net(struct net *net)
2786 {
2787         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2788 }
2789
2790 static struct pernet_operations tcp4_net_ops = {
2791         .init = tcp4_proc_init_net,
2792         .exit = tcp4_proc_exit_net,
2793 };
2794
2795 int __init tcp4_proc_init(void)
2796 {
2797         return register_pernet_subsys(&tcp4_net_ops);
2798 }
2799
2800 void tcp4_proc_exit(void)
2801 {
2802         unregister_pernet_subsys(&tcp4_net_ops);
2803 }
2804 #endif /* CONFIG_PROC_FS */
2805
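/* GRO hooks: tcp4_gro_receive() verifies the checksum (or flags the skb for
 * flushing) before handing it to tcp_gro_receive() for coalescing, and
 * tcp4_gro_complete() rebuilds the pseudo-header checksum and marks the
 * merged skb as SKB_GSO_TCPV4 once aggregation is done.
 */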
2806 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2807 {
2808         const struct iphdr *iph = skb_gro_network_header(skb);
2809
2810         switch (skb->ip_summed) {
2811         case CHECKSUM_COMPLETE:
2812                 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2813                                   skb->csum)) {
2814                         skb->ip_summed = CHECKSUM_UNNECESSARY;
2815                         break;
2816                 }
2817
2818                 /* fall through */
2819         case CHECKSUM_NONE:
2820                 NAPI_GRO_CB(skb)->flush = 1;
2821                 return NULL;
2822         }
2823
2824         return tcp_gro_receive(head, skb);
2825 }
2826
2827 int tcp4_gro_complete(struct sk_buff *skb)
2828 {
2829         const struct iphdr *iph = ip_hdr(skb);
2830         struct tcphdr *th = tcp_hdr(skb);
2831
2832         th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2833                                   iph->saddr, iph->daddr, 0);
2834         skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2835
2836         return tcp_gro_complete(skb);
2837 }
2838
2839 struct proto tcp_prot = {
2840         .name                   = "TCP",
2841         .owner                  = THIS_MODULE,
2842         .close                  = tcp_close,
2843         .connect                = tcp_v4_connect,
2844         .disconnect             = tcp_disconnect,
2845         .accept                 = inet_csk_accept,
2846         .ioctl                  = tcp_ioctl,
2847         .init                   = tcp_v4_init_sock,
2848         .destroy                = tcp_v4_destroy_sock,
2849         .shutdown               = tcp_shutdown,
2850         .setsockopt             = tcp_setsockopt,
2851         .getsockopt             = tcp_getsockopt,
2852         .recvmsg                = tcp_recvmsg,
2853         .sendmsg                = tcp_sendmsg,
2854         .sendpage               = tcp_sendpage,
2855         .backlog_rcv            = tcp_v4_do_rcv,
2856         .release_cb             = tcp_release_cb,
2857         .mtu_reduced            = tcp_v4_mtu_reduced,
2858         .hash                   = inet_hash,
2859         .unhash                 = inet_unhash,
2860         .get_port               = inet_csk_get_port,
2861         .enter_memory_pressure  = tcp_enter_memory_pressure,
2862         .sockets_allocated      = &tcp_sockets_allocated,
2863         .orphan_count           = &tcp_orphan_count,
2864         .memory_allocated       = &tcp_memory_allocated,
2865         .memory_pressure        = &tcp_memory_pressure,
2866         .sysctl_wmem            = sysctl_tcp_wmem,
2867         .sysctl_rmem            = sysctl_tcp_rmem,
2868         .max_header             = MAX_TCP_HEADER,
2869         .obj_size               = sizeof(struct tcp_sock),
2870         .slab_flags             = SLAB_DESTROY_BY_RCU,
2871         .twsk_prot              = &tcp_timewait_sock_ops,
2872         .rsk_prot               = &tcp_request_sock_ops,
2873         .h.hashinfo             = &tcp_hashinfo,
2874         .no_autobind            = true,
2875 #ifdef CONFIG_COMPAT
2876         .compat_setsockopt      = compat_tcp_setsockopt,
2877         .compat_getsockopt      = compat_tcp_getsockopt,
2878 #endif
2879 #ifdef CONFIG_MEMCG_KMEM
2880         .init_cgroup            = tcp_init_cgroup,
2881         .destroy_cgroup         = tcp_destroy_cgroup,
2882         .proto_cgroup           = tcp_proto_cgroup,
2883 #endif
2884 };
2885 EXPORT_SYMBOL(tcp_prot);
2886
2887 static int __net_init tcp_sk_init(struct net *net)
2888 {
2889         return 0;
2890 }
2891
2892 static void __net_exit tcp_sk_exit(struct net *net)
2893 {
2894 }
2895
2896 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2897 {
2898         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2899 }
2900
2901 static struct pernet_operations __net_initdata tcp_sk_ops = {
2902        .init       = tcp_sk_init,
2903        .exit       = tcp_sk_exit,
2904        .exit_batch = tcp_sk_exit_batch,
2905 };
2906
2907 void __init tcp_v4_init(void)
2908 {
2909         inet_hashinfo_init(&tcp_hashinfo);
2910         if (register_pernet_subsys(&tcp_sk_ops))
2911                 panic("Failed to create the TCP control socket.\n");
2912 }