1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Add support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
44  *      Vitaly E. Lavrov        :       Transparent proxy revived after a
45  *                                      year-long coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
49  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53 #define pr_fmt(fmt) "TCP: " fmt
54
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83
84 #include <crypto/hash.h>
85 #include <linux/scatterlist.h>
86
87 int sysctl_tcp_tw_reuse __read_mostly;
88 int sysctl_tcp_low_latency __read_mostly;
89 EXPORT_SYMBOL(sysctl_tcp_low_latency);
90
91 #ifdef CONFIG_TCP_MD5SIG
92 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
93                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
94 #endif
95
96 struct inet_hashinfo tcp_hashinfo;
97 EXPORT_SYMBOL(tcp_hashinfo);
98
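/* Derive the initial sequence number for a connection from the incoming
 * segment's addresses and ports (wired up as .init_seq further below).
 */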
99 static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
100 {
101         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
102                                           ip_hdr(skb)->saddr,
103                                           tcp_hdr(skb)->dest,
104                                           tcp_hdr(skb)->source);
105 }
106
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
110         struct tcp_sock *tp = tcp_sk(sk);
111
112         /* With PAWS, it is safe from the viewpoint
113            of data integrity. Even without PAWS it is safe provided sequence
114            spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
115
116            Actually, the idea is close to VJ's: only the timestamp cache is
117            held not per host but per port pair, and the TW bucket is used as
118            the state holder.
119
120            If the TW bucket has already been destroyed we fall back to VJ's
121            scheme and use the initial timestamp retrieved from the peer table.
122          */
123         if (tcptw->tw_ts_recent_stamp &&
124             (!twp || (sysctl_tcp_tw_reuse &&
125                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
126                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
127                 if (tp->write_seq == 0)
128                         tp->write_seq = 1;
129                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
130                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
131                 sock_hold(sktw);
132                 return 1;
133         }
134
135         return 0;
136 }
137 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
138
139 /* This will initiate an outgoing connection. */
140 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
141 {
142         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
143         struct inet_sock *inet = inet_sk(sk);
144         struct tcp_sock *tp = tcp_sk(sk);
145         __be16 orig_sport, orig_dport;
146         __be32 daddr, nexthop;
147         struct flowi4 *fl4;
148         struct rtable *rt;
149         int err;
150         struct ip_options_rcu *inet_opt;
151
152         if (addr_len < sizeof(struct sockaddr_in))
153                 return -EINVAL;
154
155         if (usin->sin_family != AF_INET)
156                 return -EAFNOSUPPORT;
157
158         nexthop = daddr = usin->sin_addr.s_addr;
159         inet_opt = rcu_dereference_protected(inet->inet_opt,
160                                              sock_owned_by_user(sk));
161         if (inet_opt && inet_opt->opt.srr) {
162                 if (!daddr)
163                         return -EINVAL;
164                 nexthop = inet_opt->opt.faddr;
165         }
166
167         orig_sport = inet->inet_sport;
168         orig_dport = usin->sin_port;
169         fl4 = &inet->cork.fl.u.ip4;
170         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
171                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
172                               IPPROTO_TCP,
173                               orig_sport, orig_dport, sk);
174         if (IS_ERR(rt)) {
175                 err = PTR_ERR(rt);
176                 if (err == -ENETUNREACH)
177                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
178                 return err;
179         }
180
181         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
182                 ip_rt_put(rt);
183                 return -ENETUNREACH;
184         }
185
186         if (!inet_opt || !inet_opt->opt.srr)
187                 daddr = fl4->daddr;
188
189         if (!inet->inet_saddr)
190                 inet->inet_saddr = fl4->saddr;
191         sk_rcv_saddr_set(sk, inet->inet_saddr);
192
193         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
194                 /* Reset inherited state */
195                 tp->rx_opt.ts_recent       = 0;
196                 tp->rx_opt.ts_recent_stamp = 0;
197                 if (likely(!tp->repair))
198                         tp->write_seq      = 0;
199         }
200
201         if (tcp_death_row.sysctl_tw_recycle &&
202             !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
203                 tcp_fetch_timewait_stamp(sk, &rt->dst);
204
205         inet->inet_dport = usin->sin_port;
206         sk_daddr_set(sk, daddr);
207
208         inet_csk(sk)->icsk_ext_hdr_len = 0;
209         if (inet_opt)
210                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
211
212         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
213
214         /* Socket identity is still unknown (sport may be zero).
215          * However, we set the state to SYN-SENT and, without releasing the
216          * socket lock, select a source port, enter ourselves into the hash
217          * tables and complete initialization after this.
218          */
219         tcp_set_state(sk, TCP_SYN_SENT);
220         err = inet_hash_connect(&tcp_death_row, sk);
221         if (err)
222                 goto failure;
223
224         sk_set_txhash(sk);
225
226         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
227                                inet->inet_sport, inet->inet_dport, sk);
228         if (IS_ERR(rt)) {
229                 err = PTR_ERR(rt);
230                 rt = NULL;
231                 goto failure;
232         }
233         /* OK, now commit destination to socket.  */
234         sk->sk_gso_type = SKB_GSO_TCPV4;
235         sk_setup_caps(sk, &rt->dst);
236
237         if (!tp->write_seq && likely(!tp->repair))
238                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
239                                                            inet->inet_daddr,
240                                                            inet->inet_sport,
241                                                            usin->sin_port);
242
243         inet->inet_id = tp->write_seq ^ jiffies;
244
245         err = tcp_connect(sk);
246
247         rt = NULL;
248         if (err)
249                 goto failure;
250
251         return 0;
252
253 failure:
254         /*
255          * This unhashes the socket and releases the local port,
256          * if necessary.
257          */
258         tcp_set_state(sk, TCP_CLOSE);
259         ip_rt_put(rt);
260         sk->sk_route_caps = 0;
261         inet->inet_dport = 0;
262         return err;
263 }
264 EXPORT_SYMBOL(tcp_v4_connect);
265
266 /*
267  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
268  * It can be called through tcp_release_cb() if socket was owned by user
269  * at the time tcp_v4_err() was called to handle ICMP message.
270  */
271 void tcp_v4_mtu_reduced(struct sock *sk)
272 {
273         struct dst_entry *dst;
274         struct inet_sock *inet = inet_sk(sk);
275         u32 mtu = tcp_sk(sk)->mtu_info;
276
277         dst = inet_csk_update_pmtu(sk, mtu);
278         if (!dst)
279                 return;
280
281         /* Something is about to go wrong... Remember the soft error
282          * in case this connection is not able to recover.
283          */
284         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
285                 sk->sk_err_soft = EMSGSIZE;
286
287         mtu = dst_mtu(dst);
288
289         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
290             ip_sk_accept_pmtu(sk) &&
291             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
292                 tcp_sync_mss(sk, mtu);
293
294                 /* Resend the TCP packet because it's
295                  * clear that the old packet has been
296                  * dropped. This is the new "fast" path mtu
297                  * discovery.
298                  */
299                 tcp_simple_retransmit(sk);
300         } /* else let the usual retransmit timer handle it */
301 }
302 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
303
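/* Apply an ICMP redirect to the socket's cached route, if one is still
 * attached and valid.
 */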
304 static void do_redirect(struct sk_buff *skb, struct sock *sk)
305 {
306         struct dst_entry *dst = __sk_dst_check(sk, 0);
307
308         if (dst)
309                 dst->ops->redirect(dst, sk, skb);
310 }
311
312
313 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
314 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
315 {
316         struct request_sock *req = inet_reqsk(sk);
317         struct net *net = sock_net(sk);
318
319         /* ICMPs are not backlogged, hence we cannot get
320          * an established socket here.
321          */
322         WARN_ON(req->sk);
323
324         if (seq != tcp_rsk(req)->snt_isn) {
325                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
326         } else if (abort) {
327                 /*
328                  * Still in SYN_RECV, just remove it silently.
329                  * There is no good way to pass the error to the newly
330                  * created socket, and POSIX does not want network
331                  * errors returned from accept().
332                  */
333                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
334                 NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
335         }
336         reqsk_put(req);
337 }
338 EXPORT_SYMBOL(tcp_req_err);
339
340 /*
341  * This routine is called by the ICMP module when it gets some
342  * sort of error condition.  If err < 0 then the socket should
343  * be closed and the error returned to the user.  If err > 0
344  * it's just the icmp type << 8 | icmp code.  After adjustment
345  * header points to the first 8 bytes of the tcp header.  We need
346  * to find the appropriate port.
347  *
348  * The locking strategy used here is very "optimistic". When
349  * someone else accesses the socket the ICMP is just dropped
350  * and for some paths there is no check at all.
351  * A more general error queue to queue errors for later handling
352  * is probably better.
353  *
354  */
355
356 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
357 {
358         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
359         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
360         struct inet_connection_sock *icsk;
361         struct tcp_sock *tp;
362         struct inet_sock *inet;
363         const int type = icmp_hdr(icmp_skb)->type;
364         const int code = icmp_hdr(icmp_skb)->code;
365         struct sock *sk;
366         struct sk_buff *skb;
367         struct request_sock *fastopen;
368         __u32 seq, snd_una;
369         __u32 remaining;
370         int err;
371         struct net *net = dev_net(icmp_skb->dev);
372
373         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
374                                        th->dest, iph->saddr, ntohs(th->source),
375                                        inet_iif(icmp_skb));
376         if (!sk) {
377                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
378                 return;
379         }
380         if (sk->sk_state == TCP_TIME_WAIT) {
381                 inet_twsk_put(inet_twsk(sk));
382                 return;
383         }
384         seq = ntohl(th->seq);
385         if (sk->sk_state == TCP_NEW_SYN_RECV)
386                 return tcp_req_err(sk, seq,
387                                   type == ICMP_PARAMETERPROB ||
388                                   type == ICMP_TIME_EXCEEDED ||
389                                   (type == ICMP_DEST_UNREACH &&
390                                    (code == ICMP_NET_UNREACH ||
391                                     code == ICMP_HOST_UNREACH)));
392
393         bh_lock_sock(sk);
394         /* If too many ICMPs get dropped on busy
395          * servers this needs to be solved differently.
396          * We do take care of the PMTU discovery (RFC 1191) special case:
397          * we can receive locally generated ICMP messages while the socket is held.
398          */
399         if (sock_owned_by_user(sk)) {
400                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
401                         NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
402         }
403         if (sk->sk_state == TCP_CLOSE)
404                 goto out;
405
406         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
407                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
408                 goto out;
409         }
410
411         icsk = inet_csk(sk);
412         tp = tcp_sk(sk);
413         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
414         fastopen = tp->fastopen_rsk;
415         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
416         if (sk->sk_state != TCP_LISTEN &&
417             !between(seq, snd_una, tp->snd_nxt)) {
418                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
419                 goto out;
420         }
421
422         switch (type) {
423         case ICMP_REDIRECT:
424                 do_redirect(icmp_skb, sk);
425                 goto out;
426         case ICMP_SOURCE_QUENCH:
427                 /* Just silently ignore these. */
428                 goto out;
429         case ICMP_PARAMETERPROB:
430                 err = EPROTO;
431                 break;
432         case ICMP_DEST_UNREACH:
433                 if (code > NR_ICMP_UNREACH)
434                         goto out;
435
436                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
437                         /* We are not interested in TCP_LISTEN and open_requests
438                          * (SYN-ACKs sent out by Linux are always < 576 bytes, so
439                          * they should go through unfragmented).
440                          */
441                         if (sk->sk_state == TCP_LISTEN)
442                                 goto out;
443
444                         tp->mtu_info = info;
445                         if (!sock_owned_by_user(sk)) {
446                                 tcp_v4_mtu_reduced(sk);
447                         } else {
448                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
449                                         sock_hold(sk);
450                         }
451                         goto out;
452                 }
453
454                 err = icmp_err_convert[code].errno;
455                 /* check if icmp_skb allows revert of backoff
456                  * (see draft-zimmermann-tcp-lcd) */
457                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
458                         break;
459                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
460                     !icsk->icsk_backoff || fastopen)
461                         break;
462
463                 if (sock_owned_by_user(sk))
464                         break;
465
466                 icsk->icsk_backoff--;
467                 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
468                                                TCP_TIMEOUT_INIT;
469                 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
470
471                 skb = tcp_write_queue_head(sk);
472                 BUG_ON(!skb);
473
474                 remaining = icsk->icsk_rto -
475                             min(icsk->icsk_rto,
476                                 tcp_time_stamp - tcp_skb_timestamp(skb));
477
478                 if (remaining) {
479                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
480                                                   remaining, TCP_RTO_MAX);
481                 } else {
482                         /* RTO revert clocked out retransmission.
483                          * Will retransmit now */
484                         tcp_retransmit_timer(sk);
485                 }
486
487                 break;
488         case ICMP_TIME_EXCEEDED:
489                 err = EHOSTUNREACH;
490                 break;
491         default:
492                 goto out;
493         }
494
495         switch (sk->sk_state) {
496         case TCP_SYN_SENT:
497         case TCP_SYN_RECV:
498                 /* Only in fast or simultaneous open. If a fast open socket is
499                  * already accepted it is treated as a connected one below.
500                  */
501                 if (fastopen && !fastopen->sk)
502                         break;
503
504                 if (!sock_owned_by_user(sk)) {
505                         sk->sk_err = err;
506
507                         sk->sk_error_report(sk);
508
509                         tcp_done(sk);
510                 } else {
511                         sk->sk_err_soft = err;
512                 }
513                 goto out;
514         }
515
516         /* If we've already connected we will keep trying
517          * until we time out, or the user gives up.
518          *
519          * RFC 1122 4.2.3.9 allows us to treat as hard errors
520          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
521          * but it is obsoleted by pmtu discovery).
522          *
523          * Note that in the modern internet, where routing is unreliable
524          * and broken firewalls sit in every dark corner, sending random
525          * errors ordered by their masters, even these two messages finally
526          * lose their original sense (even Linux sends invalid PORT_UNREACHs).
527          *
528          * Now we are in compliance with RFCs.
529          *                                                      --ANK (980905)
530          */
531
532         inet = inet_sk(sk);
533         if (!sock_owned_by_user(sk) && inet->recverr) {
534                 sk->sk_err = err;
535                 sk->sk_error_report(sk);
536         } else  { /* Only an error on timeout */
537                 sk->sk_err_soft = err;
538         }
539
540 out:
541         bh_unlock_sock(sk);
542         sock_put(sk);
543 }
544
545 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
546 {
547         struct tcphdr *th = tcp_hdr(skb);
548
549         if (skb->ip_summed == CHECKSUM_PARTIAL) {
550                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
551                 skb->csum_start = skb_transport_header(skb) - skb->head;
552                 skb->csum_offset = offsetof(struct tcphdr, check);
553         } else {
554                 th->check = tcp_v4_check(skb->len, saddr, daddr,
555                                          csum_partial(th,
556                                                       th->doff << 2,
557                                                       skb->csum));
558         }
559 }
560
561 /* This routine computes an IPv4 TCP checksum. */
562 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
563 {
564         const struct inet_sock *inet = inet_sk(sk);
565
566         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
567 }
568 EXPORT_SYMBOL(tcp_v4_send_check);
569
570 /*
571  *      This routine will send an RST to the other tcp.
572  *
573  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
574  *                    for the reset?
575  *      Answer: if a packet caused the RST, it is not for a socket
576  *              existing in our system; if it is matched to a socket,
577  *              it is just a duplicate segment or a bug in the other side's TCP.
578  *              So we build the reply based only on the parameters that
579  *              arrived with the segment.
580  *      Exception: precedence violation. We do not implement it in any case.
581  */
582
583 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
584 {
585         const struct tcphdr *th = tcp_hdr(skb);
586         struct {
587                 struct tcphdr th;
588 #ifdef CONFIG_TCP_MD5SIG
589                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
590 #endif
591         } rep;
592         struct ip_reply_arg arg;
593 #ifdef CONFIG_TCP_MD5SIG
594         struct tcp_md5sig_key *key = NULL;
595         const __u8 *hash_location = NULL;
596         unsigned char newhash[16];
597         int genhash;
598         struct sock *sk1 = NULL;
599 #endif
600         struct net *net;
601
602         /* Never send a reset in response to a reset. */
603         if (th->rst)
604                 return;
605
606         /* If sk is not NULL, it means we did a successful lookup and the
607          * incoming route had to be correct. prequeue might have dropped our dst.
608          */
609         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
610                 return;
611
612         /* Swap the send and the receive. */
613         memset(&rep, 0, sizeof(rep));
614         rep.th.dest   = th->source;
615         rep.th.source = th->dest;
616         rep.th.doff   = sizeof(struct tcphdr) / 4;
617         rep.th.rst    = 1;
618
619         if (th->ack) {
620                 rep.th.seq = th->ack_seq;
621         } else {
622                 rep.th.ack = 1;
623                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
624                                        skb->len - (th->doff << 2));
625         }
626
627         memset(&arg, 0, sizeof(arg));
628         arg.iov[0].iov_base = (unsigned char *)&rep;
629         arg.iov[0].iov_len  = sizeof(rep.th);
630
631         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
632 #ifdef CONFIG_TCP_MD5SIG
633         hash_location = tcp_parse_md5sig_option(th);
634         if (sk && sk_fullsock(sk)) {
635                 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
636                                         &ip_hdr(skb)->saddr, AF_INET);
637         } else if (hash_location) {
638                 /*
639                  * active side is lost. Try to find listening socket through
640                  * source port, and then find md5 key through listening socket.
641                  * We do not lose security here:
642                  * the incoming packet is checked against the md5 hash of the key
643                  * we find, and no RST is generated if the md5 hash doesn't match.
644                  */
645                 sk1 = __inet_lookup_listener(net,
646                                              &tcp_hashinfo, ip_hdr(skb)->saddr,
647                                              th->source, ip_hdr(skb)->daddr,
648                                              ntohs(th->source), inet_iif(skb));
649                 /* don't send rst if it can't find key */
650                 if (!sk1)
651                         return;
652                 rcu_read_lock();
653                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
654                                         &ip_hdr(skb)->saddr, AF_INET);
655                 if (!key)
656                         goto release_sk1;
657
658                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
659                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
660                         goto release_sk1;
661         }
662
663         if (key) {
664                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
665                                    (TCPOPT_NOP << 16) |
666                                    (TCPOPT_MD5SIG << 8) |
667                                    TCPOLEN_MD5SIG);
668                 /* Update length and the length the header thinks exists */
669                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
670                 rep.th.doff = arg.iov[0].iov_len / 4;
671
672                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
673                                      key, ip_hdr(skb)->saddr,
674                                      ip_hdr(skb)->daddr, &rep.th);
675         }
676 #endif
677         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
678                                       ip_hdr(skb)->saddr, /* XXX */
679                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
680         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
681         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
682
683         /* When the socket is gone, all binding information is lost and
684          * routing might fail in this case. No choice here: if we choose to force
685          * the input interface, we will misroute in case of an asymmetric route.
686          */
687         if (sk)
688                 arg.bound_dev_if = sk->sk_bound_dev_if;
689
690         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
691                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
692
693         arg.tos = ip_hdr(skb)->tos;
694         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
695                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
696                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
697                               &arg, arg.iov[0].iov_len);
698
699         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
700         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
701
702 #ifdef CONFIG_TCP_MD5SIG
703 release_sk1:
704         if (sk1) {
705                 rcu_read_unlock();
706                 sock_put(sk1);
707         }
708 #endif
709 }
710
711 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
712    outside of socket context, is certainly ugly. What can I do?
713  */
714
715 static void tcp_v4_send_ack(struct net *net,
716                             struct sk_buff *skb, u32 seq, u32 ack,
717                             u32 win, u32 tsval, u32 tsecr, int oif,
718                             struct tcp_md5sig_key *key,
719                             int reply_flags, u8 tos)
720 {
721         const struct tcphdr *th = tcp_hdr(skb);
722         struct {
723                 struct tcphdr th;
724                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
725 #ifdef CONFIG_TCP_MD5SIG
726                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
727 #endif
728                         ];
729         } rep;
730         struct ip_reply_arg arg;
731
732         memset(&rep.th, 0, sizeof(struct tcphdr));
733         memset(&arg, 0, sizeof(arg));
734
735         arg.iov[0].iov_base = (unsigned char *)&rep;
736         arg.iov[0].iov_len  = sizeof(rep.th);
737         if (tsecr) {
738                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
739                                    (TCPOPT_TIMESTAMP << 8) |
740                                    TCPOLEN_TIMESTAMP);
741                 rep.opt[1] = htonl(tsval);
742                 rep.opt[2] = htonl(tsecr);
743                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
744         }
745
746         /* Swap the send and the receive. */
747         rep.th.dest    = th->source;
748         rep.th.source  = th->dest;
749         rep.th.doff    = arg.iov[0].iov_len / 4;
750         rep.th.seq     = htonl(seq);
751         rep.th.ack_seq = htonl(ack);
752         rep.th.ack     = 1;
753         rep.th.window  = htons(win);
754
755 #ifdef CONFIG_TCP_MD5SIG
756         if (key) {
757                 int offset = (tsecr) ? 3 : 0;
758
759                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
760                                           (TCPOPT_NOP << 16) |
761                                           (TCPOPT_MD5SIG << 8) |
762                                           TCPOLEN_MD5SIG);
763                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
764                 rep.th.doff = arg.iov[0].iov_len/4;
765
766                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
767                                     key, ip_hdr(skb)->saddr,
768                                     ip_hdr(skb)->daddr, &rep.th);
769         }
770 #endif
771         arg.flags = reply_flags;
772         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
773                                       ip_hdr(skb)->saddr, /* XXX */
774                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
775         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
776         if (oif)
777                 arg.bound_dev_if = oif;
778         arg.tos = tos;
779         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
780                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
781                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
782                               &arg, arg.iov[0].iov_len);
783
784         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
785 }
786
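/* Answer a segment received for a TIME_WAIT socket: send an ACK built from
 * the sequence numbers, window and timestamps saved in the timewait bucket,
 * then release the timewait socket reference.
 */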
787 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
788 {
789         struct inet_timewait_sock *tw = inet_twsk(sk);
790         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
791
792         tcp_v4_send_ack(sock_net(sk), skb,
793                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
794                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
795                         tcp_time_stamp + tcptw->tw_ts_offset,
796                         tcptw->tw_ts_recent,
797                         tw->tw_bound_dev_if,
798                         tcp_twsk_md5_key(tcptw),
799                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
800                         tw->tw_tos
801                         );
802
803         inet_twsk_put(tw);
804 }
805
806 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
807                                   struct request_sock *req)
808 {
809         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
810          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
811          */
812         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
813                                              tcp_sk(sk)->snd_nxt;
814
815         tcp_v4_send_ack(sock_net(sk), skb, seq,
816                         tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
817                         tcp_time_stamp,
818                         req->ts_recent,
819                         0,
820                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
821                                           AF_INET),
822                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
823                         ip_hdr(skb)->tos);
824 }
825
826 /*
827  *      Send a SYN-ACK after having received a SYN.
828  *      This still operates on a request_sock only, not on a big
829  *      socket.
830  */
831 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
832                               struct flowi *fl,
833                               struct request_sock *req,
834                               struct tcp_fastopen_cookie *foc,
835                                   bool attach_req)
836 {
837         const struct inet_request_sock *ireq = inet_rsk(req);
838         struct flowi4 fl4;
839         int err = -1;
840         struct sk_buff *skb;
841
842         /* First, grab a route. */
843         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
844                 return -1;
845
846         skb = tcp_make_synack(sk, dst, req, foc, attach_req);
847
848         if (skb) {
849                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
850
851                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
852                                             ireq->ir_rmt_addr,
853                                             ireq->opt);
854                 err = net_xmit_eval(err);
855         }
856
857         return err;
858 }
859
860 /*
861  *      IPv4 request_sock destructor.
862  */
863 static void tcp_v4_reqsk_destructor(struct request_sock *req)
864 {
865         kfree(inet_rsk(req)->opt);
866 }
867
868 #ifdef CONFIG_TCP_MD5SIG
869 /*
870  * RFC2385 MD5 checksumming requires a mapping of
871  * IP address->MD5 Key.
872  * We need to maintain these in the sk structure.
873  */
874
875 /* Find the Key structure for an address.  */
876 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
877                                          const union tcp_md5_addr *addr,
878                                          int family)
879 {
880         const struct tcp_sock *tp = tcp_sk(sk);
881         struct tcp_md5sig_key *key;
882         unsigned int size = sizeof(struct in_addr);
883         const struct tcp_md5sig_info *md5sig;
884
885         /* caller either holds rcu_read_lock() or socket lock */
886         md5sig = rcu_dereference_check(tp->md5sig_info,
887                                        sock_owned_by_user(sk) ||
888                                        lockdep_is_held((spinlock_t *)&sk->sk_lock.slock));
889         if (!md5sig)
890                 return NULL;
891 #if IS_ENABLED(CONFIG_IPV6)
892         if (family == AF_INET6)
893                 size = sizeof(struct in6_addr);
894 #endif
895         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
896                 if (key->family != family)
897                         continue;
898                 if (!memcmp(&key->addr, addr, size))
899                         return key;
900         }
901         return NULL;
902 }
903 EXPORT_SYMBOL(tcp_md5_do_lookup);
904
905 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
906                                          const struct sock *addr_sk)
907 {
908         const union tcp_md5_addr *addr;
909
910         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
911         return tcp_md5_do_lookup(sk, addr, AF_INET);
912 }
913 EXPORT_SYMBOL(tcp_v4_md5_lookup);
914
915 /* This can be called on a newly created socket, from other files */
916 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
917                    int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
918 {
919         /* Add Key to the list */
920         struct tcp_md5sig_key *key;
921         struct tcp_sock *tp = tcp_sk(sk);
922         struct tcp_md5sig_info *md5sig;
923
924         key = tcp_md5_do_lookup(sk, addr, family);
925         if (key) {
926                 /* Pre-existing entry - just update that one. */
927                 memcpy(key->key, newkey, newkeylen);
928                 key->keylen = newkeylen;
929                 return 0;
930         }
931
932         md5sig = rcu_dereference_protected(tp->md5sig_info,
933                                            sock_owned_by_user(sk) ||
934                                            lockdep_is_held(&sk->sk_lock.slock));
935         if (!md5sig) {
936                 md5sig = kmalloc(sizeof(*md5sig), gfp);
937                 if (!md5sig)
938                         return -ENOMEM;
939
940                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
941                 INIT_HLIST_HEAD(&md5sig->head);
942                 rcu_assign_pointer(tp->md5sig_info, md5sig);
943         }
944
945         key = sock_kmalloc(sk, sizeof(*key), gfp);
946         if (!key)
947                 return -ENOMEM;
948         if (!tcp_alloc_md5sig_pool()) {
949                 sock_kfree_s(sk, key, sizeof(*key));
950                 return -ENOMEM;
951         }
952
953         memcpy(key->key, newkey, newkeylen);
954         key->keylen = newkeylen;
955         key->family = family;
956         memcpy(&key->addr, addr,
957                (family == AF_INET6) ? sizeof(struct in6_addr) :
958                                       sizeof(struct in_addr));
959         hlist_add_head_rcu(&key->node, &md5sig->head);
960         return 0;
961 }
962 EXPORT_SYMBOL(tcp_md5_do_add);
963
964 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
965 {
966         struct tcp_md5sig_key *key;
967
968         key = tcp_md5_do_lookup(sk, addr, family);
969         if (!key)
970                 return -ENOENT;
971         hlist_del_rcu(&key->node);
972         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
973         kfree_rcu(key, rcu);
974         return 0;
975 }
976 EXPORT_SYMBOL(tcp_md5_do_del);
977
978 static void tcp_clear_md5_list(struct sock *sk)
979 {
980         struct tcp_sock *tp = tcp_sk(sk);
981         struct tcp_md5sig_key *key;
982         struct hlist_node *n;
983         struct tcp_md5sig_info *md5sig;
984
985         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
986
987         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
988                 hlist_del_rcu(&key->node);
989                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
990                 kfree_rcu(key, rcu);
991         }
992 }
993
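/* Handle the TCP_MD5SIG socket option: copy the request from userspace and
 * add, replace or delete the MD5 key for the given IPv4 peer address.
 */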
994 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
995                                  int optlen)
996 {
997         struct tcp_md5sig cmd;
998         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
999
1000         if (optlen < sizeof(cmd))
1001                 return -EINVAL;
1002
1003         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1004                 return -EFAULT;
1005
1006         if (sin->sin_family != AF_INET)
1007                 return -EINVAL;
1008
1009         if (!cmd.tcpm_keylen)
1010                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1011                                       AF_INET);
1012
1013         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1014                 return -EINVAL;
1015
1016         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1017                               AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1018                               GFP_KERNEL);
1019 }
1020
1021 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1022                                         __be32 daddr, __be32 saddr, int nbytes)
1023 {
1024         struct tcp4_pseudohdr *bp;
1025         struct scatterlist sg;
1026
1027         bp = &hp->md5_blk.ip4;
1028
1029         /*
1030          * 1. the TCP pseudo-header (in the order: source IP address,
1031          * destination IP address, zero-padded protocol number, and
1032          * segment length)
1033          */
1034         bp->saddr = saddr;
1035         bp->daddr = daddr;
1036         bp->pad = 0;
1037         bp->protocol = IPPROTO_TCP;
1038         bp->len = cpu_to_be16(nbytes);
1039
1040         sg_init_one(&sg, bp, sizeof(*bp));
1041         ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(*bp));
1042         return crypto_ahash_update(hp->md5_req);
1043 }
1044
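/* Compute the MD5 signature over the pseudo-header, TCP header and key only
 * (no payload); used for the RST/ACK replies built in this file.
 */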
1045 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1046                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1047 {
1048         struct tcp_md5sig_pool *hp;
1049         struct ahash_request *req;
1050
1051         hp = tcp_get_md5sig_pool();
1052         if (!hp)
1053                 goto clear_hash_noput;
1054         req = hp->md5_req;
1055
1056         if (crypto_ahash_init(req))
1057                 goto clear_hash;
1058         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1059                 goto clear_hash;
1060         if (tcp_md5_hash_header(hp, th))
1061                 goto clear_hash;
1062         if (tcp_md5_hash_key(hp, key))
1063                 goto clear_hash;
1064         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1065         if (crypto_ahash_final(req))
1066                 goto clear_hash;
1067
1068         tcp_put_md5sig_pool();
1069         return 0;
1070
1071 clear_hash:
1072         tcp_put_md5sig_pool();
1073 clear_hash_noput:
1074         memset(md5_hash, 0, 16);
1075         return 1;
1076 }
1077
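/* Compute the MD5 signature for a full segment: pseudo-header, TCP header,
 * payload and key. Addresses come from the socket when one is available,
 * otherwise from the IP header of the skb.
 */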
1078 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1079                         const struct sock *sk,
1080                         const struct sk_buff *skb)
1081 {
1082         struct tcp_md5sig_pool *hp;
1083         struct ahash_request *req;
1084         const struct tcphdr *th = tcp_hdr(skb);
1085         __be32 saddr, daddr;
1086
1087         if (sk) { /* valid for establish/request sockets */
1088                 saddr = sk->sk_rcv_saddr;
1089                 daddr = sk->sk_daddr;
1090         } else {
1091                 const struct iphdr *iph = ip_hdr(skb);
1092                 saddr = iph->saddr;
1093                 daddr = iph->daddr;
1094         }
1095
1096         hp = tcp_get_md5sig_pool();
1097         if (!hp)
1098                 goto clear_hash_noput;
1099         req = hp->md5_req;
1100
1101         if (crypto_ahash_init(req))
1102                 goto clear_hash;
1103
1104         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1105                 goto clear_hash;
1106         if (tcp_md5_hash_header(hp, th))
1107                 goto clear_hash;
1108         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1109                 goto clear_hash;
1110         if (tcp_md5_hash_key(hp, key))
1111                 goto clear_hash;
1112         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1113         if (crypto_ahash_final(req))
1114                 goto clear_hash;
1115
1116         tcp_put_md5sig_pool();
1117         return 0;
1118
1119 clear_hash:
1120         tcp_put_md5sig_pool();
1121 clear_hash_noput:
1122         memset(md5_hash, 0, 16);
1123         return 1;
1124 }
1125 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1126
1127 #endif
1128
1129 /* Called with rcu_read_lock() */
1130 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1131                                     const struct sk_buff *skb)
1132 {
1133 #ifdef CONFIG_TCP_MD5SIG
1134         /*
1135          * This gets called for each TCP segment that arrives
1136          * so we want to be efficient.
1137          * We have 3 drop cases:
1138          * o No MD5 hash and one expected.
1139          * o MD5 hash and we're not expecting one.
1140          * o MD5 hash and it's wrong.
1141          */
1142         const __u8 *hash_location = NULL;
1143         struct tcp_md5sig_key *hash_expected;
1144         const struct iphdr *iph = ip_hdr(skb);
1145         const struct tcphdr *th = tcp_hdr(skb);
1146         int genhash;
1147         unsigned char newhash[16];
1148
1149         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1150                                           AF_INET);
1151         hash_location = tcp_parse_md5sig_option(th);
1152
1153         /* We've parsed the options - do we have a hash? */
1154         if (!hash_expected && !hash_location)
1155                 return false;
1156
1157         if (hash_expected && !hash_location) {
1158                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1159                 return true;
1160         }
1161
1162         if (!hash_expected && hash_location) {
1163                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1164                 return true;
1165         }
1166
1167         /* Okay, so this is hash_expected and hash_location -
1168          * so we need to calculate the checksum.
1169          */
1170         genhash = tcp_v4_md5_hash_skb(newhash,
1171                                       hash_expected,
1172                                       NULL, skb);
1173
1174         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1175                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1176                                      &iph->saddr, ntohs(th->source),
1177                                      &iph->daddr, ntohs(th->dest),
1178                                      genhash ? " tcp_v4_calc_md5_hash failed"
1179                                      : "");
1180                 return true;
1181         }
1182         return false;
1183 #endif
1184         return false;
1185 }
1186
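/* Fill in the IPv4-specific fields of a new request sock from the incoming
 * SYN: addresses, the transparent-proxy flag and any saved IP options.
 */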
1187 static void tcp_v4_init_req(struct request_sock *req,
1188                             const struct sock *sk_listener,
1189                             struct sk_buff *skb)
1190 {
1191         struct inet_request_sock *ireq = inet_rsk(req);
1192
1193         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1194         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1195         ireq->no_srccheck = inet_sk(sk_listener)->transparent;
1196         ireq->opt = tcp_v4_save_options(skb);
1197 }
1198
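/* Route the reply for a request sock. When the caller asks for a strict
 * check, report whether the routed destination matches the request's
 * remote address.
 */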
1199 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1200                                           struct flowi *fl,
1201                                           const struct request_sock *req,
1202                                           bool *strict)
1203 {
1204         struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1205
1206         if (strict) {
1207                 if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1208                         *strict = true;
1209                 else
1210                         *strict = false;
1211         }
1212
1213         return dst;
1214 }
1215
1216 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1217         .family         =       PF_INET,
1218         .obj_size       =       sizeof(struct tcp_request_sock),
1219         .rtx_syn_ack    =       tcp_rtx_synack,
1220         .send_ack       =       tcp_v4_reqsk_send_ack,
1221         .destructor     =       tcp_v4_reqsk_destructor,
1222         .send_reset     =       tcp_v4_send_reset,
1223         .syn_ack_timeout =      tcp_syn_ack_timeout,
1224 };
1225
1226 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1227         .mss_clamp      =       TCP_MSS_DEFAULT,
1228 #ifdef CONFIG_TCP_MD5SIG
1229         .req_md5_lookup =       tcp_v4_md5_lookup,
1230         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1231 #endif
1232         .init_req       =       tcp_v4_init_req,
1233 #ifdef CONFIG_SYN_COOKIES
1234         .cookie_init_seq =      cookie_v4_init_sequence,
1235 #endif
1236         .route_req      =       tcp_v4_route_req,
1237         .init_seq       =       tcp_v4_init_sequence,
1238         .send_synack    =       tcp_v4_send_synack,
1239 };
1240
1241 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1242 {
1243         /* Never answer SYNs sent to broadcast or multicast addresses */
1244         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1245                 goto drop;
1246
1247         return tcp_conn_request(&tcp_request_sock_ops,
1248                                 &tcp_request_sock_ipv4_ops, sk, skb);
1249
1250 drop:
1251         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1252         return 0;
1253 }
1254 EXPORT_SYMBOL(tcp_v4_conn_request);
1255
1256
1257 /*
1258  * The three-way handshake has completed - we got a valid ACK -
1259  * now create the new socket.
1260  */
1261 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1262                                   struct request_sock *req,
1263                                   struct dst_entry *dst,
1264                                   struct request_sock *req_unhash,
1265                                   bool *own_req)
1266 {
1267         struct inet_request_sock *ireq;
1268         struct inet_sock *newinet;
1269         struct tcp_sock *newtp;
1270         struct sock *newsk;
1271 #ifdef CONFIG_TCP_MD5SIG
1272         struct tcp_md5sig_key *key;
1273 #endif
1274         struct ip_options_rcu *inet_opt;
1275
1276         if (sk_acceptq_is_full(sk))
1277                 goto exit_overflow;
1278
1279         newsk = tcp_create_openreq_child(sk, req, skb);
1280         if (!newsk)
1281                 goto exit_nonewsk;
1282
1283         newsk->sk_gso_type = SKB_GSO_TCPV4;
1284         inet_sk_rx_dst_set(newsk, skb);
1285
1286         newtp                 = tcp_sk(newsk);
1287         newinet               = inet_sk(newsk);
1288         ireq                  = inet_rsk(req);
1289         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1290         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1291         newsk->sk_bound_dev_if = ireq->ir_iif;
1292         newinet->inet_saddr           = ireq->ir_loc_addr;
1293         inet_opt              = ireq->opt;
1294         rcu_assign_pointer(newinet->inet_opt, inet_opt);
1295         ireq->opt             = NULL;
1296         newinet->mc_index     = inet_iif(skb);
1297         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1298         newinet->rcv_tos      = ip_hdr(skb)->tos;
1299         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1300         if (inet_opt)
1301                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1302         newinet->inet_id = newtp->write_seq ^ jiffies;
1303
1304         if (!dst) {
1305                 dst = inet_csk_route_child_sock(sk, newsk, req);
1306                 if (!dst)
1307                         goto put_and_exit;
1308         } else {
1309                 /* syncookie case : see end of cookie_v4_check() */
1310         }
1311         sk_setup_caps(newsk, dst);
1312
1313         tcp_ca_openreq_child(newsk, dst);
1314
1315         tcp_sync_mss(newsk, dst_mtu(dst));
1316         newtp->advmss = dst_metric_advmss(dst);
1317         if (tcp_sk(sk)->rx_opt.user_mss &&
1318             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1319                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1320
1321         tcp_initialize_rcv_mss(newsk);
1322
1323 #ifdef CONFIG_TCP_MD5SIG
1324         /* Copy over the MD5 key from the original socket */
1325         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1326                                 AF_INET);
1327         if (key) {
1328                 /*
1329                  * We're using one, so create a matching key
1330                  * on the newsk structure. If we fail to get
1331                  * memory, then we end up not copying the key
1332                  * across. Shucks.
1333                  */
1334                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1335                                AF_INET, key->key, key->keylen, GFP_ATOMIC);
1336                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1337         }
1338 #endif
1339
1340         if (__inet_inherit_port(sk, newsk) < 0)
1341                 goto put_and_exit;
1342         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1343         if (*own_req)
1344                 tcp_move_syn(newtp, req);
1345
1346         return newsk;
1347
1348 exit_overflow:
1349         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1350 exit_nonewsk:
1351         dst_release(dst);
1352 exit:
1353         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1354         return NULL;
1355 put_and_exit:
1356         inet_csk_prepare_forced_close(newsk);
1357         tcp_done(newsk);
1358         goto exit;
1359 }
1360 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1361
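/* With syncookies enabled, a non-SYN segment hitting a listener may be the
 * ACK that completes a cookie-validated connection; let cookie_v4_check()
 * either create the child socket or hand back the listener unchanged.
 */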
1362 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1363 {
1364 #ifdef CONFIG_SYN_COOKIES
1365         const struct tcphdr *th = tcp_hdr(skb);
1366
1367         if (!th->syn)
1368                 sk = cookie_v4_check(sk, skb);
1369 #endif
1370         return sk;
1371 }
1372
1373 /* The socket must have its spinlock held when we get
1374  * here, unless it is a TCP_LISTEN socket.
1375  *
1376  * We have a potential double-lock case here, so even when
1377  * doing backlog processing we use the BH locking scheme.
1378  * This is because we cannot sleep with the original spinlock
1379  * held.
1380  */
1381 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1382 {
1383         struct sock *rsk;
1384
1385         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1386                 struct dst_entry *dst = sk->sk_rx_dst;
1387
1388                 sock_rps_save_rxhash(sk, skb);
1389                 sk_mark_napi_id(sk, skb);
1390                 if (dst) {
1391                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1392                             !dst->ops->check(dst, 0)) {
1393                                 dst_release(dst);
1394                                 sk->sk_rx_dst = NULL;
1395                         }
1396                 }
1397                 tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1398                 return 0;
1399         }
1400
1401         if (tcp_checksum_complete(skb))
1402                 goto csum_err;
1403
1404         if (sk->sk_state == TCP_LISTEN) {
1405                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1406
1407                 if (!nsk)
1408                         goto discard;
1409                 if (nsk != sk) {
1410                         sock_rps_save_rxhash(nsk, skb);
1411                         sk_mark_napi_id(nsk, skb);
1412                         if (tcp_child_process(sk, nsk, skb)) {
1413                                 rsk = nsk;
1414                                 goto reset;
1415                         }
1416                         return 0;
1417                 }
1418         } else
1419                 sock_rps_save_rxhash(sk, skb);
1420
1421         if (tcp_rcv_state_process(sk, skb)) {
1422                 rsk = sk;
1423                 goto reset;
1424         }
1425         return 0;
1426
1427 reset:
1428         tcp_v4_send_reset(rsk, skb);
1429 discard:
1430         kfree_skb(skb);
1431         /* Be careful here. If this function gets more complicated and
1432          * gcc suffers from register pressure on the x86, sk (in %ebx)
1433          * might be destroyed here. This current version compiles correctly,
1434          * but you have been warned.
1435          */
1436         return 0;
1437
1438 csum_err:
1439         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
1440         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1441         goto discard;
1442 }
1443 EXPORT_SYMBOL(tcp_v4_do_rcv);
1444
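/* Early demultiplex: look up an established socket from the IP input path,
 * before the routing decision.  On a hit we attach the socket to the skb and,
 * if its cached rx dst is still valid for this interface, reuse it so the
 * normal route lookup can be skipped.
 */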
1445 void tcp_v4_early_demux(struct sk_buff *skb)
1446 {
1447         const struct iphdr *iph;
1448         const struct tcphdr *th;
1449         struct sock *sk;
1450
1451         if (skb->pkt_type != PACKET_HOST)
1452                 return;
1453
1454         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1455                 return;
1456
1457         iph = ip_hdr(skb);
1458         th = tcp_hdr(skb);
1459
1460         if (th->doff < sizeof(struct tcphdr) / 4)
1461                 return;
1462
1463         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1464                                        iph->saddr, th->source,
1465                                        iph->daddr, ntohs(th->dest),
1466                                        skb->skb_iif);
1467         if (sk) {
1468                 skb->sk = sk;
1469                 skb->destructor = sock_edemux;
1470                 if (sk_fullsock(sk)) {
1471                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1472
1473                         if (dst)
1474                                 dst = dst_check(dst, 0);
1475                         if (dst &&
1476                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1477                                 skb_dst_set_noref(skb, dst);
1478                 }
1479         }
1480 }
1481
1482 /* Packet is added to VJ-style prequeue for processing in process
1483  * context, if a reader task is waiting. Apparently, this exciting
1484  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1485  * failed somewhere. Latency? Burstiness? Well, at least now we will
1486  * see why it failed. 8)8)                               --ANK
1487  *
1488  */
1489 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1490 {
1491         struct tcp_sock *tp = tcp_sk(sk);
1492
1493         if (sysctl_tcp_low_latency || !tp->ucopy.task)
1494                 return false;
1495
1496         if (skb->len <= tcp_hdrlen(skb) &&
1497             skb_queue_len(&tp->ucopy.prequeue) == 0)
1498                 return false;
1499
1500         /* Before escaping RCU protected region, we need to take care of skb
1501          * dst. Prequeue is only enabled for established sockets.
1502          * For such sockets, we might need the skb dst only to set sk->sk_rx_dst.
1503          * Instead of doing a full sk_rx_dst validity check here, let's perform
1504          * an optimistic check.
1505          */
1506         if (likely(sk->sk_rx_dst))
1507                 skb_dst_drop(skb);
1508         else
1509                 skb_dst_force_safe(skb);
1510
1511         __skb_queue_tail(&tp->ucopy.prequeue, skb);
1512         tp->ucopy.memory += skb->truesize;
1513         if (tp->ucopy.memory > sk->sk_rcvbuf) {
1514                 struct sk_buff *skb1;
1515
1516                 BUG_ON(sock_owned_by_user(sk));
1517
1518                 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1519                         sk_backlog_rcv(sk, skb1);
1520                         NET_INC_STATS_BH(sock_net(sk),
1521                                          LINUX_MIB_TCPPREQUEUEDROPPED);
1522                 }
1523
1524                 tp->ucopy.memory = 0;
1525         } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1526                 wake_up_interruptible_sync_poll(sk_sleep(sk),
1527                                            POLLIN | POLLRDNORM | POLLRDBAND);
1528                 if (!inet_csk_ack_scheduled(sk))
1529                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1530                                                   (3 * tcp_rto_min(sk)) / 4,
1531                                                   TCP_RTO_MAX);
1532         }
1533         return true;
1534 }
1535 EXPORT_SYMBOL(tcp_prequeue);
1536
1537 /*
1538  *      From tcp_input.c
1539  */
1540
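/* Main IPv4 receive path: validate the header and checksum, fill in
 * TCP_SKB_CB(), look up the owning socket and hand the segment to the
 * state machine, queueing to the backlog or prequeue when the socket
 * is owned by user context.
 */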
1541 int tcp_v4_rcv(struct sk_buff *skb)
1542 {
1543         const struct iphdr *iph;
1544         const struct tcphdr *th;
1545         struct sock *sk;
1546         int ret;
1547         struct net *net = dev_net(skb->dev);
1548
1549         if (skb->pkt_type != PACKET_HOST)
1550                 goto discard_it;
1551
1552         /* Count it even if it's bad */
1553         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1554
1555         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1556                 goto discard_it;
1557
1558         th = tcp_hdr(skb);
1559
1560         if (th->doff < sizeof(struct tcphdr) / 4)
1561                 goto bad_packet;
1562         if (!pskb_may_pull(skb, th->doff * 4))
1563                 goto discard_it;
1564
1565         /* An explanation is required here, I think.
1566          * Packet length and doff are validated by header prediction,
1567          * provided the th->doff==0 case is eliminated.
1568          * So we defer the checks. */
1569
1570         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1571                 goto csum_error;
1572
1573         th = tcp_hdr(skb);
1574         iph = ip_hdr(skb);
1575         /* This is tricky: we move the IPCB to its correct location inside TCP_SKB_CB();
1576          * barrier() makes sure the compiler won't play aliasing games.
1577          */
1578         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1579                 sizeof(struct inet_skb_parm));
1580         barrier();
1581
1582         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1583         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1584                                     skb->len - th->doff * 4);
1585         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1586         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1587         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1588         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1589         TCP_SKB_CB(skb)->sacked  = 0;
1590
1591 lookup:
1592         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1593         if (!sk)
1594                 goto no_tcp_socket;
1595
1596 process:
1597         if (sk->sk_state == TCP_TIME_WAIT)
1598                 goto do_time_wait;
1599
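        /* A request socket matched.  Verify the listener is still in
         * TCP_LISTEN and let tcp_check_req() either complete the handshake
         * (creating a child socket) or drop the request; if the listener
         * went away, retry the lookup.
         */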
1600         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1601                 struct request_sock *req = inet_reqsk(sk);
1602                 struct sock *nsk = NULL;
1603
1604                 sk = req->rsk_listener;
1605                 if (tcp_v4_inbound_md5_hash(sk, skb))
1606                         goto discard_and_relse;
1607                 if (likely(sk->sk_state == TCP_LISTEN)) {
1608                         nsk = tcp_check_req(sk, skb, req, false);
1609                 } else {
1610                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1611                         goto lookup;
1612                 }
1613                 if (!nsk) {
1614                         reqsk_put(req);
1615                         goto discard_it;
1616                 }
1617                 if (nsk == sk) {
1618                         sock_hold(sk);
1619                         reqsk_put(req);
1620                 } else if (tcp_child_process(sk, nsk, skb)) {
1621                         tcp_v4_send_reset(nsk, skb);
1622                         goto discard_it;
1623                 } else {
1624                         return 0;
1625                 }
1626         }
1627         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1628                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1629                 goto discard_and_relse;
1630         }
1631
1632         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1633                 goto discard_and_relse;
1634
1635         if (tcp_v4_inbound_md5_hash(sk, skb))
1636                 goto discard_and_relse;
1637
1638         nf_reset(skb);
1639
1640         if (sk_filter(sk, skb))
1641                 goto discard_and_relse;
1642
1643         skb->dev = NULL;
1644
1645         if (sk->sk_state == TCP_LISTEN) {
1646                 ret = tcp_v4_do_rcv(sk, skb);
1647                 goto put_and_return;
1648         }
1649
1650         sk_incoming_cpu_update(sk);
1651
1652         bh_lock_sock_nested(sk);
1653         tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
1654         ret = 0;
1655         if (!sock_owned_by_user(sk)) {
1656                 if (!tcp_prequeue(sk, skb))
1657                         ret = tcp_v4_do_rcv(sk, skb);
1658         } else if (unlikely(sk_add_backlog(sk, skb,
1659                                            sk->sk_rcvbuf + sk->sk_sndbuf))) {
1660                 bh_unlock_sock(sk);
1661                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1662                 goto discard_and_relse;
1663         }
1664         bh_unlock_sock(sk);
1665
1666 put_and_return:
1667         sock_put(sk);
1668
1669         return ret;
1670
1671 no_tcp_socket:
1672         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1673                 goto discard_it;
1674
1675         if (tcp_checksum_complete(skb)) {
1676 csum_error:
1677                 TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
1678 bad_packet:
1679                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1680         } else {
1681                 tcp_v4_send_reset(NULL, skb);
1682         }
1683
1684 discard_it:
1685         /* Discard frame. */
1686         kfree_skb(skb);
1687         return 0;
1688
1689 discard_and_relse:
1690         sock_put(sk);
1691         goto discard_it;
1692
1693 do_time_wait:
1694         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1695                 inet_twsk_put(inet_twsk(sk));
1696                 goto discard_it;
1697         }
1698
1699         if (tcp_checksum_complete(skb)) {
1700                 inet_twsk_put(inet_twsk(sk));
1701                 goto csum_error;
1702         }
1703         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1704         case TCP_TW_SYN: {
1705                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1706                                                         &tcp_hashinfo,
1707                                                         iph->saddr, th->source,
1708                                                         iph->daddr, th->dest,
1709                                                         inet_iif(skb));
1710                 if (sk2) {
1711                         inet_twsk_deschedule_put(inet_twsk(sk));
1712                         sk = sk2;
1713                         goto process;
1714                 }
1715                 /* Fall through to ACK */
1716         }
1717         case TCP_TW_ACK:
1718                 tcp_v4_timewait_ack(sk, skb);
1719                 break;
1720         case TCP_TW_RST:
1721                 tcp_v4_send_reset(sk, skb);
1722                 inet_twsk_deschedule_put(inet_twsk(sk));
1723                 goto discard_it;
1724         case TCP_TW_SUCCESS:;
1725         }
1726         goto discard_it;
1727 }
1728
1729 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1730         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1731         .twsk_unique    = tcp_twsk_unique,
1732         .twsk_destructor= tcp_twsk_destructor,
1733 };
1734
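/* Cache the input route on the socket so that tcp_v4_early_demux() and the
 * established fast path in tcp_v4_do_rcv() can reuse it for later packets.
 */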
1735 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1736 {
1737         struct dst_entry *dst = skb_dst(skb);
1738
1739         if (dst && dst_hold_safe(dst)) {
1740                 sk->sk_rx_dst = dst;
1741                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1742         }
1743 }
1744 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1745
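/* IPv4-specific operations used by the address-family independent
 * connection code (struct inet_connection_sock_af_ops).
 */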
1746 const struct inet_connection_sock_af_ops ipv4_specific = {
1747         .queue_xmit        = ip_queue_xmit,
1748         .send_check        = tcp_v4_send_check,
1749         .rebuild_header    = inet_sk_rebuild_header,
1750         .sk_rx_dst_set     = inet_sk_rx_dst_set,
1751         .conn_request      = tcp_v4_conn_request,
1752         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1753         .net_header_len    = sizeof(struct iphdr),
1754         .setsockopt        = ip_setsockopt,
1755         .getsockopt        = ip_getsockopt,
1756         .addr2sockaddr     = inet_csk_addr2sockaddr,
1757         .sockaddr_len      = sizeof(struct sockaddr_in),
1758         .bind_conflict     = inet_csk_bind_conflict,
1759 #ifdef CONFIG_COMPAT
1760         .compat_setsockopt = compat_ip_setsockopt,
1761         .compat_getsockopt = compat_ip_getsockopt,
1762 #endif
1763         .mtu_reduced       = tcp_v4_mtu_reduced,
1764 };
1765 EXPORT_SYMBOL(ipv4_specific);
1766
1767 #ifdef CONFIG_TCP_MD5SIG
1768 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1769         .md5_lookup             = tcp_v4_md5_lookup,
1770         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1771         .md5_parse              = tcp_v4_parse_md5_keys,
1772 };
1773 #endif
1774
1775 /* NOTE: A lot of things are set to zero explicitly by the call to
1776  *       sk_alloc(), so they need not be done here.
1777  */
1778 static int tcp_v4_init_sock(struct sock *sk)
1779 {
1780         struct inet_connection_sock *icsk = inet_csk(sk);
1781
1782         tcp_init_sock(sk);
1783
1784         icsk->icsk_af_ops = &ipv4_specific;
1785
1786 #ifdef CONFIG_TCP_MD5SIG
1787         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1788 #endif
1789
1790         return 0;
1791 }
1792
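/* Release all TCP-private resources attached to a socket: timers, queued
 * data, MD5 keys, the prequeue and any bound port reference.
 */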
1793 void tcp_v4_destroy_sock(struct sock *sk)
1794 {
1795         struct tcp_sock *tp = tcp_sk(sk);
1796
1797         tcp_clear_xmit_timers(sk);
1798
1799         tcp_cleanup_congestion_control(sk);
1800
1801         /* Clean up the write buffer. */
1802         tcp_write_queue_purge(sk);
1803
1804         /* Cleans up our, hopefully empty, out_of_order_queue. */
1805         __skb_queue_purge(&tp->out_of_order_queue);
1806
1807 #ifdef CONFIG_TCP_MD5SIG
1808         /* Clean up the MD5 key list, if any */
1809         if (tp->md5sig_info) {
1810                 tcp_clear_md5_list(sk);
1811                 kfree_rcu(tp->md5sig_info, rcu);
1812                 tp->md5sig_info = NULL;
1813         }
1814 #endif
1815
1816         /* Clean up the prequeue; it should really be empty already. */
1817         __skb_queue_purge(&tp->ucopy.prequeue);
1818
1819         /* Clean up a referenced TCP bind bucket. */
1820         if (inet_csk(sk)->icsk_bind_hash)
1821                 inet_put_port(sk);
1822
1823         BUG_ON(tp->fastopen_rsk);
1824
1825         /* If socket is aborted during connect operation */
1826         tcp_free_fastopen_req(tp);
1827         tcp_saved_syn_free(tp);
1828
1829         sk_sockets_allocated_dec(sk);
1830
1831         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
1832                 sock_release_memcg(sk);
1833 }
1834 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1835
1836 #ifdef CONFIG_PROC_FS
1837 /* Proc filesystem TCP sock list dumping. */
1838
1839 /*
1840  * Get the next listener socket following cur.  If cur is NULL, get the first
1841  * socket starting from the bucket given in st->bucket; when st->bucket is
1842  * zero, the very first socket in the hash table is returned.
1843  */
1844 static void *listening_get_next(struct seq_file *seq, void *cur)
1845 {
1846         struct inet_connection_sock *icsk;
1847         struct hlist_nulls_node *node;
1848         struct sock *sk = cur;
1849         struct inet_listen_hashbucket *ilb;
1850         struct tcp_iter_state *st = seq->private;
1851         struct net *net = seq_file_net(seq);
1852
1853         if (!sk) {
1854                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1855                 spin_lock_bh(&ilb->lock);
1856                 sk = sk_nulls_head(&ilb->head);
1857                 st->offset = 0;
1858                 goto get_sk;
1859         }
1860         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1861         ++st->num;
1862         ++st->offset;
1863
1864         sk = sk_nulls_next(sk);
1865 get_sk:
1866         sk_nulls_for_each_from(sk, node) {
1867                 if (!net_eq(sock_net(sk), net))
1868                         continue;
1869                 if (sk->sk_family == st->family) {
1870                         cur = sk;
1871                         goto out;
1872                 }
1873                 icsk = inet_csk(sk);
1874         }
1875         spin_unlock_bh(&ilb->lock);
1876         st->offset = 0;
1877         if (++st->bucket < INET_LHTABLE_SIZE) {
1878                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1879                 spin_lock_bh(&ilb->lock);
1880                 sk = sk_nulls_head(&ilb->head);
1881                 goto get_sk;
1882         }
1883         cur = NULL;
1884 out:
1885         return cur;
1886 }
1887
1888 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1889 {
1890         struct tcp_iter_state *st = seq->private;
1891         void *rc;
1892
1893         st->bucket = 0;
1894         st->offset = 0;
1895         rc = listening_get_next(seq, NULL);
1896
1897         while (rc && *pos) {
1898                 rc = listening_get_next(seq, rc);
1899                 --*pos;
1900         }
1901         return rc;
1902 }
1903
1904 static inline bool empty_bucket(const struct tcp_iter_state *st)
1905 {
1906         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1907 }
1908
1909 /*
1910  * Get the first established socket, starting from the bucket given in st->bucket.
1911  * If st->bucket is zero, the very first socket in the hash is returned.
1912  */
1913 static void *established_get_first(struct seq_file *seq)
1914 {
1915         struct tcp_iter_state *st = seq->private;
1916         struct net *net = seq_file_net(seq);
1917         void *rc = NULL;
1918
1919         st->offset = 0;
1920         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1921                 struct sock *sk;
1922                 struct hlist_nulls_node *node;
1923                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1924
1925                 /* Lockless fast path for the common case of empty buckets */
1926                 if (empty_bucket(st))
1927                         continue;
1928
1929                 spin_lock_bh(lock);
1930                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1931                         if (sk->sk_family != st->family ||
1932                             !net_eq(sock_net(sk), net)) {
1933                                 continue;
1934                         }
1935                         rc = sk;
1936                         goto out;
1937                 }
1938                 spin_unlock_bh(lock);
1939         }
1940 out:
1941         return rc;
1942 }
1943
1944 static void *established_get_next(struct seq_file *seq, void *cur)
1945 {
1946         struct sock *sk = cur;
1947         struct hlist_nulls_node *node;
1948         struct tcp_iter_state *st = seq->private;
1949         struct net *net = seq_file_net(seq);
1950
1951         ++st->num;
1952         ++st->offset;
1953
1954         sk = sk_nulls_next(sk);
1955
1956         sk_nulls_for_each_from(sk, node) {
1957                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1958                         return sk;
1959         }
1960
1961         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1962         ++st->bucket;
1963         return established_get_first(seq);
1964 }
1965
1966 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1967 {
1968         struct tcp_iter_state *st = seq->private;
1969         void *rc;
1970
1971         st->bucket = 0;
1972         rc = established_get_first(seq);
1973
1974         while (rc && pos) {
1975                 rc = established_get_next(seq, rc);
1976                 --pos;
1977         }
1978         return rc;
1979 }
1980
1981 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1982 {
1983         void *rc;
1984         struct tcp_iter_state *st = seq->private;
1985
1986         st->state = TCP_SEQ_STATE_LISTENING;
1987         rc        = listening_get_idx(seq, &pos);
1988
1989         if (!rc) {
1990                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1991                 rc        = established_get_idx(seq, pos);
1992         }
1993
1994         return rc;
1995 }
1996
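/* Resume iteration at the position recorded by the previous read so that a
 * sequential read of the seq_file does not rescan the hash tables from the
 * beginning on every call.
 */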
1997 static void *tcp_seek_last_pos(struct seq_file *seq)
1998 {
1999         struct tcp_iter_state *st = seq->private;
2000         int offset = st->offset;
2001         int orig_num = st->num;
2002         void *rc = NULL;
2003
2004         switch (st->state) {
2005         case TCP_SEQ_STATE_LISTENING:
2006                 if (st->bucket >= INET_LHTABLE_SIZE)
2007                         break;
2008                 st->state = TCP_SEQ_STATE_LISTENING;
2009                 rc = listening_get_next(seq, NULL);
2010                 while (offset-- && rc)
2011                         rc = listening_get_next(seq, rc);
2012                 if (rc)
2013                         break;
2014                 st->bucket = 0;
2015                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2016                 /* Fallthrough */
2017         case TCP_SEQ_STATE_ESTABLISHED:
2018                 if (st->bucket > tcp_hashinfo.ehash_mask)
2019                         break;
2020                 rc = established_get_first(seq);
2021                 while (offset-- && rc)
2022                         rc = established_get_next(seq, rc);
2023         }
2024
2025         st->num = orig_num;
2026
2027         return rc;
2028 }
2029
2030 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2031 {
2032         struct tcp_iter_state *st = seq->private;
2033         void *rc;
2034
2035         if (*pos && *pos == st->last_pos) {
2036                 rc = tcp_seek_last_pos(seq);
2037                 if (rc)
2038                         goto out;
2039         }
2040
2041         st->state = TCP_SEQ_STATE_LISTENING;
2042         st->num = 0;
2043         st->bucket = 0;
2044         st->offset = 0;
2045         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2046
2047 out:
2048         st->last_pos = *pos;
2049         return rc;
2050 }
2051
2052 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2053 {
2054         struct tcp_iter_state *st = seq->private;
2055         void *rc = NULL;
2056
2057         if (v == SEQ_START_TOKEN) {
2058                 rc = tcp_get_idx(seq, 0);
2059                 goto out;
2060         }
2061
2062         switch (st->state) {
2063         case TCP_SEQ_STATE_LISTENING:
2064                 rc = listening_get_next(seq, v);
2065                 if (!rc) {
2066                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2067                         st->bucket = 0;
2068                         st->offset = 0;
2069                         rc        = established_get_first(seq);
2070                 }
2071                 break;
2072         case TCP_SEQ_STATE_ESTABLISHED:
2073                 rc = established_get_next(seq, v);
2074                 break;
2075         }
2076 out:
2077         ++*pos;
2078         st->last_pos = *pos;
2079         return rc;
2080 }
2081
2082 static void tcp_seq_stop(struct seq_file *seq, void *v)
2083 {
2084         struct tcp_iter_state *st = seq->private;
2085
2086         switch (st->state) {
2087         case TCP_SEQ_STATE_LISTENING:
2088                 if (v != SEQ_START_TOKEN)
2089                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2090                 break;
2091         case TCP_SEQ_STATE_ESTABLISHED:
2092                 if (v)
2093                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2094                 break;
2095         }
2096 }
2097
2098 int tcp_seq_open(struct inode *inode, struct file *file)
2099 {
2100         struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2101         struct tcp_iter_state *s;
2102         int err;
2103
2104         err = seq_open_net(inode, file, &afinfo->seq_ops,
2105                           sizeof(struct tcp_iter_state));
2106         if (err < 0)
2107                 return err;
2108
2109         s = ((struct seq_file *)file->private_data)->private;
2110         s->family               = afinfo->family;
2111         s->last_pos             = 0;
2112         return 0;
2113 }
2114 EXPORT_SYMBOL(tcp_seq_open);
2115
2116 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2117 {
2118         int rc = 0;
2119         struct proc_dir_entry *p;
2120
2121         afinfo->seq_ops.start           = tcp_seq_start;
2122         afinfo->seq_ops.next            = tcp_seq_next;
2123         afinfo->seq_ops.stop            = tcp_seq_stop;
2124
2125         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2126                              afinfo->seq_fops, afinfo);
2127         if (!p)
2128                 rc = -ENOMEM;
2129         return rc;
2130 }
2131 EXPORT_SYMBOL(tcp_proc_register);
2132
2133 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2134 {
2135         remove_proc_entry(afinfo->name, net->proc_net);
2136 }
2137 EXPORT_SYMBOL(tcp_proc_unregister);
2138
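/* Format one SYN_RECV request socket as a /proc/net/tcp line. */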
2139 static void get_openreq4(const struct request_sock *req,
2140                          struct seq_file *f, int i)
2141 {
2142         const struct inet_request_sock *ireq = inet_rsk(req);
2143         long delta = req->rsk_timer.expires - jiffies;
2144
2145         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2146                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2147                 i,
2148                 ireq->ir_loc_addr,
2149                 ireq->ir_num,
2150                 ireq->ir_rmt_addr,
2151                 ntohs(ireq->ir_rmt_port),
2152                 TCP_SYN_RECV,
2153                 0, 0, /* could print option size, but that is af dependent. */
2154                 1,    /* timers active (only the expire timer) */
2155                 jiffies_delta_to_clock_t(delta),
2156                 req->num_timeout,
2157                 from_kuid_munged(seq_user_ns(f),
2158                                  sock_i_uid(req->rsk_listener)),
2159                 0,  /* non standard timer */
2160                 0, /* open_requests have no inode */
2161                 0,
2162                 req);
2163 }
2164
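/* Format one full socket as a /proc/net/tcp line.  timer_active encodes the
 * pending timer: 1 retransmit/loss probe, 2 keepalive, 4 zero window probe,
 * 0 none.
 */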
2165 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2166 {
2167         int timer_active;
2168         unsigned long timer_expires;
2169         const struct tcp_sock *tp = tcp_sk(sk);
2170         const struct inet_connection_sock *icsk = inet_csk(sk);
2171         const struct inet_sock *inet = inet_sk(sk);
2172         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2173         __be32 dest = inet->inet_daddr;
2174         __be32 src = inet->inet_rcv_saddr;
2175         __u16 destp = ntohs(inet->inet_dport);
2176         __u16 srcp = ntohs(inet->inet_sport);
2177         int rx_queue;
2178         int state;
2179
2180         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2181             icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2182             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2183                 timer_active    = 1;
2184                 timer_expires   = icsk->icsk_timeout;
2185         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2186                 timer_active    = 4;
2187                 timer_expires   = icsk->icsk_timeout;
2188         } else if (timer_pending(&sk->sk_timer)) {
2189                 timer_active    = 2;
2190                 timer_expires   = sk->sk_timer.expires;
2191         } else {
2192                 timer_active    = 0;
2193                 timer_expires = jiffies;
2194         }
2195
2196         state = sk_state_load(sk);
2197         if (state == TCP_LISTEN)
2198                 rx_queue = sk->sk_ack_backlog;
2199         else
2200                 /* Because we don't lock the socket,
2201                  * we might find a transient negative value.
2202                  */
2203                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2204
2205         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2206                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2207                 i, src, srcp, dest, destp, state,
2208                 tp->write_seq - tp->snd_una,
2209                 rx_queue,
2210                 timer_active,
2211                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2212                 icsk->icsk_retransmits,
2213                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2214                 icsk->icsk_probes_out,
2215                 sock_i_ino(sk),
2216                 atomic_read(&sk->sk_refcnt), sk,
2217                 jiffies_to_clock_t(icsk->icsk_rto),
2218                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2219                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2220                 tp->snd_cwnd,
2221                 state == TCP_LISTEN ?
2222                     fastopenq->max_qlen :
2223                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2224 }
2225
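/* Format one TIME_WAIT socket as a /proc/net/tcp line. */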
2226 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2227                                struct seq_file *f, int i)
2228 {
2229         long delta = tw->tw_timer.expires - jiffies;
2230         __be32 dest, src;
2231         __u16 destp, srcp;
2232
2233         dest  = tw->tw_daddr;
2234         src   = tw->tw_rcv_saddr;
2235         destp = ntohs(tw->tw_dport);
2236         srcp  = ntohs(tw->tw_sport);
2237
2238         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2239                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2240                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2241                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2242                 atomic_read(&tw->tw_refcnt), tw);
2243 }
2244
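/* Every /proc/net/tcp entry is padded to a fixed width of TMPSZ - 1 bytes. */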
2245 #define TMPSZ 150
2246
2247 static int tcp4_seq_show(struct seq_file *seq, void *v)
2248 {
2249         struct tcp_iter_state *st;
2250         struct sock *sk = v;
2251
2252         seq_setwidth(seq, TMPSZ - 1);
2253         if (v == SEQ_START_TOKEN) {
2254                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2255                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2256                            "inode");
2257                 goto out;
2258         }
2259         st = seq->private;
2260
2261         if (sk->sk_state == TCP_TIME_WAIT)
2262                 get_timewait4_sock(v, seq, st->num);
2263         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2264                 get_openreq4(v, seq, st->num);
2265         else
2266                 get_tcp4_sock(v, seq, st->num);
2267 out:
2268         seq_pad(seq, '\n');
2269         return 0;
2270 }
2271
2272 static const struct file_operations tcp_afinfo_seq_fops = {
2273         .owner   = THIS_MODULE,
2274         .open    = tcp_seq_open,
2275         .read    = seq_read,
2276         .llseek  = seq_lseek,
2277         .release = seq_release_net
2278 };
2279
2280 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2281         .name           = "tcp",
2282         .family         = AF_INET,
2283         .seq_fops       = &tcp_afinfo_seq_fops,
2284         .seq_ops        = {
2285                 .show           = tcp4_seq_show,
2286         },
2287 };
2288
2289 static int __net_init tcp4_proc_init_net(struct net *net)
2290 {
2291         return tcp_proc_register(net, &tcp4_seq_afinfo);
2292 }
2293
2294 static void __net_exit tcp4_proc_exit_net(struct net *net)
2295 {
2296         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2297 }
2298
2299 static struct pernet_operations tcp4_net_ops = {
2300         .init = tcp4_proc_init_net,
2301         .exit = tcp4_proc_exit_net,
2302 };
2303
2304 int __init tcp4_proc_init(void)
2305 {
2306         return register_pernet_subsys(&tcp4_net_ops);
2307 }
2308
2309 void tcp4_proc_exit(void)
2310 {
2311         unregister_pernet_subsys(&tcp4_net_ops);
2312 }
2313 #endif /* CONFIG_PROC_FS */
2314
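/* Protocol descriptor for IPv4 TCP sockets; it maps the generic socket
 * operations onto their TCP implementations.
 */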
2315 struct proto tcp_prot = {
2316         .name                   = "TCP",
2317         .owner                  = THIS_MODULE,
2318         .close                  = tcp_close,
2319         .connect                = tcp_v4_connect,
2320         .disconnect             = tcp_disconnect,
2321         .accept                 = inet_csk_accept,
2322         .ioctl                  = tcp_ioctl,
2323         .init                   = tcp_v4_init_sock,
2324         .destroy                = tcp_v4_destroy_sock,
2325         .shutdown               = tcp_shutdown,
2326         .setsockopt             = tcp_setsockopt,
2327         .getsockopt             = tcp_getsockopt,
2328         .recvmsg                = tcp_recvmsg,
2329         .sendmsg                = tcp_sendmsg,
2330         .sendpage               = tcp_sendpage,
2331         .backlog_rcv            = tcp_v4_do_rcv,
2332         .release_cb             = tcp_release_cb,
2333         .hash                   = inet_hash,
2334         .unhash                 = inet_unhash,
2335         .get_port               = inet_csk_get_port,
2336         .enter_memory_pressure  = tcp_enter_memory_pressure,
2337         .stream_memory_free     = tcp_stream_memory_free,
2338         .sockets_allocated      = &tcp_sockets_allocated,
2339         .orphan_count           = &tcp_orphan_count,
2340         .memory_allocated       = &tcp_memory_allocated,
2341         .memory_pressure        = &tcp_memory_pressure,
2342         .sysctl_mem             = sysctl_tcp_mem,
2343         .sysctl_wmem            = sysctl_tcp_wmem,
2344         .sysctl_rmem            = sysctl_tcp_rmem,
2345         .max_header             = MAX_TCP_HEADER,
2346         .obj_size               = sizeof(struct tcp_sock),
2347         .slab_flags             = SLAB_DESTROY_BY_RCU,
2348         .twsk_prot              = &tcp_timewait_sock_ops,
2349         .rsk_prot               = &tcp_request_sock_ops,
2350         .h.hashinfo             = &tcp_hashinfo,
2351         .no_autobind            = true,
2352 #ifdef CONFIG_COMPAT
2353         .compat_setsockopt      = compat_tcp_setsockopt,
2354         .compat_getsockopt      = compat_tcp_getsockopt,
2355 #endif
2356         .diag_destroy           = tcp_abort,
2357 };
2358 EXPORT_SYMBOL(tcp_prot);
2359
2360 static void __net_exit tcp_sk_exit(struct net *net)
2361 {
2362         int cpu;
2363
2364         for_each_possible_cpu(cpu)
2365                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2366         free_percpu(net->ipv4.tcp_sk);
2367 }
2368
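/* Per-namespace initialisation: create one control socket per possible CPU,
 * used when sending resets and ACKs that are not tied to a full socket, and
 * set the namespace's TCP sysctl defaults.
 */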
2369 static int __net_init tcp_sk_init(struct net *net)
2370 {
2371         int res, cpu;
2372
2373         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2374         if (!net->ipv4.tcp_sk)
2375                 return -ENOMEM;
2376
2377         for_each_possible_cpu(cpu) {
2378                 struct sock *sk;
2379
2380                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2381                                            IPPROTO_TCP, net);
2382                 if (res)
2383                         goto fail;
2384                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2385         }
2386
2387         net->ipv4.sysctl_tcp_ecn = 2;
2388         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2389
2390         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2391         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2392         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2393
2394         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2395         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2396         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2397
2398         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2399         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2400         net->ipv4.sysctl_tcp_syncookies = 1;
2401         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2402         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2403         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2404         net->ipv4.sysctl_tcp_orphan_retries = 0;
2405         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2406         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2407
2408         return 0;
2409 fail:
2410         tcp_sk_exit(net);
2411
2412         return res;
2413 }
2414
2415 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2416 {
2417         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2418 }
2419
2420 static struct pernet_operations __net_initdata tcp_sk_ops = {
2421        .init       = tcp_sk_init,
2422        .exit       = tcp_sk_exit,
2423        .exit_batch = tcp_sk_exit_batch,
2424 };
2425
2426 void __init tcp_v4_init(void)
2427 {
2428         inet_hashinfo_init(&tcp_hashinfo);
2429         if (register_pernet_subsys(&tcp_sk_ops))
2430                 panic("Failed to create the TCP control socket.\n");
2431 }