1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Add support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
45  *                                      coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53 #define pr_fmt(fmt) "TCP: " fmt
54
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83
84 #include <linux/crypto.h>
85 #include <linux/scatterlist.h>
86
87 int sysctl_tcp_tw_reuse __read_mostly;
88 int sysctl_tcp_low_latency __read_mostly;
89 EXPORT_SYMBOL(sysctl_tcp_low_latency);
90
91 #ifdef CONFIG_TCP_MD5SIG
92 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
93                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
94 #endif
95
96 struct inet_hashinfo tcp_hashinfo;
97 EXPORT_SYMBOL(tcp_hashinfo);
98
99 static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
100 {
101         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
102                                           ip_hdr(skb)->saddr,
103                                           tcp_hdr(skb)->dest,
104                                           tcp_hdr(skb)->source);
105 }
106
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
110         struct tcp_sock *tp = tcp_sk(sk);
111
112         /* With PAWS, it is safe from the viewpoint
113            of data integrity. Even without PAWS it is safe provided sequence
114            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
115
116            Actually, the idea is close to VJ's, only the timestamp cache is
117            held not per host but per port pair, and the TW bucket is used as the
118            state holder.
119
120            If the TW bucket has already been destroyed, we fall back to VJ's
121            scheme and use the initial timestamp retrieved from the peer table.
122          */
123         if (tcptw->tw_ts_recent_stamp &&
124             (!twp || (sysctl_tcp_tw_reuse &&
125                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
126                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
127                 if (tp->write_seq == 0)
128                         tp->write_seq = 1;
129                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
130                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
131                 sock_hold(sktw);
132                 return 1;
133         }
134
135         return 0;
136 }
137 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
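/*
 * Illustrative note, not part of the original file: the reuse path above is
 * gated by the net.ipv4.tcp_tw_reuse sysctl, which maps to the
 * sysctl_tcp_tw_reuse variable declared earlier in this file. A minimal
 * admin-side sketch of enabling it:
 *
 *	# sysctl -w net.ipv4.tcp_tw_reuse=1
 *	# echo 1 > /proc/sys/net/ipv4/tcp_tw_reuse
 *
 * Reuse is only considered for outgoing connections, and only when the
 * TIME_WAIT socket's last timestamp is more than one second old (see the
 * get_seconds() check above).
 */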
138
139 /* This will initiate an outgoing connection. */
140 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
141 {
142         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
143         struct inet_sock *inet = inet_sk(sk);
144         struct tcp_sock *tp = tcp_sk(sk);
145         __be16 orig_sport, orig_dport;
146         __be32 daddr, nexthop;
147         struct flowi4 *fl4;
148         struct rtable *rt;
149         int err;
150         struct ip_options_rcu *inet_opt;
151
152         if (addr_len < sizeof(struct sockaddr_in))
153                 return -EINVAL;
154
155         if (usin->sin_family != AF_INET)
156                 return -EAFNOSUPPORT;
157
158         nexthop = daddr = usin->sin_addr.s_addr;
159         inet_opt = rcu_dereference_protected(inet->inet_opt,
160                                              sock_owned_by_user(sk));
161         if (inet_opt && inet_opt->opt.srr) {
162                 if (!daddr)
163                         return -EINVAL;
164                 nexthop = inet_opt->opt.faddr;
165         }
166
167         orig_sport = inet->inet_sport;
168         orig_dport = usin->sin_port;
169         fl4 = &inet->cork.fl.u.ip4;
170         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
171                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
172                               IPPROTO_TCP,
173                               orig_sport, orig_dport, sk);
174         if (IS_ERR(rt)) {
175                 err = PTR_ERR(rt);
176                 if (err == -ENETUNREACH)
177                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
178                 return err;
179         }
180
181         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
182                 ip_rt_put(rt);
183                 return -ENETUNREACH;
184         }
185
186         if (!inet_opt || !inet_opt->opt.srr)
187                 daddr = fl4->daddr;
188
189         if (!inet->inet_saddr)
190                 inet->inet_saddr = fl4->saddr;
191         sk_rcv_saddr_set(sk, inet->inet_saddr);
192
193         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
194                 /* Reset inherited state */
195                 tp->rx_opt.ts_recent       = 0;
196                 tp->rx_opt.ts_recent_stamp = 0;
197                 if (likely(!tp->repair))
198                         tp->write_seq      = 0;
199         }
200
201         if (tcp_death_row.sysctl_tw_recycle &&
202             !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
203                 tcp_fetch_timewait_stamp(sk, &rt->dst);
204
205         inet->inet_dport = usin->sin_port;
206         sk_daddr_set(sk, daddr);
207
208         inet_csk(sk)->icsk_ext_hdr_len = 0;
209         if (inet_opt)
210                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
211
212         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
213
214         /* Socket identity is still unknown (sport may be zero).
215          * However, we set the state to SYN-SENT and, without releasing the socket
216          * lock, select a source port, enter ourselves into the hash tables and
217          * complete initialization after this.
218          */
219         tcp_set_state(sk, TCP_SYN_SENT);
220         err = inet_hash_connect(&tcp_death_row, sk);
221         if (err)
222                 goto failure;
223
224         sk_set_txhash(sk);
225
226         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
227                                inet->inet_sport, inet->inet_dport, sk);
228         if (IS_ERR(rt)) {
229                 err = PTR_ERR(rt);
230                 rt = NULL;
231                 goto failure;
232         }
233         /* OK, now commit destination to socket.  */
234         sk->sk_gso_type = SKB_GSO_TCPV4;
235         sk_setup_caps(sk, &rt->dst);
236
237         if (!tp->write_seq && likely(!tp->repair))
238                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
239                                                            inet->inet_daddr,
240                                                            inet->inet_sport,
241                                                            usin->sin_port);
242
243         inet->inet_id = tp->write_seq ^ jiffies;
244
245         err = tcp_connect(sk);
246
247         rt = NULL;
248         if (err)
249                 goto failure;
250
251         return 0;
252
253 failure:
254         /*
255          * This unhashes the socket and releases the local port,
256          * if necessary.
257          */
258         tcp_set_state(sk, TCP_CLOSE);
259         ip_rt_put(rt);
260         sk->sk_route_caps = 0;
261         inet->inet_dport = 0;
262         return err;
263 }
264 EXPORT_SYMBOL(tcp_v4_connect);
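/*
 * Illustrative only, not part of the original file: a minimal userspace
 * sketch of the call that ends up in tcp_v4_connect() via connect() on an
 * AF_INET stream socket. The address and port are placeholders.
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 */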
265
266 /*
267  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
268  * It can be called through tcp_release_cb() if socket was owned by user
269  * at the time tcp_v4_err() was called to handle ICMP message.
270  */
271 void tcp_v4_mtu_reduced(struct sock *sk)
272 {
273         struct dst_entry *dst;
274         struct inet_sock *inet = inet_sk(sk);
275         u32 mtu = tcp_sk(sk)->mtu_info;
276
277         dst = inet_csk_update_pmtu(sk, mtu);
278         if (!dst)
279                 return;
280
281         /* Something is about to go wrong... Remember the soft error
282          * in case this connection is not able to recover.
283          */
284         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
285                 sk->sk_err_soft = EMSGSIZE;
286
287         mtu = dst_mtu(dst);
288
289         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
290             ip_sk_accept_pmtu(sk) &&
291             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
292                 tcp_sync_mss(sk, mtu);
293
294                 /* Resend the TCP packet because it's
295                  * clear that the old packet has been
296                  * dropped. This is the new "fast" path mtu
297                  * discovery.
298                  */
299                 tcp_simple_retransmit(sk);
300         } /* else let the usual retransmit timer handle it */
301 }
302 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
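/*
 * Illustrative only, not part of the original file: the pmtudisc check above
 * means a socket that has opted out of path MTU discovery will not have its
 * MSS shrunk here. A userspace sketch of opting out:
 *
 *	int val = IP_PMTUDISC_DONT;
 *	setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val));
 */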
303
304 static void do_redirect(struct sk_buff *skb, struct sock *sk)
305 {
306         struct dst_entry *dst = __sk_dst_check(sk, 0);
307
308         if (dst)
309                 dst->ops->redirect(dst, sk, skb);
310 }
311
312
313 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
314 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
315 {
316         struct request_sock *req = inet_reqsk(sk);
317         struct net *net = sock_net(sk);
318
319         /* ICMPs are not backlogged, hence we cannot get
320          * an established socket here.
321          */
322         WARN_ON(req->sk);
323
324         if (seq != tcp_rsk(req)->snt_isn) {
325                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
326         } else if (abort) {
327                 /*
328                  * Still in SYN_RECV, just remove it silently.
329                  * There is no good way to pass the error to the newly
330                  * created socket, and POSIX does not want network
331                  * errors returned from accept().
332                  */
333                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
334                 NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
335         }
336         reqsk_put(req);
337 }
338 EXPORT_SYMBOL(tcp_req_err);
339
340 /*
341  * This routine is called by the ICMP module when it gets some
342  * sort of error condition.  If err < 0 then the socket should
343  * be closed and the error returned to the user.  If err > 0
344  * it's just the icmp type << 8 | icmp code.  After adjustment,
345  * the header points to the first 8 bytes of the tcp header.  We need
346  * to find the appropriate port.
347  *
348  * The locking strategy used here is very "optimistic". When
349  * someone else accesses the socket the ICMP is just dropped
350  * and for some paths there is no check at all.
351  * A more general error queue to queue errors for later handling
352  * is probably better.
353  *
354  */
355
356 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
357 {
358         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
359         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
360         struct inet_connection_sock *icsk;
361         struct tcp_sock *tp;
362         struct inet_sock *inet;
363         const int type = icmp_hdr(icmp_skb)->type;
364         const int code = icmp_hdr(icmp_skb)->code;
365         struct sock *sk;
366         struct sk_buff *skb;
367         struct request_sock *fastopen;
368         __u32 seq, snd_una;
369         __u32 remaining;
370         int err;
371         struct net *net = dev_net(icmp_skb->dev);
372
373         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
374                                        th->dest, iph->saddr, ntohs(th->source),
375                                        inet_iif(icmp_skb));
376         if (!sk) {
377                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
378                 return;
379         }
380         if (sk->sk_state == TCP_TIME_WAIT) {
381                 inet_twsk_put(inet_twsk(sk));
382                 return;
383         }
384         seq = ntohl(th->seq);
385         if (sk->sk_state == TCP_NEW_SYN_RECV)
386                 return tcp_req_err(sk, seq,
387                                   type == ICMP_PARAMETERPROB ||
388                                   type == ICMP_TIME_EXCEEDED ||
389                                   (type == ICMP_DEST_UNREACH &&
390                                    (code == ICMP_NET_UNREACH ||
391                                     code == ICMP_HOST_UNREACH)));
392
393         bh_lock_sock(sk);
394         /* If too many ICMPs get dropped on busy
395          * servers this needs to be solved differently.
396          * We do take care of PMTU discovery (RFC1191) special case :
397          * we can receive locally generated ICMP messages while socket is held.
398          */
399         if (sock_owned_by_user(sk)) {
400                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
401                         NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
402         }
403         if (sk->sk_state == TCP_CLOSE)
404                 goto out;
405
406         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
407                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
408                 goto out;
409         }
410
411         icsk = inet_csk(sk);
412         tp = tcp_sk(sk);
413         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
414         fastopen = tp->fastopen_rsk;
415         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
416         if (sk->sk_state != TCP_LISTEN &&
417             !between(seq, snd_una, tp->snd_nxt)) {
418                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
419                 goto out;
420         }
421
422         switch (type) {
423         case ICMP_REDIRECT:
424                 do_redirect(icmp_skb, sk);
425                 goto out;
426         case ICMP_SOURCE_QUENCH:
427                 /* Just silently ignore these. */
428                 goto out;
429         case ICMP_PARAMETERPROB:
430                 err = EPROTO;
431                 break;
432         case ICMP_DEST_UNREACH:
433                 if (code > NR_ICMP_UNREACH)
434                         goto out;
435
436                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
437                         /* We are not interested in TCP_LISTEN and open_requests
438                          * (SYN-ACKs sent out by Linux are always < 576 bytes, so
439                          * they should go through unfragmented).
440                          */
441                         if (sk->sk_state == TCP_LISTEN)
442                                 goto out;
443
444                         tp->mtu_info = info;
445                         if (!sock_owned_by_user(sk)) {
446                                 tcp_v4_mtu_reduced(sk);
447                         } else {
448                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
449                                         sock_hold(sk);
450                         }
451                         goto out;
452                 }
453
454                 err = icmp_err_convert[code].errno;
455                 /* check if icmp_skb allows revert of backoff
456                  * (see draft-zimmermann-tcp-lcd) */
457                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
458                         break;
459                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
460                     !icsk->icsk_backoff || fastopen)
461                         break;
462
463                 if (sock_owned_by_user(sk))
464                         break;
465
466                 icsk->icsk_backoff--;
467                 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
468                                                TCP_TIMEOUT_INIT;
469                 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
470
471                 skb = tcp_write_queue_head(sk);
472                 BUG_ON(!skb);
473
474                 remaining = icsk->icsk_rto -
475                             min(icsk->icsk_rto,
476                                 tcp_time_stamp - tcp_skb_timestamp(skb));
477
478                 if (remaining) {
479                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
480                                                   remaining, TCP_RTO_MAX);
481                 } else {
482                         /* RTO revert clocked out retransmission.
483                          * Will retransmit now */
484                         tcp_retransmit_timer(sk);
485                 }
486
487                 break;
488         case ICMP_TIME_EXCEEDED:
489                 err = EHOSTUNREACH;
490                 break;
491         default:
492                 goto out;
493         }
494
495         switch (sk->sk_state) {
496         case TCP_SYN_SENT:
497         case TCP_SYN_RECV:
498                 /* Only in fast or simultaneous open. If a fast open socket is
499                  * already accepted, it is treated as a connected one below.
500                  */
501                 if (fastopen && !fastopen->sk)
502                         break;
503
504                 if (!sock_owned_by_user(sk)) {
505                         sk->sk_err = err;
506
507                         sk->sk_error_report(sk);
508
509                         tcp_done(sk);
510                 } else {
511                         sk->sk_err_soft = err;
512                 }
513                 goto out;
514         }
515
516         /* If we've already connected we will keep trying
517          * until we time out, or the user gives up.
518          *
519          * RFC 1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH to be
520          * considered hard errors (well, FRAG_FAILED too, but it is obsoleted
521          * by PMTU discovery).
522          *
523          * Note that in the modern internet, where routing is unreliable and
524          * broken firewalls sit in every dark corner sending random errors
525          * ordered by their masters, even these two messages finally lose
526          * their original sense (even Linux sends invalid PORT_UNREACHs).
527          *
528          * Now we are in compliance with RFCs.
529          *                                                      --ANK (980905)
530          */
531
532         inet = inet_sk(sk);
533         if (!sock_owned_by_user(sk) && inet->recverr) {
534                 sk->sk_err = err;
535                 sk->sk_error_report(sk);
536         } else  { /* Only an error on timeout */
537                 sk->sk_err_soft = err;
538         }
539
540 out:
541         bh_unlock_sock(sk);
542         sock_put(sk);
543 }
544
545 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
546 {
547         struct tcphdr *th = tcp_hdr(skb);
548
549         if (skb->ip_summed == CHECKSUM_PARTIAL) {
550                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
551                 skb->csum_start = skb_transport_header(skb) - skb->head;
552                 skb->csum_offset = offsetof(struct tcphdr, check);
553         } else {
554                 th->check = tcp_v4_check(skb->len, saddr, daddr,
555                                          csum_partial(th,
556                                                       th->doff << 2,
557                                                       skb->csum));
558         }
559 }
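/*
 * Illustrative only, not part of the original file: tcp_v4_check() above folds
 * in the standard IPv4 TCP pseudo-header (cf. struct tcp4_pseudohdr used for
 * MD5 hashing later in this file); the hardware (CHECKSUM_PARTIAL case) or
 * csum_partial() then adds the TCP header and payload. The pseudo-header is:
 *
 *	saddr    (4 bytes)  source IPv4 address
 *	daddr    (4 bytes)  destination IPv4 address
 *	pad      (1 byte)   zero
 *	protocol (1 byte)   IPPROTO_TCP (6)
 *	len      (2 bytes)  TCP segment length (header + data)
 */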
560
561 /* This routine computes an IPv4 TCP checksum. */
562 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
563 {
564         const struct inet_sock *inet = inet_sk(sk);
565
566         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
567 }
568 EXPORT_SYMBOL(tcp_v4_send_check);
569
570 /*
571  *      This routine will send an RST to the other tcp.
572  *
573  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
574  *                    for the reset?
575  *      Answer: if a packet caused an RST, it is not for a socket
576  *              existing in our system; if it did match a socket,
577  *              it would just be a duplicate segment or a bug in the other
578  *              side's TCP. So we build the reply based only on the
579  *              parameters that arrived with the segment.
580  *      Exception: precedence violation. We do not implement it in any case.
581  */
582
583 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
584 {
585         const struct tcphdr *th = tcp_hdr(skb);
586         struct {
587                 struct tcphdr th;
588 #ifdef CONFIG_TCP_MD5SIG
589                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
590 #endif
591         } rep;
592         struct ip_reply_arg arg;
593 #ifdef CONFIG_TCP_MD5SIG
594         struct tcp_md5sig_key *key = NULL;
595         const __u8 *hash_location = NULL;
596         unsigned char newhash[16];
597         int genhash;
598         struct sock *sk1 = NULL;
599 #endif
600         struct net *net;
601
602         /* Never send a reset in response to a reset. */
603         if (th->rst)
604                 return;
605
606         /* If sk is not NULL, it means we did a successful lookup and the incoming
607          * route had to be correct. The prequeue might have dropped our dst.
608          */
609         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
610                 return;
611
612         /* Swap the send and the receive. */
613         memset(&rep, 0, sizeof(rep));
614         rep.th.dest   = th->source;
615         rep.th.source = th->dest;
616         rep.th.doff   = sizeof(struct tcphdr) / 4;
617         rep.th.rst    = 1;
618
619         if (th->ack) {
620                 rep.th.seq = th->ack_seq;
621         } else {
622                 rep.th.ack = 1;
623                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
624                                        skb->len - (th->doff << 2));
625         }
626
627         memset(&arg, 0, sizeof(arg));
628         arg.iov[0].iov_base = (unsigned char *)&rep;
629         arg.iov[0].iov_len  = sizeof(rep.th);
630
631         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
632 #ifdef CONFIG_TCP_MD5SIG
633         hash_location = tcp_parse_md5sig_option(th);
634         if (sk && sk_fullsock(sk)) {
635                 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
636                                         &ip_hdr(skb)->saddr, AF_INET);
637         } else if (hash_location) {
638                 /*
639                  * The active side is lost. Try to find the listening socket through
640                  * the source port, and then find the md5 key through that socket.
641                  * We do not lose security here:
642                  * the incoming packet is checked against the md5 hash of the found
643                  * key, and no RST is generated if the hash doesn't match.
644                  */
645                 sk1 = __inet_lookup_listener(net,
646                                              &tcp_hashinfo, ip_hdr(skb)->saddr,
647                                              th->source, ip_hdr(skb)->daddr,
648                                              ntohs(th->source), inet_iif(skb));
649                 /* don't send rst if it can't find key */
650                 if (!sk1)
651                         return;
652                 rcu_read_lock();
653                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
654                                         &ip_hdr(skb)->saddr, AF_INET);
655                 if (!key)
656                         goto release_sk1;
657
658                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
659                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
660                         goto release_sk1;
661         }
662
663         if (key) {
664                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
665                                    (TCPOPT_NOP << 16) |
666                                    (TCPOPT_MD5SIG << 8) |
667                                    TCPOLEN_MD5SIG);
668                 /* Update length and the length the header thinks exists */
669                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
670                 rep.th.doff = arg.iov[0].iov_len / 4;
671
672                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
673                                      key, ip_hdr(skb)->saddr,
674                                      ip_hdr(skb)->daddr, &rep.th);
675         }
676 #endif
677         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
678                                       ip_hdr(skb)->saddr, /* XXX */
679                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
680         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
681         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
682
683         /* When the socket is gone, all binding information is lost and
684          * routing might fail in this case. No choice here: if we force the
685          * input interface, we will misroute in case of an asymmetric route.
686          */
687         if (sk)
688                 arg.bound_dev_if = sk->sk_bound_dev_if;
689
690         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
691                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
692
693         arg.tos = ip_hdr(skb)->tos;
694         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
695                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
696                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
697                               &arg, arg.iov[0].iov_len);
698
699         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
700         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
701
702 #ifdef CONFIG_TCP_MD5SIG
703 release_sk1:
704         if (sk1) {
705                 rcu_read_unlock();
706                 sock_put(sk1);
707         }
708 #endif
709 }
710
711 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
712    outside socket context, is certainly ugly. What can I do?
713  */
714
715 static void tcp_v4_send_ack(struct net *net,
716                             struct sk_buff *skb, u32 seq, u32 ack,
717                             u32 win, u32 tsval, u32 tsecr, int oif,
718                             struct tcp_md5sig_key *key,
719                             int reply_flags, u8 tos)
720 {
721         const struct tcphdr *th = tcp_hdr(skb);
722         struct {
723                 struct tcphdr th;
724                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
725 #ifdef CONFIG_TCP_MD5SIG
726                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
727 #endif
728                         ];
729         } rep;
730         struct ip_reply_arg arg;
731
732         memset(&rep.th, 0, sizeof(struct tcphdr));
733         memset(&arg, 0, sizeof(arg));
734
735         arg.iov[0].iov_base = (unsigned char *)&rep;
736         arg.iov[0].iov_len  = sizeof(rep.th);
737         if (tsecr) {
738                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
739                                    (TCPOPT_TIMESTAMP << 8) |
740                                    TCPOLEN_TIMESTAMP);
741                 rep.opt[1] = htonl(tsval);
742                 rep.opt[2] = htonl(tsecr);
743                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
744         }
745
746         /* Swap the send and the receive. */
747         rep.th.dest    = th->source;
748         rep.th.source  = th->dest;
749         rep.th.doff    = arg.iov[0].iov_len / 4;
750         rep.th.seq     = htonl(seq);
751         rep.th.ack_seq = htonl(ack);
752         rep.th.ack     = 1;
753         rep.th.window  = htons(win);
754
755 #ifdef CONFIG_TCP_MD5SIG
756         if (key) {
757                 int offset = (tsecr) ? 3 : 0;
758
759                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
760                                           (TCPOPT_NOP << 16) |
761                                           (TCPOPT_MD5SIG << 8) |
762                                           TCPOLEN_MD5SIG);
763                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
764                 rep.th.doff = arg.iov[0].iov_len/4;
765
766                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
767                                     key, ip_hdr(skb)->saddr,
768                                     ip_hdr(skb)->daddr, &rep.th);
769         }
770 #endif
771         arg.flags = reply_flags;
772         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
773                                       ip_hdr(skb)->saddr, /* XXX */
774                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
775         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
776         if (oif)
777                 arg.bound_dev_if = oif;
778         arg.tos = tos;
779         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
780                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
781                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
782                               &arg, arg.iov[0].iov_len);
783
784         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
785 }
786
787 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
788 {
789         struct inet_timewait_sock *tw = inet_twsk(sk);
790         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
791
792         tcp_v4_send_ack(sock_net(sk), skb,
793                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
794                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
795                         tcp_time_stamp + tcptw->tw_ts_offset,
796                         tcptw->tw_ts_recent,
797                         tw->tw_bound_dev_if,
798                         tcp_twsk_md5_key(tcptw),
799                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
800                         tw->tw_tos
801                         );
802
803         inet_twsk_put(tw);
804 }
805
806 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
807                                   struct request_sock *req)
808 {
809         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
810          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
811          */
812         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
813                                              tcp_sk(sk)->snd_nxt;
814
815         tcp_v4_send_ack(sock_net(sk), skb, seq,
816                         tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
817                         tcp_time_stamp,
818                         req->ts_recent,
819                         0,
820                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
821                                           AF_INET),
822                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
823                         ip_hdr(skb)->tos);
824 }
825
826 /*
827  *      Send a SYN-ACK after having received a SYN.
828  *      This still operates on a request_sock only, not on a big
829  *      socket.
830  */
831 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
832                               struct flowi *fl,
833                               struct request_sock *req,
834                               struct tcp_fastopen_cookie *foc,
835                                   bool attach_req)
836 {
837         const struct inet_request_sock *ireq = inet_rsk(req);
838         struct flowi4 fl4;
839         int err = -1;
840         struct sk_buff *skb;
841
842         /* First, grab a route. */
843         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
844                 return -1;
845
846         skb = tcp_make_synack(sk, dst, req, foc, attach_req);
847
848         if (skb) {
849                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
850
851                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
852                                             ireq->ir_rmt_addr,
853                                             ireq->opt);
854                 err = net_xmit_eval(err);
855         }
856
857         return err;
858 }
859
860 /*
861  *      IPv4 request_sock destructor.
862  */
863 static void tcp_v4_reqsk_destructor(struct request_sock *req)
864 {
865         kfree(inet_rsk(req)->opt);
866 }
867
868
869 #ifdef CONFIG_TCP_MD5SIG
870 /*
871  * RFC2385 MD5 checksumming requires a mapping of
872  * IP address->MD5 Key.
873  * We need to maintain these in the sk structure.
874  */
875
876 /* Find the Key structure for an address.  */
877 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
878                                          const union tcp_md5_addr *addr,
879                                          int family)
880 {
881         const struct tcp_sock *tp = tcp_sk(sk);
882         struct tcp_md5sig_key *key;
883         unsigned int size = sizeof(struct in_addr);
884         const struct tcp_md5sig_info *md5sig;
885
886         /* caller either holds rcu_read_lock() or socket lock */
887         md5sig = rcu_dereference_check(tp->md5sig_info,
888                                        sock_owned_by_user(sk) ||
889                                        lockdep_is_held((spinlock_t *)&sk->sk_lock.slock));
890         if (!md5sig)
891                 return NULL;
892 #if IS_ENABLED(CONFIG_IPV6)
893         if (family == AF_INET6)
894                 size = sizeof(struct in6_addr);
895 #endif
896         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
897                 if (key->family != family)
898                         continue;
899                 if (!memcmp(&key->addr, addr, size))
900                         return key;
901         }
902         return NULL;
903 }
904 EXPORT_SYMBOL(tcp_md5_do_lookup);
905
906 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
907                                          const struct sock *addr_sk)
908 {
909         const union tcp_md5_addr *addr;
910
911         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
912         return tcp_md5_do_lookup(sk, addr, AF_INET);
913 }
914 EXPORT_SYMBOL(tcp_v4_md5_lookup);
915
916 /* This can be called on a newly created socket, from other files */
917 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
918                    int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
919 {
920         /* Add Key to the list */
921         struct tcp_md5sig_key *key;
922         struct tcp_sock *tp = tcp_sk(sk);
923         struct tcp_md5sig_info *md5sig;
924
925         key = tcp_md5_do_lookup(sk, addr, family);
926         if (key) {
927                 /* Pre-existing entry - just update that one. */
928                 memcpy(key->key, newkey, newkeylen);
929                 key->keylen = newkeylen;
930                 return 0;
931         }
932
933         md5sig = rcu_dereference_protected(tp->md5sig_info,
934                                            sock_owned_by_user(sk) ||
935                                            lockdep_is_held(&sk->sk_lock.slock));
936         if (!md5sig) {
937                 md5sig = kmalloc(sizeof(*md5sig), gfp);
938                 if (!md5sig)
939                         return -ENOMEM;
940
941                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
942                 INIT_HLIST_HEAD(&md5sig->head);
943                 rcu_assign_pointer(tp->md5sig_info, md5sig);
944         }
945
946         key = sock_kmalloc(sk, sizeof(*key), gfp);
947         if (!key)
948                 return -ENOMEM;
949         if (!tcp_alloc_md5sig_pool()) {
950                 sock_kfree_s(sk, key, sizeof(*key));
951                 return -ENOMEM;
952         }
953
954         memcpy(key->key, newkey, newkeylen);
955         key->keylen = newkeylen;
956         key->family = family;
957         memcpy(&key->addr, addr,
958                (family == AF_INET6) ? sizeof(struct in6_addr) :
959                                       sizeof(struct in_addr));
960         hlist_add_head_rcu(&key->node, &md5sig->head);
961         return 0;
962 }
963 EXPORT_SYMBOL(tcp_md5_do_add);
964
965 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
966 {
967         struct tcp_md5sig_key *key;
968
969         key = tcp_md5_do_lookup(sk, addr, family);
970         if (!key)
971                 return -ENOENT;
972         hlist_del_rcu(&key->node);
973         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
974         kfree_rcu(key, rcu);
975         return 0;
976 }
977 EXPORT_SYMBOL(tcp_md5_do_del);
978
979 static void tcp_clear_md5_list(struct sock *sk)
980 {
981         struct tcp_sock *tp = tcp_sk(sk);
982         struct tcp_md5sig_key *key;
983         struct hlist_node *n;
984         struct tcp_md5sig_info *md5sig;
985
986         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
987
988         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
989                 hlist_del_rcu(&key->node);
990                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
991                 kfree_rcu(key, rcu);
992         }
993 }
994
995 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
996                                  int optlen)
997 {
998         struct tcp_md5sig cmd;
999         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1000
1001         if (optlen < sizeof(cmd))
1002                 return -EINVAL;
1003
1004         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1005                 return -EFAULT;
1006
1007         if (sin->sin_family != AF_INET)
1008                 return -EINVAL;
1009
1010         if (!cmd.tcpm_keylen)
1011                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1012                                       AF_INET);
1013
1014         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1015                 return -EINVAL;
1016
1017         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1018                               AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1019                               GFP_KERNEL);
1020 }
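/*
 * Illustrative only, not part of the original file: a minimal userspace
 * sketch of configuring an MD5 key, which reaches tcp_v4_parse_md5_keys()
 * above through setsockopt(TCP_MD5SIG). The peer address and key are
 * placeholders.
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */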
1021
1022 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1023                                         __be32 daddr, __be32 saddr, int nbytes)
1024 {
1025         struct tcp4_pseudohdr *bp;
1026         struct scatterlist sg;
1027
1028         bp = &hp->md5_blk.ip4;
1029
1030         /*
1031          * 1. the TCP pseudo-header (in the order: source IP address,
1032          * destination IP address, zero-padded protocol number, and
1033          * segment length)
1034          */
1035         bp->saddr = saddr;
1036         bp->daddr = daddr;
1037         bp->pad = 0;
1038         bp->protocol = IPPROTO_TCP;
1039         bp->len = cpu_to_be16(nbytes);
1040
1041         sg_init_one(&sg, bp, sizeof(*bp));
1042         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1043 }
1044
1045 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1046                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1047 {
1048         struct tcp_md5sig_pool *hp;
1049         struct hash_desc *desc;
1050
1051         hp = tcp_get_md5sig_pool();
1052         if (!hp)
1053                 goto clear_hash_noput;
1054         desc = &hp->md5_desc;
1055
1056         if (crypto_hash_init(desc))
1057                 goto clear_hash;
1058         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1059                 goto clear_hash;
1060         if (tcp_md5_hash_header(hp, th))
1061                 goto clear_hash;
1062         if (tcp_md5_hash_key(hp, key))
1063                 goto clear_hash;
1064         if (crypto_hash_final(desc, md5_hash))
1065                 goto clear_hash;
1066
1067         tcp_put_md5sig_pool();
1068         return 0;
1069
1070 clear_hash:
1071         tcp_put_md5sig_pool();
1072 clear_hash_noput:
1073         memset(md5_hash, 0, 16);
1074         return 1;
1075 }
1076
1077 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1078                         const struct sock *sk,
1079                         const struct sk_buff *skb)
1080 {
1081         struct tcp_md5sig_pool *hp;
1082         struct hash_desc *desc;
1083         const struct tcphdr *th = tcp_hdr(skb);
1084         __be32 saddr, daddr;
1085
1086         if (sk) { /* valid for establish/request sockets */
1087                 saddr = sk->sk_rcv_saddr;
1088                 daddr = sk->sk_daddr;
1089         } else {
1090                 const struct iphdr *iph = ip_hdr(skb);
1091                 saddr = iph->saddr;
1092                 daddr = iph->daddr;
1093         }
1094
1095         hp = tcp_get_md5sig_pool();
1096         if (!hp)
1097                 goto clear_hash_noput;
1098         desc = &hp->md5_desc;
1099
1100         if (crypto_hash_init(desc))
1101                 goto clear_hash;
1102
1103         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1104                 goto clear_hash;
1105         if (tcp_md5_hash_header(hp, th))
1106                 goto clear_hash;
1107         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1108                 goto clear_hash;
1109         if (tcp_md5_hash_key(hp, key))
1110                 goto clear_hash;
1111         if (crypto_hash_final(desc, md5_hash))
1112                 goto clear_hash;
1113
1114         tcp_put_md5sig_pool();
1115         return 0;
1116
1117 clear_hash:
1118         tcp_put_md5sig_pool();
1119 clear_hash_noput:
1120         memset(md5_hash, 0, 16);
1121         return 1;
1122 }
1123 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1124
1125 #endif
1126
1127 /* Called with rcu_read_lock() */
1128 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1129                                     const struct sk_buff *skb)
1130 {
1131 #ifdef CONFIG_TCP_MD5SIG
1132         /*
1133          * This gets called for each TCP segment that arrives
1134          * so we want to be efficient.
1135          * We have 3 drop cases:
1136          * o No MD5 hash and one expected.
1137          * o MD5 hash and we're not expecting one.
1138          * o MD5 hash and it's wrong.
1139          */
1140         const __u8 *hash_location = NULL;
1141         struct tcp_md5sig_key *hash_expected;
1142         const struct iphdr *iph = ip_hdr(skb);
1143         const struct tcphdr *th = tcp_hdr(skb);
1144         int genhash;
1145         unsigned char newhash[16];
1146
1147         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1148                                           AF_INET);
1149         hash_location = tcp_parse_md5sig_option(th);
1150
1151         /* We've parsed the options - do we have a hash? */
1152         if (!hash_expected && !hash_location)
1153                 return false;
1154
1155         if (hash_expected && !hash_location) {
1156                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1157                 return true;
1158         }
1159
1160         if (!hash_expected && hash_location) {
1161                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1162                 return true;
1163         }
1164
1165         /* Okay, so this is hash_expected and hash_location -
1166          * so we need to calculate the checksum.
1167          */
1168         genhash = tcp_v4_md5_hash_skb(newhash,
1169                                       hash_expected,
1170                                       NULL, skb);
1171
1172         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1173                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1174                                      &iph->saddr, ntohs(th->source),
1175                                      &iph->daddr, ntohs(th->dest),
1176                                      genhash ? " tcp_v4_calc_md5_hash failed"
1177                                      : "");
1178                 return true;
1179         }
1180         return false;
1181 #endif
1182         return false;
1183 }
1184
1185 static void tcp_v4_init_req(struct request_sock *req,
1186                             const struct sock *sk_listener,
1187                             struct sk_buff *skb)
1188 {
1189         struct inet_request_sock *ireq = inet_rsk(req);
1190
1191         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1192         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1193         ireq->no_srccheck = inet_sk(sk_listener)->transparent;
1194         ireq->opt = tcp_v4_save_options(skb);
1195 }
1196
1197 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1198                                           struct flowi *fl,
1199                                           const struct request_sock *req,
1200                                           bool *strict)
1201 {
1202         struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1203
1204         if (strict) {
1205                 if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1206                         *strict = true;
1207                 else
1208                         *strict = false;
1209         }
1210
1211         return dst;
1212 }
1213
1214 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1215         .family         =       PF_INET,
1216         .obj_size       =       sizeof(struct tcp_request_sock),
1217         .rtx_syn_ack    =       tcp_rtx_synack,
1218         .send_ack       =       tcp_v4_reqsk_send_ack,
1219         .destructor     =       tcp_v4_reqsk_destructor,
1220         .send_reset     =       tcp_v4_send_reset,
1221         .syn_ack_timeout =      tcp_syn_ack_timeout,
1222 };
1223
1224 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1225         .mss_clamp      =       TCP_MSS_DEFAULT,
1226 #ifdef CONFIG_TCP_MD5SIG
1227         .req_md5_lookup =       tcp_v4_md5_lookup,
1228         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1229 #endif
1230         .init_req       =       tcp_v4_init_req,
1231 #ifdef CONFIG_SYN_COOKIES
1232         .cookie_init_seq =      cookie_v4_init_sequence,
1233 #endif
1234         .route_req      =       tcp_v4_route_req,
1235         .init_seq       =       tcp_v4_init_sequence,
1236         .send_synack    =       tcp_v4_send_synack,
1237 };
1238
1239 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1240 {
1241         /* Never answer SYNs sent to broadcast or multicast */
1242         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1243                 goto drop;
1244
1245         return tcp_conn_request(&tcp_request_sock_ops,
1246                                 &tcp_request_sock_ipv4_ops, sk, skb);
1247
1248 drop:
1249         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1250         return 0;
1251 }
1252 EXPORT_SYMBOL(tcp_v4_conn_request);
1253
1254
1255 /*
1256  * The three way handshake has completed - we got a valid synack -
1257  * now create the new socket.
1258  */
1259 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1260                                   struct request_sock *req,
1261                                   struct dst_entry *dst,
1262                                   struct request_sock *req_unhash,
1263                                   bool *own_req)
1264 {
1265         struct inet_request_sock *ireq;
1266         struct inet_sock *newinet;
1267         struct tcp_sock *newtp;
1268         struct sock *newsk;
1269 #ifdef CONFIG_TCP_MD5SIG
1270         struct tcp_md5sig_key *key;
1271 #endif
1272         struct ip_options_rcu *inet_opt;
1273
1274         if (sk_acceptq_is_full(sk))
1275                 goto exit_overflow;
1276
1277         newsk = tcp_create_openreq_child(sk, req, skb);
1278         if (!newsk)
1279                 goto exit_nonewsk;
1280
1281         newsk->sk_gso_type = SKB_GSO_TCPV4;
1282         inet_sk_rx_dst_set(newsk, skb);
1283
1284         newtp                 = tcp_sk(newsk);
1285         newinet               = inet_sk(newsk);
1286         ireq                  = inet_rsk(req);
1287         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1288         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1289         newsk->sk_bound_dev_if = ireq->ir_iif;
1290         newinet->inet_saddr           = ireq->ir_loc_addr;
1291         inet_opt              = ireq->opt;
1292         rcu_assign_pointer(newinet->inet_opt, inet_opt);
1293         ireq->opt             = NULL;
1294         newinet->mc_index     = inet_iif(skb);
1295         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1296         newinet->rcv_tos      = ip_hdr(skb)->tos;
1297         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1298         if (inet_opt)
1299                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1300         newinet->inet_id = newtp->write_seq ^ jiffies;
1301
1302         if (!dst) {
1303                 dst = inet_csk_route_child_sock(sk, newsk, req);
1304                 if (!dst)
1305                         goto put_and_exit;
1306         } else {
1307                 /* syncookie case : see end of cookie_v4_check() */
1308         }
1309         sk_setup_caps(newsk, dst);
1310
1311         tcp_ca_openreq_child(newsk, dst);
1312
1313         tcp_sync_mss(newsk, dst_mtu(dst));
1314         newtp->advmss = dst_metric_advmss(dst);
1315         if (tcp_sk(sk)->rx_opt.user_mss &&
1316             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1317                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1318
1319         tcp_initialize_rcv_mss(newsk);
1320
1321 #ifdef CONFIG_TCP_MD5SIG
1322         /* Copy over the MD5 key from the original socket */
1323         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1324                                 AF_INET);
1325         if (key) {
1326                 /*
1327                  * We're using one, so create a matching key
1328                  * on the newsk structure. If we fail to get
1329                  * memory, then we end up not copying the key
1330                  * across. Shucks.
1331                  */
1332                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1333                                AF_INET, key->key, key->keylen, GFP_ATOMIC);
1334                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1335         }
1336 #endif
1337
1338         if (__inet_inherit_port(sk, newsk) < 0)
1339                 goto put_and_exit;
1340         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1341         if (*own_req)
1342                 tcp_move_syn(newtp, req);
1343
1344         return newsk;
1345
1346 exit_overflow:
1347         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1348 exit_nonewsk:
1349         dst_release(dst);
1350 exit:
1351         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1352         return NULL;
1353 put_and_exit:
1354         inet_csk_prepare_forced_close(newsk);
1355         tcp_done(newsk);
1356         goto exit;
1357 }
1358 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1359
1360 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1361 {
1362 #ifdef CONFIG_SYN_COOKIES
1363         const struct tcphdr *th = tcp_hdr(skb);
1364
1365         if (!th->syn)
1366                 sk = cookie_v4_check(sk, skb);
1367 #endif
1368         return sk;
1369 }
1370
1371 /* The socket must have its spinlock held when we get
1372  * here, unless it is a TCP_LISTEN socket.
1373  *
1374  * We have a potential double-lock case here, so even when
1375  * doing backlog processing we use the BH locking scheme.
1376  * This is because we cannot sleep with the original spinlock
1377  * held.
1378  */
1379 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1380 {
1381         struct sock *rsk;
1382
1383         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1384                 struct dst_entry *dst = sk->sk_rx_dst;
1385
1386                 sock_rps_save_rxhash(sk, skb);
1387                 sk_mark_napi_id(sk, skb);
1388                 if (dst) {
1389                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1390                             !dst->ops->check(dst, 0)) {
1391                                 dst_release(dst);
1392                                 sk->sk_rx_dst = NULL;
1393                         }
1394                 }
1395                 tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1396                 return 0;
1397         }
1398
1399         if (tcp_checksum_complete(skb))
1400                 goto csum_err;
1401
1402         if (sk->sk_state == TCP_LISTEN) {
1403                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1404
1405                 if (!nsk)
1406                         goto discard;
1407                 if (nsk != sk) {
1408                         sock_rps_save_rxhash(nsk, skb);
1409                         sk_mark_napi_id(nsk, skb);
1410                         if (tcp_child_process(sk, nsk, skb)) {
1411                                 rsk = nsk;
1412                                 goto reset;
1413                         }
1414                         return 0;
1415                 }
1416         } else
1417                 sock_rps_save_rxhash(sk, skb);
1418
1419         if (tcp_rcv_state_process(sk, skb)) {
1420                 rsk = sk;
1421                 goto reset;
1422         }
1423         return 0;
1424
1425 reset:
1426         tcp_v4_send_reset(rsk, skb);
1427 discard:
1428         kfree_skb(skb);
1429         /* Be careful here. If this function gets more complicated and
1430          * gcc suffers from register pressure on the x86, sk (in %ebx)
1431          * might be destroyed here. This current version compiles correctly,
1432          * but you have been warned.
1433          */
1434         return 0;
1435
1436 csum_err:
1437         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
1438         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1439         goto discard;
1440 }
1441 EXPORT_SYMBOL(tcp_v4_do_rcv);
1442
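/* Early demux: called from the IP receive path for PACKET_HOST segments
 * with a complete TCP header.  If an established socket is found, it is
 * attached to the skb (with sock_edemux as destructor) so the later full
 * lookup in tcp_v4_rcv() can be skipped, and the socket's cached rx dst
 * is reused when it still matches the incoming interface.
 */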
1443 void tcp_v4_early_demux(struct sk_buff *skb)
1444 {
1445         const struct iphdr *iph;
1446         const struct tcphdr *th;
1447         struct sock *sk;
1448
1449         if (skb->pkt_type != PACKET_HOST)
1450                 return;
1451
1452         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1453                 return;
1454
1455         iph = ip_hdr(skb);
1456         th = tcp_hdr(skb);
1457
1458         if (th->doff < sizeof(struct tcphdr) / 4)
1459                 return;
1460
1461         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1462                                        iph->saddr, th->source,
1463                                        iph->daddr, ntohs(th->dest),
1464                                        skb->skb_iif);
1465         if (sk) {
1466                 skb->sk = sk;
1467                 skb->destructor = sock_edemux;
1468                 if (sk_fullsock(sk)) {
1469                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1470
1471                         if (dst)
1472                                 dst = dst_check(dst, 0);
1473                         if (dst &&
1474                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1475                                 skb_dst_set_noref(skb, dst);
1476                 }
1477         }
1478 }
1479
1480 /* Packet is added to VJ-style prequeue for processing in process
1481  * context, if a reader task is waiting. Apparently, this exciting
1482  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1483  * failed somewhere. Latency? Burstiness? Well, at least now we will
1484  * see why it failed. 8)8)                                --ANK
1485  *
1486  */
1487 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1488 {
1489         struct tcp_sock *tp = tcp_sk(sk);
1490
1491         if (sysctl_tcp_low_latency || !tp->ucopy.task)
1492                 return false;
1493
1494         if (skb->len <= tcp_hdrlen(skb) &&
1495             skb_queue_len(&tp->ucopy.prequeue) == 0)
1496                 return false;
1497
1498         /* Before escaping RCU protected region, we need to take care of skb
1499          * dst. Prequeue is only enabled for established sockets.
1500          * For such sockets, we might need the skb dst only to set sk->sk_rx_dst.
1501          * Instead of doing a full sk_rx_dst validity check here, let's perform
1502          * an optimistic check.
1503          */
1504         if (likely(sk->sk_rx_dst))
1505                 skb_dst_drop(skb);
1506         else
1507                 skb_dst_force_safe(skb);
1508
1509         __skb_queue_tail(&tp->ucopy.prequeue, skb);
1510         tp->ucopy.memory += skb->truesize;
1511         if (tp->ucopy.memory > sk->sk_rcvbuf) {
1512                 struct sk_buff *skb1;
1513
1514                 BUG_ON(sock_owned_by_user(sk));
1515
1516                 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1517                         sk_backlog_rcv(sk, skb1);
1518                         NET_INC_STATS_BH(sock_net(sk),
1519                                          LINUX_MIB_TCPPREQUEUEDROPPED);
1520                 }
1521
1522                 tp->ucopy.memory = 0;
1523         } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1524                 wake_up_interruptible_sync_poll(sk_sleep(sk),
1525                                            POLLIN | POLLRDNORM | POLLRDBAND);
1526                 if (!inet_csk_ack_scheduled(sk))
1527                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1528                                                   (3 * tcp_rto_min(sk)) / 4,
1529                                                   TCP_RTO_MAX);
1530         }
1531         return true;
1532 }
1533 EXPORT_SYMBOL(tcp_prequeue);
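
/* Prequeueing is a policy choice: when the corresponding sysctl is set,
 * sysctl_tcp_low_latency above is non-zero and segments are processed
 * directly in softirq context instead, e.g.:
 *
 *     # sysctl -w net.ipv4.tcp_low_latency=1
 */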
1534
1535 /*
1536  *      From tcp_input.c
1537  */
1538
1539 int tcp_v4_rcv(struct sk_buff *skb)
1540 {
1541         const struct iphdr *iph;
1542         const struct tcphdr *th;
1543         struct sock *sk;
1544         int ret;
1545         struct net *net = dev_net(skb->dev);
1546
1547         if (skb->pkt_type != PACKET_HOST)
1548                 goto discard_it;
1549
1550         /* Count it even if it's bad */
1551         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1552
1553         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1554                 goto discard_it;
1555
1556         th = tcp_hdr(skb);
1557
1558         if (th->doff < sizeof(struct tcphdr) / 4)
1559                 goto bad_packet;
1560         if (!pskb_may_pull(skb, th->doff * 4))
1561                 goto discard_it;
1562
1563         /* An explanation is required here, I think.
1564          * Packet length and doff are validated by header prediction,
1565          * provided the case of th->doff==0 is eliminated.
1566          * So, we defer the checks. */
1567
1568         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1569                 goto csum_error;
1570
1571         th = tcp_hdr(skb);
1572         iph = ip_hdr(skb);
1573         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1574          * barrier() makes sure the compiler won't play fool^Waliasing games.
1575          */
1576         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1577                 sizeof(struct inet_skb_parm));
1578         barrier();
1579
1580         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1581         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1582                                     skb->len - th->doff * 4);
1583         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1584         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1585         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1586         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1587         TCP_SKB_CB(skb)->sacked  = 0;
1588
1589 lookup:
1590         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1591         if (!sk)
1592                 goto no_tcp_socket;
1593
1594 process:
1595         if (sk->sk_state == TCP_TIME_WAIT)
1596                 goto do_time_wait;
1597
1598         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1599                 struct request_sock *req = inet_reqsk(sk);
1600                 struct sock *nsk = NULL;
1601
1602                 sk = req->rsk_listener;
1603                 if (tcp_v4_inbound_md5_hash(sk, skb))
1604                         goto discard_and_relse;
1605                 if (likely(sk->sk_state == TCP_LISTEN)) {
1606                         nsk = tcp_check_req(sk, skb, req, false);
1607                 } else {
1608                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1609                         goto lookup;
1610                 }
1611                 if (!nsk) {
1612                         reqsk_put(req);
1613                         goto discard_it;
1614                 }
1615                 if (nsk == sk) {
1616                         sock_hold(sk);
1617                         reqsk_put(req);
1618                 } else if (tcp_child_process(sk, nsk, skb)) {
1619                         tcp_v4_send_reset(nsk, skb);
1620                         goto discard_it;
1621                 } else {
1622                         return 0;
1623                 }
1624         }
1625         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1626                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1627                 goto discard_and_relse;
1628         }
1629
1630         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1631                 goto discard_and_relse;
1632
1633         if (tcp_v4_inbound_md5_hash(sk, skb))
1634                 goto discard_and_relse;
1635
1636         nf_reset(skb);
1637
1638         if (sk_filter(sk, skb))
1639                 goto discard_and_relse;
1640
1641         skb->dev = NULL;
1642
1643         if (sk->sk_state == TCP_LISTEN) {
1644                 ret = tcp_v4_do_rcv(sk, skb);
1645                 goto put_and_return;
1646         }
1647
1648         sk_incoming_cpu_update(sk);
1649
1650         bh_lock_sock_nested(sk);
1651         tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
1652         ret = 0;
1653         if (!sock_owned_by_user(sk)) {
1654                 if (!tcp_prequeue(sk, skb))
1655                         ret = tcp_v4_do_rcv(sk, skb);
1656         } else if (unlikely(sk_add_backlog(sk, skb,
1657                                            sk->sk_rcvbuf + sk->sk_sndbuf))) {
1658                 bh_unlock_sock(sk);
1659                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1660                 goto discard_and_relse;
1661         }
1662         bh_unlock_sock(sk);
1663
1664 put_and_return:
1665         sock_put(sk);
1666
1667         return ret;
1668
1669 no_tcp_socket:
1670         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1671                 goto discard_it;
1672
1673         if (tcp_checksum_complete(skb)) {
1674 csum_error:
1675                 TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
1676 bad_packet:
1677                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1678         } else {
1679                 tcp_v4_send_reset(NULL, skb);
1680         }
1681
1682 discard_it:
1683         /* Discard frame. */
1684         kfree_skb(skb);
1685         return 0;
1686
1687 discard_and_relse:
1688         sock_put(sk);
1689         goto discard_it;
1690
1691 do_time_wait:
1692         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1693                 inet_twsk_put(inet_twsk(sk));
1694                 goto discard_it;
1695         }
1696
1697         if (tcp_checksum_complete(skb)) {
1698                 inet_twsk_put(inet_twsk(sk));
1699                 goto csum_error;
1700         }
1701         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1702         case TCP_TW_SYN: {
1703                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1704                                                         &tcp_hashinfo,
1705                                                         iph->saddr, th->source,
1706                                                         iph->daddr, th->dest,
1707                                                         inet_iif(skb));
1708                 if (sk2) {
1709                         inet_twsk_deschedule_put(inet_twsk(sk));
1710                         sk = sk2;
1711                         goto process;
1712                 }
1713                 /* Fall through to ACK */
1714         }
1715         case TCP_TW_ACK:
1716                 tcp_v4_timewait_ack(sk, skb);
1717                 break;
1718         case TCP_TW_RST:
1719                 tcp_v4_send_reset(sk, skb);
1720                 inet_twsk_deschedule_put(inet_twsk(sk));
1721                 goto discard_it;
1722         case TCP_TW_SUCCESS:;
1723         }
1724         goto discard_it;
1725 }
1726
1727 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1728         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1729         .twsk_unique    = tcp_twsk_unique,
1730         .twsk_destructor = tcp_twsk_destructor,
1731 };
1732
1733 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1734 {
1735         struct dst_entry *dst = skb_dst(skb);
1736
1737         if (dst && dst_hold_safe(dst)) {
1738                 sk->sk_rx_dst = dst;
1739                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1740         }
1741 }
1742 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1743
1744 const struct inet_connection_sock_af_ops ipv4_specific = {
1745         .queue_xmit        = ip_queue_xmit,
1746         .send_check        = tcp_v4_send_check,
1747         .rebuild_header    = inet_sk_rebuild_header,
1748         .sk_rx_dst_set     = inet_sk_rx_dst_set,
1749         .conn_request      = tcp_v4_conn_request,
1750         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1751         .net_header_len    = sizeof(struct iphdr),
1752         .setsockopt        = ip_setsockopt,
1753         .getsockopt        = ip_getsockopt,
1754         .addr2sockaddr     = inet_csk_addr2sockaddr,
1755         .sockaddr_len      = sizeof(struct sockaddr_in),
1756         .bind_conflict     = inet_csk_bind_conflict,
1757 #ifdef CONFIG_COMPAT
1758         .compat_setsockopt = compat_ip_setsockopt,
1759         .compat_getsockopt = compat_ip_getsockopt,
1760 #endif
1761         .mtu_reduced       = tcp_v4_mtu_reduced,
1762 };
1763 EXPORT_SYMBOL(ipv4_specific);
1764
1765 #ifdef CONFIG_TCP_MD5SIG
1766 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1767         .md5_lookup             = tcp_v4_md5_lookup,
1768         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1769         .md5_parse              = tcp_v4_parse_md5_keys,
1770 };
1771 #endif
1772
1773 /* NOTE: A lot of things are set to zero explicitly by the call to
1774  *       sk_alloc(), so they need not be done here.
1775  */
1776 static int tcp_v4_init_sock(struct sock *sk)
1777 {
1778         struct inet_connection_sock *icsk = inet_csk(sk);
1779
1780         tcp_init_sock(sk);
1781
1782         icsk->icsk_af_ops = &ipv4_specific;
1783
1784 #ifdef CONFIG_TCP_MD5SIG
1785         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1786 #endif
1787
1788         return 0;
1789 }
1790
1791 void tcp_v4_destroy_sock(struct sock *sk)
1792 {
1793         struct tcp_sock *tp = tcp_sk(sk);
1794
1795         tcp_clear_xmit_timers(sk);
1796
1797         tcp_cleanup_congestion_control(sk);
1798
1799         /* Clean up the write buffer. */
1800         tcp_write_queue_purge(sk);
1801
1802         /* Clean up our, hopefully empty, out_of_order_queue. */
1803         __skb_queue_purge(&tp->out_of_order_queue);
1804
1805 #ifdef CONFIG_TCP_MD5SIG
1806         /* Clean up the MD5 key list, if any */
1807         if (tp->md5sig_info) {
1808                 tcp_clear_md5_list(sk);
1809                 kfree_rcu(tp->md5sig_info, rcu);
1810                 tp->md5sig_info = NULL;
1811         }
1812 #endif
1813
1814         /* Clean up the prequeue; it really should be empty. */
1815         __skb_queue_purge(&tp->ucopy.prequeue);
1816
1817         /* Clean up a referenced TCP bind bucket. */
1818         if (inet_csk(sk)->icsk_bind_hash)
1819                 inet_put_port(sk);
1820
1821         BUG_ON(tp->fastopen_rsk);
1822
1823         /* If socket is aborted during connect operation */
1824         tcp_free_fastopen_req(tp);
1825         tcp_saved_syn_free(tp);
1826
1827         sk_sockets_allocated_dec(sk);
1828
1829         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
1830                 sock_release_memcg(sk);
1831 }
1832 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1833
1834 #ifdef CONFIG_PROC_FS
1835 /* Proc filesystem TCP sock list dumping. */
1836
1837 /*
1838  * Get the next listener socket following cur.  If cur is NULL, get the first socket
1839  * starting from bucket given in st->bucket; when st->bucket is zero the
1840  * very first socket in the hash table is returned.
1841  */
1842 static void *listening_get_next(struct seq_file *seq, void *cur)
1843 {
1845         struct hlist_nulls_node *node;
1846         struct sock *sk = cur;
1847         struct inet_listen_hashbucket *ilb;
1848         struct tcp_iter_state *st = seq->private;
1849         struct net *net = seq_file_net(seq);
1850
1851         if (!sk) {
1852                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1853                 spin_lock_bh(&ilb->lock);
1854                 sk = sk_nulls_head(&ilb->head);
1855                 st->offset = 0;
1856                 goto get_sk;
1857         }
1858         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1859         ++st->num;
1860         ++st->offset;
1861
1862         sk = sk_nulls_next(sk);
1863 get_sk:
1864         sk_nulls_for_each_from(sk, node) {
1865                 if (!net_eq(sock_net(sk), net))
1866                         continue;
1867                 if (sk->sk_family == st->family) {
1868                         cur = sk;
1869                         goto out;
1870                 }
1872         }
1873         spin_unlock_bh(&ilb->lock);
1874         st->offset = 0;
1875         if (++st->bucket < INET_LHTABLE_SIZE) {
1876                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1877                 spin_lock_bh(&ilb->lock);
1878                 sk = sk_nulls_head(&ilb->head);
1879                 goto get_sk;
1880         }
1881         cur = NULL;
1882 out:
1883         return cur;
1884 }
1885
1886 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1887 {
1888         struct tcp_iter_state *st = seq->private;
1889         void *rc;
1890
1891         st->bucket = 0;
1892         st->offset = 0;
1893         rc = listening_get_next(seq, NULL);
1894
1895         while (rc && *pos) {
1896                 rc = listening_get_next(seq, rc);
1897                 --*pos;
1898         }
1899         return rc;
1900 }
1901
1902 static inline bool empty_bucket(const struct tcp_iter_state *st)
1903 {
1904         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1905 }
1906
1907 /*
1908  * Get first established socket starting from bucket given in st->bucket.
1909  * If st->bucket is zero, the very first socket in the hash is returned.
1910  */
1911 static void *established_get_first(struct seq_file *seq)
1912 {
1913         struct tcp_iter_state *st = seq->private;
1914         struct net *net = seq_file_net(seq);
1915         void *rc = NULL;
1916
1917         st->offset = 0;
1918         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1919                 struct sock *sk;
1920                 struct hlist_nulls_node *node;
1921                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1922
1923                 /* Lockless fast path for the common case of empty buckets */
1924                 if (empty_bucket(st))
1925                         continue;
1926
1927                 spin_lock_bh(lock);
1928                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1929                         if (sk->sk_family != st->family ||
1930                             !net_eq(sock_net(sk), net)) {
1931                                 continue;
1932                         }
1933                         rc = sk;
1934                         goto out;
1935                 }
1936                 spin_unlock_bh(lock);
1937         }
1938 out:
1939         return rc;
1940 }
1941
1942 static void *established_get_next(struct seq_file *seq, void *cur)
1943 {
1944         struct sock *sk = cur;
1945         struct hlist_nulls_node *node;
1946         struct tcp_iter_state *st = seq->private;
1947         struct net *net = seq_file_net(seq);
1948
1949         ++st->num;
1950         ++st->offset;
1951
1952         sk = sk_nulls_next(sk);
1953
1954         sk_nulls_for_each_from(sk, node) {
1955                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1956                         return sk;
1957         }
1958
1959         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1960         ++st->bucket;
1961         return established_get_first(seq);
1962 }
1963
1964 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1965 {
1966         struct tcp_iter_state *st = seq->private;
1967         void *rc;
1968
1969         st->bucket = 0;
1970         rc = established_get_first(seq);
1971
1972         while (rc && pos) {
1973                 rc = established_get_next(seq, rc);
1974                 --pos;
1975         }
1976         return rc;
1977 }
1978
1979 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1980 {
1981         void *rc;
1982         struct tcp_iter_state *st = seq->private;
1983
1984         st->state = TCP_SEQ_STATE_LISTENING;
1985         rc        = listening_get_idx(seq, &pos);
1986
1987         if (!rc) {
1988                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1989                 rc        = established_get_idx(seq, pos);
1990         }
1991
1992         return rc;
1993 }
1994
1995 static void *tcp_seek_last_pos(struct seq_file *seq)
1996 {
1997         struct tcp_iter_state *st = seq->private;
1998         int offset = st->offset;
1999         int orig_num = st->num;
2000         void *rc = NULL;
2001
2002         switch (st->state) {
2003         case TCP_SEQ_STATE_LISTENING:
2004                 if (st->bucket >= INET_LHTABLE_SIZE)
2005                         break;
2006                 st->state = TCP_SEQ_STATE_LISTENING;
2007                 rc = listening_get_next(seq, NULL);
2008                 while (offset-- && rc)
2009                         rc = listening_get_next(seq, rc);
2010                 if (rc)
2011                         break;
2012                 st->bucket = 0;
2013                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2014                 /* Fallthrough */
2015         case TCP_SEQ_STATE_ESTABLISHED:
2016                 if (st->bucket > tcp_hashinfo.ehash_mask)
2017                         break;
2018                 rc = established_get_first(seq);
2019                 while (offset-- && rc)
2020                         rc = established_get_next(seq, rc);
2021         }
2022
2023         st->num = orig_num;
2024
2025         return rc;
2026 }
2027
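/* seq_file restart optimisation: when the requested *pos matches the
 * position recorded on the previous read, resume from the remembered
 * bucket/offset instead of rewalking the hash tables from the start.
 */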
2028 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2029 {
2030         struct tcp_iter_state *st = seq->private;
2031         void *rc;
2032
2033         if (*pos && *pos == st->last_pos) {
2034                 rc = tcp_seek_last_pos(seq);
2035                 if (rc)
2036                         goto out;
2037         }
2038
2039         st->state = TCP_SEQ_STATE_LISTENING;
2040         st->num = 0;
2041         st->bucket = 0;
2042         st->offset = 0;
2043         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2044
2045 out:
2046         st->last_pos = *pos;
2047         return rc;
2048 }
2049
2050 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2051 {
2052         struct tcp_iter_state *st = seq->private;
2053         void *rc = NULL;
2054
2055         if (v == SEQ_START_TOKEN) {
2056                 rc = tcp_get_idx(seq, 0);
2057                 goto out;
2058         }
2059
2060         switch (st->state) {
2061         case TCP_SEQ_STATE_LISTENING:
2062                 rc = listening_get_next(seq, v);
2063                 if (!rc) {
2064                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2065                         st->bucket = 0;
2066                         st->offset = 0;
2067                         rc        = established_get_first(seq);
2068                 }
2069                 break;
2070         case TCP_SEQ_STATE_ESTABLISHED:
2071                 rc = established_get_next(seq, v);
2072                 break;
2073         }
2074 out:
2075         ++*pos;
2076         st->last_pos = *pos;
2077         return rc;
2078 }
2079
2080 static void tcp_seq_stop(struct seq_file *seq, void *v)
2081 {
2082         struct tcp_iter_state *st = seq->private;
2083
2084         switch (st->state) {
2085         case TCP_SEQ_STATE_LISTENING:
2086                 if (v != SEQ_START_TOKEN)
2087                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2088                 break;
2089         case TCP_SEQ_STATE_ESTABLISHED:
2090                 if (v)
2091                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2092                 break;
2093         }
2094 }
2095
2096 int tcp_seq_open(struct inode *inode, struct file *file)
2097 {
2098         struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2099         struct tcp_iter_state *s;
2100         int err;
2101
2102         err = seq_open_net(inode, file, &afinfo->seq_ops,
2103                           sizeof(struct tcp_iter_state));
2104         if (err < 0)
2105                 return err;
2106
2107         s = ((struct seq_file *)file->private_data)->private;
2108         s->family               = afinfo->family;
2109         s->last_pos             = 0;
2110         return 0;
2111 }
2112 EXPORT_SYMBOL(tcp_seq_open);
2113
2114 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2115 {
2116         int rc = 0;
2117         struct proc_dir_entry *p;
2118
2119         afinfo->seq_ops.start           = tcp_seq_start;
2120         afinfo->seq_ops.next            = tcp_seq_next;
2121         afinfo->seq_ops.stop            = tcp_seq_stop;
2122
2123         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2124                              afinfo->seq_fops, afinfo);
2125         if (!p)
2126                 rc = -ENOMEM;
2127         return rc;
2128 }
2129 EXPORT_SYMBOL(tcp_proc_register);
2130
2131 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2132 {
2133         remove_proc_entry(afinfo->name, net->proc_net);
2134 }
2135 EXPORT_SYMBOL(tcp_proc_unregister);
2136
2137 static void get_openreq4(const struct request_sock *req,
2138                          struct seq_file *f, int i)
2139 {
2140         const struct inet_request_sock *ireq = inet_rsk(req);
2141         long delta = req->rsk_timer.expires - jiffies;
2142
2143         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2144                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2145                 i,
2146                 ireq->ir_loc_addr,
2147                 ireq->ir_num,
2148                 ireq->ir_rmt_addr,
2149                 ntohs(ireq->ir_rmt_port),
2150                 TCP_SYN_RECV,
2151                 0, 0, /* could print option size, but that is af dependent. */
2152                 1,    /* timers active (only the expire timer) */
2153                 jiffies_delta_to_clock_t(delta),
2154                 req->num_timeout,
2155                 from_kuid_munged(seq_user_ns(f),
2156                                  sock_i_uid(req->rsk_listener)),
2157                 0,  /* non standard timer */
2158                 0, /* open_requests have no inode */
2159                 0,
2160                 req);
2161 }
2162
2163 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2164 {
2165         int timer_active;
2166         unsigned long timer_expires;
2167         const struct tcp_sock *tp = tcp_sk(sk);
2168         const struct inet_connection_sock *icsk = inet_csk(sk);
2169         const struct inet_sock *inet = inet_sk(sk);
2170         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2171         __be32 dest = inet->inet_daddr;
2172         __be32 src = inet->inet_rcv_saddr;
2173         __u16 destp = ntohs(inet->inet_dport);
2174         __u16 srcp = ntohs(inet->inet_sport);
2175         int rx_queue;
2176         int state;
2177
2178         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2179             icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2180             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2181                 timer_active    = 1;
2182                 timer_expires   = icsk->icsk_timeout;
2183         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2184                 timer_active    = 4;
2185                 timer_expires   = icsk->icsk_timeout;
2186         } else if (timer_pending(&sk->sk_timer)) {
2187                 timer_active    = 2;
2188                 timer_expires   = sk->sk_timer.expires;
2189         } else {
2190                 timer_active    = 0;
2191                 timer_expires = jiffies;
2192         }
2193
2194         state = sk_state_load(sk);
2195         if (state == TCP_LISTEN)
2196                 rx_queue = sk->sk_ack_backlog;
2197         else
2198                 /* Because we don't lock the socket,
2199                  * we might find a transient negative value.
2200                  */
2201                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2202
2203         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2204                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2205                 i, src, srcp, dest, destp, state,
2206                 tp->write_seq - tp->snd_una,
2207                 rx_queue,
2208                 timer_active,
2209                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2210                 icsk->icsk_retransmits,
2211                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2212                 icsk->icsk_probes_out,
2213                 sock_i_ino(sk),
2214                 atomic_read(&sk->sk_refcnt), sk,
2215                 jiffies_to_clock_t(icsk->icsk_rto),
2216                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2217                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2218                 tp->snd_cwnd,
2219                 state == TCP_LISTEN ?
2220                     fastopenq->max_qlen :
2221                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2222 }
2223
2224 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2225                                struct seq_file *f, int i)
2226 {
2227         long delta = tw->tw_timer.expires - jiffies;
2228         __be32 dest, src;
2229         __u16 destp, srcp;
2230
2231         dest  = tw->tw_daddr;
2232         src   = tw->tw_rcv_saddr;
2233         destp = ntohs(tw->tw_dport);
2234         srcp  = ntohs(tw->tw_sport);
2235
2236         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2237                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2238                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2239                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2240                 atomic_read(&tw->tw_refcnt), tw);
2241 }
2242
2243 #define TMPSZ 150
2244
2245 static int tcp4_seq_show(struct seq_file *seq, void *v)
2246 {
2247         struct tcp_iter_state *st;
2248         struct sock *sk = v;
2249
2250         seq_setwidth(seq, TMPSZ - 1);
2251         if (v == SEQ_START_TOKEN) {
2252                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2253                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2254                            "inode");
2255                 goto out;
2256         }
2257         st = seq->private;
2258
2259         if (sk->sk_state == TCP_TIME_WAIT)
2260                 get_timewait4_sock(v, seq, st->num);
2261         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2262                 get_openreq4(v, seq, st->num);
2263         else
2264                 get_tcp4_sock(v, seq, st->num);
2265 out:
2266         seq_pad(seq, '\n');
2267         return 0;
2268 }
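
/* For orientation only, a line produced by the formats above for one socket
 * might look roughly like this (all values illustrative, not captured output):
 *
 *   0: 0100007F:0CEA 00000000:0000 0A 00000000:00000000 00:00000000 00000000  1000 0 18283 1 ffff88003d3af3c0 100 0 0 10 0
 *
 * i.e. slot, hex local and remote address:port, state, tx_queue:rx_queue,
 * timer info, retransmits, uid, probe count, inode, refcount, socket pointer
 * and the extra TCP fields appended by get_tcp4_sock().
 */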
2269
2270 static const struct file_operations tcp_afinfo_seq_fops = {
2271         .owner   = THIS_MODULE,
2272         .open    = tcp_seq_open,
2273         .read    = seq_read,
2274         .llseek  = seq_lseek,
2275         .release = seq_release_net
2276 };
2277
2278 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2279         .name           = "tcp",
2280         .family         = AF_INET,
2281         .seq_fops       = &tcp_afinfo_seq_fops,
2282         .seq_ops        = {
2283                 .show           = tcp4_seq_show,
2284         },
2285 };
2286
2287 static int __net_init tcp4_proc_init_net(struct net *net)
2288 {
2289         return tcp_proc_register(net, &tcp4_seq_afinfo);
2290 }
2291
2292 static void __net_exit tcp4_proc_exit_net(struct net *net)
2293 {
2294         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2295 }
2296
2297 static struct pernet_operations tcp4_net_ops = {
2298         .init = tcp4_proc_init_net,
2299         .exit = tcp4_proc_exit_net,
2300 };
2301
2302 int __init tcp4_proc_init(void)
2303 {
2304         return register_pernet_subsys(&tcp4_net_ops);
2305 }
2306
2307 void tcp4_proc_exit(void)
2308 {
2309         unregister_pernet_subsys(&tcp4_net_ops);
2310 }
2311 #endif /* CONFIG_PROC_FS */
2312
2313 struct proto tcp_prot = {
2314         .name                   = "TCP",
2315         .owner                  = THIS_MODULE,
2316         .close                  = tcp_close,
2317         .connect                = tcp_v4_connect,
2318         .disconnect             = tcp_disconnect,
2319         .accept                 = inet_csk_accept,
2320         .ioctl                  = tcp_ioctl,
2321         .init                   = tcp_v4_init_sock,
2322         .destroy                = tcp_v4_destroy_sock,
2323         .shutdown               = tcp_shutdown,
2324         .setsockopt             = tcp_setsockopt,
2325         .getsockopt             = tcp_getsockopt,
2326         .recvmsg                = tcp_recvmsg,
2327         .sendmsg                = tcp_sendmsg,
2328         .sendpage               = tcp_sendpage,
2329         .backlog_rcv            = tcp_v4_do_rcv,
2330         .release_cb             = tcp_release_cb,
2331         .hash                   = inet_hash,
2332         .unhash                 = inet_unhash,
2333         .get_port               = inet_csk_get_port,
2334         .enter_memory_pressure  = tcp_enter_memory_pressure,
2335         .stream_memory_free     = tcp_stream_memory_free,
2336         .sockets_allocated      = &tcp_sockets_allocated,
2337         .orphan_count           = &tcp_orphan_count,
2338         .memory_allocated       = &tcp_memory_allocated,
2339         .memory_pressure        = &tcp_memory_pressure,
2340         .sysctl_mem             = sysctl_tcp_mem,
2341         .sysctl_wmem            = sysctl_tcp_wmem,
2342         .sysctl_rmem            = sysctl_tcp_rmem,
2343         .max_header             = MAX_TCP_HEADER,
2344         .obj_size               = sizeof(struct tcp_sock),
2345         .slab_flags             = SLAB_DESTROY_BY_RCU,
2346         .twsk_prot              = &tcp_timewait_sock_ops,
2347         .rsk_prot               = &tcp_request_sock_ops,
2348         .h.hashinfo             = &tcp_hashinfo,
2349         .no_autobind            = true,
2350 #ifdef CONFIG_COMPAT
2351         .compat_setsockopt      = compat_tcp_setsockopt,
2352         .compat_getsockopt      = compat_tcp_getsockopt,
2353 #endif
2354         .diag_destroy           = tcp_abort,
2355 };
2356 EXPORT_SYMBOL(tcp_prot);
2357
2358 static void __net_exit tcp_sk_exit(struct net *net)
2359 {
2360         int cpu;
2361
2362         for_each_possible_cpu(cpu)
2363                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2364         free_percpu(net->ipv4.tcp_sk);
2365 }
2366
2367 static int __net_init tcp_sk_init(struct net *net)
2368 {
2369         int res, cpu;
2370
2371         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2372         if (!net->ipv4.tcp_sk)
2373                 return -ENOMEM;
2374
2375         for_each_possible_cpu(cpu) {
2376                 struct sock *sk;
2377
2378                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2379                                            IPPROTO_TCP, net);
2380                 if (res)
2381                         goto fail;
2382                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2383         }
2384
2385         net->ipv4.sysctl_tcp_ecn = 2;
2386         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2387
2388         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2389         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2390         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2391
2392         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2393         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2394         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2395
2396         return 0;
2397 fail:
2398         tcp_sk_exit(net);
2399
2400         return res;
2401 }
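
/* The per-netns defaults above are exposed through the usual sysctl knobs
 * (net.ipv4.tcp_ecn, tcp_base_mss, tcp_probe_threshold, tcp_probe_interval,
 * tcp_keepalive_time/probes/intvl, ...), registered elsewhere, so every
 * network namespace can retune them independently, for example:
 *
 *     # sysctl -w net.ipv4.tcp_keepalive_time=300
 */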
2402
2403 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2404 {
2405         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2406 }
2407
2408 static struct pernet_operations __net_initdata tcp_sk_ops = {
2409        .init       = tcp_sk_init,
2410        .exit       = tcp_sk_exit,
2411        .exit_batch = tcp_sk_exit_batch,
2412 };
2413
2414 void __init tcp_v4_init(void)
2415 {
2416         inet_hashinfo_init(&tcp_hashinfo);
2417         if (register_pernet_subsys(&tcp_sk_ops))
2418                 panic("Failed to create the TCP control socket.\n");
2419 }