net/ipv4/tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Add support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
44  *      Vitaly E. Lavrov        :       Transparent proxy revived after a
45  *                                      year-long coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
49  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53 #define pr_fmt(fmt) "TCP: " fmt
54
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83
84 #include <linux/crypto.h>
85 #include <linux/scatterlist.h>
86
87 int sysctl_tcp_tw_reuse __read_mostly;
88 int sysctl_tcp_low_latency __read_mostly;
89 EXPORT_SYMBOL(sysctl_tcp_low_latency);
90
91 #ifdef CONFIG_TCP_MD5SIG
92 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
93                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
94 #endif
95
96 struct inet_hashinfo tcp_hashinfo;
97 EXPORT_SYMBOL(tcp_hashinfo);
98
99 static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
100 {
101         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
102                                           ip_hdr(skb)->saddr,
103                                           tcp_hdr(skb)->dest,
104                                           tcp_hdr(skb)->source);
105 }
106
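/* secure_tcp_sequence_number() above implements the RFC 6528 scheme:
 * ISN = M + F(saddr, daddr, sport, dport, secret), where M is a
 * fine-grained clock and F a keyed hash over the connection 4-tuple.
 * A minimal userspace sketch follows; mix32() is a stand-in hash for
 * illustration only, not the kernel's actual keyed transform.
 */
#include <stdint.h>
#include <time.h>

static uint32_t mix32(uint32_t x)               /* placeholder for F() */
{
        x ^= x >> 16; x *= 0x7feb352d;
        x ^= x >> 15; x *= 0x846ca68b;
        x ^= x >> 16;
        return x;
}

static uint32_t toy_isn(uint32_t saddr, uint32_t daddr,
                        uint16_t sport, uint16_t dport, uint32_t secret)
{
        uint32_t f = mix32(saddr ^ mix32(daddr ^ mix32(
                        ((uint32_t)sport << 16 | dport) ^ secret)));
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        /* M: a clock ticking every few microseconds, per RFC 6528 */
        return (uint32_t)(ts.tv_sec * 1000000ULL + ts.tv_nsec / 1000) + f;
}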
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
110         struct tcp_sock *tp = tcp_sk(sk);
111
112         /* With PAWS, it is safe from the viewpoint
113            of data integrity. Even without PAWS it is safe provided sequence
114            spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
115
116            Actually, the idea is close to VJ's, only the timestamp cache is
117            held not per host but per port pair, and the TW bucket is used as
118            the state holder.
119
120            If the TW bucket has already been destroyed we fall back to VJ's
121            scheme and use the initial timestamp retrieved from the peer table.
122          */
123         if (tcptw->tw_ts_recent_stamp &&
124             (!twp || (sysctl_tcp_tw_reuse &&
125                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
126                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
127                 if (tp->write_seq == 0)
128                         tp->write_seq = 1;
129                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
130                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
131                 sock_hold(sktw);
132                 return 1;
133         }
134
135         return 0;
136 }
137 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
138
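/* sysctl_tcp_tw_reuse, tested above, is exposed to userspace as
 * /proc/sys/net/ipv4/tcp_tw_reuse.  A minimal sketch of flipping the
 * knob (requires root); with it set, connect() may take the
 * tcp_twsk_unique() fast path and reuse a TIME-WAIT port pair once
 * the timestamp check says it is safe.
 */
#include <stdio.h>

static int set_tw_reuse(int on)
{
        FILE *f = fopen("/proc/sys/net/ipv4/tcp_tw_reuse", "w");

        if (!f)
                return -1;              /* no privilege, or knob missing */
        fprintf(f, "%d\n", on ? 1 : 0);
        return fclose(f);
}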
139 /* This will initiate an outgoing connection. */
140 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
141 {
142         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
143         struct inet_sock *inet = inet_sk(sk);
144         struct tcp_sock *tp = tcp_sk(sk);
145         __be16 orig_sport, orig_dport;
146         __be32 daddr, nexthop;
147         struct flowi4 *fl4;
148         struct rtable *rt;
149         int err;
150         struct ip_options_rcu *inet_opt;
151
152         if (addr_len < sizeof(struct sockaddr_in))
153                 return -EINVAL;
154
155         if (usin->sin_family != AF_INET)
156                 return -EAFNOSUPPORT;
157
158         nexthop = daddr = usin->sin_addr.s_addr;
159         inet_opt = rcu_dereference_protected(inet->inet_opt,
160                                              sock_owned_by_user(sk));
161         if (inet_opt && inet_opt->opt.srr) {
162                 if (!daddr)
163                         return -EINVAL;
164                 nexthop = inet_opt->opt.faddr;
165         }
166
167         orig_sport = inet->inet_sport;
168         orig_dport = usin->sin_port;
169         fl4 = &inet->cork.fl.u.ip4;
170         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
171                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
172                               IPPROTO_TCP,
173                               orig_sport, orig_dport, sk);
174         if (IS_ERR(rt)) {
175                 err = PTR_ERR(rt);
176                 if (err == -ENETUNREACH)
177                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
178                 return err;
179         }
180
181         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
182                 ip_rt_put(rt);
183                 return -ENETUNREACH;
184         }
185
186         if (!inet_opt || !inet_opt->opt.srr)
187                 daddr = fl4->daddr;
188
189         if (!inet->inet_saddr)
190                 inet->inet_saddr = fl4->saddr;
191         sk_rcv_saddr_set(sk, inet->inet_saddr);
192
193         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
194                 /* Reset inherited state */
195                 tp->rx_opt.ts_recent       = 0;
196                 tp->rx_opt.ts_recent_stamp = 0;
197                 if (likely(!tp->repair))
198                         tp->write_seq      = 0;
199         }
200
201         if (tcp_death_row.sysctl_tw_recycle &&
202             !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
203                 tcp_fetch_timewait_stamp(sk, &rt->dst);
204
205         inet->inet_dport = usin->sin_port;
206         sk_daddr_set(sk, daddr);
207
208         inet_csk(sk)->icsk_ext_hdr_len = 0;
209         if (inet_opt)
210                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
211
212         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
213
214         /* Socket identity is still unknown (sport may be zero).
215          * However we set state to SYN-SENT and, without releasing the socket
216          * lock, select a source port, enter ourselves into the hash tables
217          * and complete initialization afterwards.
218          */
219         tcp_set_state(sk, TCP_SYN_SENT);
220         err = inet_hash_connect(&tcp_death_row, sk);
221         if (err)
222                 goto failure;
223
224         sk_set_txhash(sk);
225
226         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
227                                inet->inet_sport, inet->inet_dport, sk);
228         if (IS_ERR(rt)) {
229                 err = PTR_ERR(rt);
230                 rt = NULL;
231                 goto failure;
232         }
233         /* OK, now commit destination to socket.  */
234         sk->sk_gso_type = SKB_GSO_TCPV4;
235         sk_setup_caps(sk, &rt->dst);
236
237         if (!tp->write_seq && likely(!tp->repair))
238                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
239                                                            inet->inet_daddr,
240                                                            inet->inet_sport,
241                                                            usin->sin_port);
242
243         inet->inet_id = tp->write_seq ^ jiffies;
244
245         err = tcp_connect(sk);
246
247         rt = NULL;
248         if (err)
249                 goto failure;
250
251         return 0;
252
253 failure:
254         /*
255          * This unhashes the socket and releases the local port,
256          * if necessary.
257          */
258         tcp_set_state(sk, TCP_CLOSE);
259         ip_rt_put(rt);
260         sk->sk_route_caps = 0;
261         inet->inet_dport = 0;
262         return err;
263 }
264 EXPORT_SYMBOL(tcp_v4_connect);
265
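/* tcp_v4_connect() is what runs under a userspace connect(2) on an
 * AF_INET stream socket; the sizeof and sin_family checks at its top
 * validate exactly the arguments passed below.  Minimal sketch only;
 * 192.0.2.1:80 is a documentation address, not a real service.
 */
#include <arpa/inet.h>
#include <sys/socket.h>
#include <unistd.h>

static int tcp_connect_example(void)
{
        struct sockaddr_in sin = { .sin_family = AF_INET };
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd < 0)
                return -1;
        sin.sin_port = htons(80);                       /* usin->sin_port */
        inet_pton(AF_INET, "192.0.2.1", &sin.sin_addr); /* usin->sin_addr */
        /* the third argument is what the addr_len check validates */
        if (connect(fd, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
                close(fd);
                return -1;
        }
        return fd;
}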
266 /*
267  * This routine reacts to ICMP_FRAG_NEEDED MTU indications as defined in RFC 1191.
268  * It can be called through tcp_release_cb() if socket was owned by user
269  * at the time tcp_v4_err() was called to handle ICMP message.
270  */
271 void tcp_v4_mtu_reduced(struct sock *sk)
272 {
273         struct dst_entry *dst;
274         struct inet_sock *inet = inet_sk(sk);
275         u32 mtu = tcp_sk(sk)->mtu_info;
276
277         dst = inet_csk_update_pmtu(sk, mtu);
278         if (!dst)
279                 return;
280
281         /* Something is about to go wrong... Remember the soft error
282          * in case this connection is not able to recover.
283          */
284         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
285                 sk->sk_err_soft = EMSGSIZE;
286
287         mtu = dst_mtu(dst);
288
289         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
290             ip_sk_accept_pmtu(sk) &&
291             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
292                 tcp_sync_mss(sk, mtu);
293
294                 /* Resend the TCP packet because it's
295                  * clear that the old packet has been
296                  * dropped. This is the new "fast" path mtu
297                  * discovery.
298                  */
299                 tcp_simple_retransmit(sk);
300         } /* else let the usual retransmit timer handle it */
301 }
302 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
303
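/* The userspace side of the PMTU machinery this function services: on
 * a connected socket, getsockopt(IP_MTU) reports the path MTU the
 * kernel caches after events like the one handled above, and
 * IP_MTU_DISCOVER selects the pmtudisc mode checked against
 * IP_PMTUDISC_DONT.  Sketch only.
 */
#include <netinet/in.h>
#include <sys/socket.h>

static int get_path_mtu(int connected_fd)
{
        int mtu, mode = IP_PMTUDISC_DO;
        socklen_t len = sizeof(mtu);

        /* enforce DF, i.e. the opposite of IP_PMTUDISC_DONT above */
        setsockopt(connected_fd, IPPROTO_IP, IP_MTU_DISCOVER,
                   &mode, sizeof(mode));
        if (getsockopt(connected_fd, IPPROTO_IP, IP_MTU, &mtu, &len) < 0)
                return -1;
        return mtu;     /* e.g. 1500, or less after a FRAG_NEEDED */
}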
304 static void do_redirect(struct sk_buff *skb, struct sock *sk)
305 {
306         struct dst_entry *dst = __sk_dst_check(sk, 0);
307
308         if (dst)
309                 dst->ops->redirect(dst, sk, skb);
310 }
311
312
313 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
314 void tcp_req_err(struct sock *sk, u32 seq)
315 {
316         struct request_sock *req = inet_reqsk(sk);
317         struct net *net = sock_net(sk);
318
319         /* ICMPs are not backlogged, hence we cannot get
320          * an established socket here.
321          */
322         WARN_ON(req->sk);
323
324         if (seq != tcp_rsk(req)->snt_isn) {
325                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
326         } else {
327                 /*
328                  * Still in SYN_RECV, just remove it silently.
329                  * There is no good way to pass the error to the newly
330                  * created socket, and POSIX does not want network
331                  * errors returned from accept().
332                  */
333                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
334                 NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
335         }
336         reqsk_put(req);
337 }
338 EXPORT_SYMBOL(tcp_req_err);
339
340 /*
341  * This routine is called by the ICMP module when it gets some
342  * sort of error condition.  If err < 0 then the socket should
343  * be closed and the error returned to the user.  If err > 0
344  * it's just the icmp type << 8 | icmp code.  After adjustment,
345  * header points to the first 8 bytes of the TCP header.  We need
346  * to find the appropriate port.
347  *
348  * The locking strategy used here is very "optimistic". When
349  * someone else accesses the socket the ICMP is just dropped
350  * and for some paths there is no check at all.
351  * A more general error queue to queue errors for later handling
352  * is probably better.
353  *
354  */
355
356 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
357 {
358         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
359         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
360         struct inet_connection_sock *icsk;
361         struct tcp_sock *tp;
362         struct inet_sock *inet;
363         const int type = icmp_hdr(icmp_skb)->type;
364         const int code = icmp_hdr(icmp_skb)->code;
365         struct sock *sk;
366         struct sk_buff *skb;
367         struct request_sock *fastopen;
368         __u32 seq, snd_una;
369         __u32 remaining;
370         int err;
371         struct net *net = dev_net(icmp_skb->dev);
372
373         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
374                                        th->dest, iph->saddr, ntohs(th->source),
375                                        inet_iif(icmp_skb));
376         if (!sk) {
377                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
378                 return;
379         }
380         if (sk->sk_state == TCP_TIME_WAIT) {
381                 inet_twsk_put(inet_twsk(sk));
382                 return;
383         }
384         seq = ntohl(th->seq);
385         if (sk->sk_state == TCP_NEW_SYN_RECV)
386                 return tcp_req_err(sk, seq);
387
388         bh_lock_sock(sk);
389         /* If too many ICMPs get dropped on busy
390          * servers this needs to be solved differently.
391          * We do take care of the PMTU discovery (RFC 1191) special case:
392          * we can receive locally generated ICMP messages while the socket is held.
393          */
394         if (sock_owned_by_user(sk)) {
395                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
396                         NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
397         }
398         if (sk->sk_state == TCP_CLOSE)
399                 goto out;
400
401         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
402                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
403                 goto out;
404         }
405
406         icsk = inet_csk(sk);
407         tp = tcp_sk(sk);
408         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
409         fastopen = tp->fastopen_rsk;
410         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
411         if (sk->sk_state != TCP_LISTEN &&
412             !between(seq, snd_una, tp->snd_nxt)) {
413                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
414                 goto out;
415         }
416
417         switch (type) {
418         case ICMP_REDIRECT:
419                 do_redirect(icmp_skb, sk);
420                 goto out;
421         case ICMP_SOURCE_QUENCH:
422                 /* Just silently ignore these. */
423                 goto out;
424         case ICMP_PARAMETERPROB:
425                 err = EPROTO;
426                 break;
427         case ICMP_DEST_UNREACH:
428                 if (code > NR_ICMP_UNREACH)
429                         goto out;
430
431                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
432                         /* We are not interested in TCP_LISTEN and open_requests
433                          * (SYN-ACKs sent out by Linux are always < 576 bytes, so
434                          * they should go through unfragmented).
435                          */
436                         if (sk->sk_state == TCP_LISTEN)
437                                 goto out;
438
439                         tp->mtu_info = info;
440                         if (!sock_owned_by_user(sk)) {
441                                 tcp_v4_mtu_reduced(sk);
442                         } else {
443                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
444                                         sock_hold(sk);
445                         }
446                         goto out;
447                 }
448
449                 err = icmp_err_convert[code].errno;
450                 /* check if icmp_skb allows revert of backoff
451                  * (see draft-zimmermann-tcp-lcd) */
452                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
453                         break;
454                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
455                     !icsk->icsk_backoff || fastopen)
456                         break;
457
458                 if (sock_owned_by_user(sk))
459                         break;
460
461                 icsk->icsk_backoff--;
462                 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
463                                                TCP_TIMEOUT_INIT;
464                 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
465
466                 skb = tcp_write_queue_head(sk);
467                 BUG_ON(!skb);
468
469                 remaining = icsk->icsk_rto -
470                             min(icsk->icsk_rto,
471                                 tcp_time_stamp - tcp_skb_timestamp(skb));
472
473                 if (remaining) {
474                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
475                                                   remaining, TCP_RTO_MAX);
476                 } else {
477                         /* RTO revert clocked out retransmission.
478                          * Will retransmit now. */
479                         tcp_retransmit_timer(sk);
480                 }
481
482                 break;
483         case ICMP_TIME_EXCEEDED:
484                 err = EHOSTUNREACH;
485                 break;
486         default:
487                 goto out;
488         }
489
490         switch (sk->sk_state) {
491         case TCP_SYN_SENT:
492         case TCP_SYN_RECV:
493                 /* Only in fast or simultaneous open. If a fast open socket
494                  * is already accepted it is treated as a connected one below.
495                  */
496                 if (fastopen && !fastopen->sk)
497                         break;
498
499                 if (!sock_owned_by_user(sk)) {
500                         sk->sk_err = err;
501
502                         sk->sk_error_report(sk);
503
504                         tcp_done(sk);
505                 } else {
506                         sk->sk_err_soft = err;
507                 }
508                 goto out;
509         }
510
511         /* If we've already connected we will keep trying
512          * until we time out, or the user gives up.
513          *
514          * RFC 1122 4.2.3.9 allows us to consider as hard errors
515          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
516          * but it is obsoleted by PMTU discovery).
517          *
518          * Note that in the modern internet, where routing is unreliable
519          * and broken firewalls sit in every dark corner sending random
520          * errors ordered by their masters, even these two messages have
521          * finally lost their original sense (even Linux sends invalid PORT_UNREACHs).
522          *
523          * Now we are in compliance with the RFCs.
524          *                                                      --ANK (980905)
525          */
526
527         inet = inet_sk(sk);
528         if (!sock_owned_by_user(sk) && inet->recverr) {
529                 sk->sk_err = err;
530                 sk->sk_error_report(sk);
531         } else  { /* Only an error on timeout */
532                 sk->sk_err_soft = err;
533         }
534
535 out:
536         bh_unlock_sock(sk);
537         sock_put(sk);
538 }
539
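/* The "err > 0 is icmp type << 8 | icmp code" convention described in
 * the comment above tcp_v4_err(), written out as a pair of helpers.
 * Illustration only.
 */
static inline int icmp_err_pack(unsigned char type, unsigned char code)
{
        return (type << 8) | code;
}

static inline void icmp_err_unpack(int err, unsigned char *type,
                                   unsigned char *code)
{
        *type = err >> 8;
        *code = err & 0xff;
}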
540 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
541 {
542         struct tcphdr *th = tcp_hdr(skb);
543
544         if (skb->ip_summed == CHECKSUM_PARTIAL) {
545                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
546                 skb->csum_start = skb_transport_header(skb) - skb->head;
547                 skb->csum_offset = offsetof(struct tcphdr, check);
548         } else {
549                 th->check = tcp_v4_check(skb->len, saddr, daddr,
550                                          csum_partial(th,
551                                                       th->doff << 2,
552                                                       skb->csum));
553         }
554 }
555
556 /* This routine computes an IPv4 TCP checksum. */
557 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
558 {
559         const struct inet_sock *inet = inet_sk(sk);
560
561         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
562 }
563 EXPORT_SYMBOL(tcp_v4_send_check);
564
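/* A self-contained version of the checksum __tcp_v4_send_check() sets
 * up: the RFC 793 one's-complement sum over the IPv4 pseudo-header
 * plus the TCP segment.  Mirrors csum_tcpudp_nofold()+csum_partial()
 * in spirit, not in implementation.  Addresses and segment bytes are
 * taken in wire (network) order; the caller stores the result as a
 * big-endian 16-bit field.
 */
#include <stddef.h>
#include <stdint.h>
#include <netinet/in.h>

static uint16_t tcp_v4_checksum(const uint8_t saddr[4], const uint8_t daddr[4],
                                const uint8_t *seg, size_t len)
{
        uint32_t sum = 0;
        size_t i;

        /* pseudo-header: src/dst address, zero-padded protocol, TCP length */
        sum += (uint32_t)(saddr[0] << 8 | saddr[1]) + (saddr[2] << 8 | saddr[3]);
        sum += (uint32_t)(daddr[0] << 8 | daddr[1]) + (daddr[2] << 8 | daddr[3]);
        sum += IPPROTO_TCP;             /* high byte of this word is zero */
        sum += (uint32_t)len;

        /* TCP header (with check = 0) and payload, 16 bits at a time */
        for (i = 0; i + 1 < len; i += 2)
                sum += (uint32_t)(seg[i] << 8 | seg[i + 1]);
        if (len & 1)
                sum += (uint32_t)(seg[len - 1] << 8);   /* odd trailing byte */

        while (sum >> 16)               /* fold the carries back in */
                sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
}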
565 /*
566  *      This routine will send an RST to the other TCP.
567  *
568  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
569  *                    for the reset?
570  *      Answer: if a packet caused the RST, it is not for a socket
571  *              existing in our system; if it is matched to a socket,
572  *              it is just a duplicate segment or a bug in the other side's TCP.
573  *              So we build the reply based only on the parameters that
574  *              arrived with the segment.
575  *      Exception: precedence violation. We do not implement it in any case.
576  */
577
578 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
579 {
580         const struct tcphdr *th = tcp_hdr(skb);
581         struct {
582                 struct tcphdr th;
583 #ifdef CONFIG_TCP_MD5SIG
584                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
585 #endif
586         } rep;
587         struct ip_reply_arg arg;
588 #ifdef CONFIG_TCP_MD5SIG
589         struct tcp_md5sig_key *key = NULL;
590         const __u8 *hash_location = NULL;
591         unsigned char newhash[16];
592         int genhash;
593         struct sock *sk1 = NULL;
594 #endif
595         struct net *net;
596
597         /* Never send a reset in response to a reset. */
598         if (th->rst)
599                 return;
600
601         /* If sk is not NULL, it means we did a successful lookup and the
602          * incoming route had to be correct. prequeue might have dropped our dst.
603          */
604         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
605                 return;
606
607         /* Swap the send and the receive. */
608         memset(&rep, 0, sizeof(rep));
609         rep.th.dest   = th->source;
610         rep.th.source = th->dest;
611         rep.th.doff   = sizeof(struct tcphdr) / 4;
612         rep.th.rst    = 1;
613
614         if (th->ack) {
615                 rep.th.seq = th->ack_seq;
616         } else {
617                 rep.th.ack = 1;
618                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
619                                        skb->len - (th->doff << 2));
620         }
621
622         memset(&arg, 0, sizeof(arg));
623         arg.iov[0].iov_base = (unsigned char *)&rep;
624         arg.iov[0].iov_len  = sizeof(rep.th);
625
626         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
627 #ifdef CONFIG_TCP_MD5SIG
628         hash_location = tcp_parse_md5sig_option(th);
629         if (sk && sk_fullsock(sk)) {
630                 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
631                                         &ip_hdr(skb)->saddr, AF_INET);
632         } else if (hash_location) {
633                 /*
634                  * The active side is lost. Try to find the listening socket
635                  * through the source port, and then find the md5 key through
636                  * the listening socket. We do not lose security here:
637                  * the incoming packet is checked against the md5 hash of the
638                  * found key; no RST is generated if the md5 hash doesn't match.
639                  */
640                 sk1 = __inet_lookup_listener(net,
641                                              &tcp_hashinfo, ip_hdr(skb)->saddr,
642                                              th->source, ip_hdr(skb)->daddr,
643                                              ntohs(th->source), inet_iif(skb));
644                 /* don't send rst if it can't find key */
645                 if (!sk1)
646                         return;
647                 rcu_read_lock();
648                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
649                                         &ip_hdr(skb)->saddr, AF_INET);
650                 if (!key)
651                         goto release_sk1;
652
653                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
654                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
655                         goto release_sk1;
656         }
657
658         if (key) {
659                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
660                                    (TCPOPT_NOP << 16) |
661                                    (TCPOPT_MD5SIG << 8) |
662                                    TCPOLEN_MD5SIG);
663                 /* Update length and the length the header thinks exists */
664                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
665                 rep.th.doff = arg.iov[0].iov_len / 4;
666
667                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
668                                      key, ip_hdr(skb)->saddr,
669                                      ip_hdr(skb)->daddr, &rep.th);
670         }
671 #endif
672         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
673                                       ip_hdr(skb)->saddr, /* XXX */
674                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
675         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
676         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
677
678         /* When the socket is gone, all binding information is lost and
679          * routing might fail. No choice here: if we choose to force the
680          * input interface, we will misroute in case of an asymmetric route.
681          */
682         if (sk)
683                 arg.bound_dev_if = sk->sk_bound_dev_if;
684
685         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
686                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
687
688         arg.tos = ip_hdr(skb)->tos;
689         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
690                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
691                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
692                               &arg, arg.iov[0].iov_len);
693
694         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
695         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
696
697 #ifdef CONFIG_TCP_MD5SIG
698 release_sk1:
699         if (sk1) {
700                 rcu_read_unlock();
701                 sock_put(sk1);
702         }
703 #endif
704 }
705
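/* The RFC 793 reset rules tcp_v4_send_reset() applies when swapping
 * seq/ack above, pulled out as a standalone helper over host-byte-order
 * values.  Sketch for clarity only.
 */
#include <stdbool.h>
#include <stdint.h>

struct rst_fields { uint32_t seq, ack_seq; bool ack_bit; };

static struct rst_fields rst_for(bool in_ack, uint32_t in_seq,
                                 uint32_t in_ack_seq, uint32_t in_payload_len,
                                 bool in_syn, bool in_fin)
{
        struct rst_fields r;

        if (in_ack) {
                /* the rep.th.seq = th->ack_seq path above */
                r.seq = in_ack_seq;
                r.ack_bit = false;
                r.ack_seq = 0;
        } else {
                /* the rep.th.ack_seq = seq + syn + fin + payload path */
                r.seq = 0;
                r.ack_bit = true;
                r.ack_seq = in_seq + in_syn + in_fin + in_payload_len;
        }
        return r;
}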
706 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
707    outside socket context, is certainly ugly. What can I do?
708  */
709
710 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
711                             u32 win, u32 tsval, u32 tsecr, int oif,
712                             struct tcp_md5sig_key *key,
713                             int reply_flags, u8 tos)
714 {
715         const struct tcphdr *th = tcp_hdr(skb);
716         struct {
717                 struct tcphdr th;
718                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
719 #ifdef CONFIG_TCP_MD5SIG
720                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
721 #endif
722                         ];
723         } rep;
724         struct ip_reply_arg arg;
725         struct net *net = dev_net(skb_dst(skb)->dev);
726
727         memset(&rep.th, 0, sizeof(struct tcphdr));
728         memset(&arg, 0, sizeof(arg));
729
730         arg.iov[0].iov_base = (unsigned char *)&rep;
731         arg.iov[0].iov_len  = sizeof(rep.th);
732         if (tsecr) {
733                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
734                                    (TCPOPT_TIMESTAMP << 8) |
735                                    TCPOLEN_TIMESTAMP);
736                 rep.opt[1] = htonl(tsval);
737                 rep.opt[2] = htonl(tsecr);
738                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
739         }
740
741         /* Swap the send and the receive. */
742         rep.th.dest    = th->source;
743         rep.th.source  = th->dest;
744         rep.th.doff    = arg.iov[0].iov_len / 4;
745         rep.th.seq     = htonl(seq);
746         rep.th.ack_seq = htonl(ack);
747         rep.th.ack     = 1;
748         rep.th.window  = htons(win);
749
750 #ifdef CONFIG_TCP_MD5SIG
751         if (key) {
752                 int offset = (tsecr) ? 3 : 0;
753
754                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
755                                           (TCPOPT_NOP << 16) |
756                                           (TCPOPT_MD5SIG << 8) |
757                                           TCPOLEN_MD5SIG);
758                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
759                 rep.th.doff = arg.iov[0].iov_len/4;
760
761                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
762                                     key, ip_hdr(skb)->saddr,
763                                     ip_hdr(skb)->daddr, &rep.th);
764         }
765 #endif
766         arg.flags = reply_flags;
767         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
768                                       ip_hdr(skb)->saddr, /* XXX */
769                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
770         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
771         if (oif)
772                 arg.bound_dev_if = oif;
773         arg.tos = tos;
774         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
775                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
776                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
777                               &arg, arg.iov[0].iov_len);
778
779         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
780 }
781
782 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
783 {
784         struct inet_timewait_sock *tw = inet_twsk(sk);
785         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
786
787         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
788                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
789                         tcp_time_stamp + tcptw->tw_ts_offset,
790                         tcptw->tw_ts_recent,
791                         tw->tw_bound_dev_if,
792                         tcp_twsk_md5_key(tcptw),
793                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
794                         tw->tw_tos
795                         );
796
797         inet_twsk_put(tw);
798 }
799
800 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
801                                   struct request_sock *req)
802 {
803         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
804          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
805          */
806         tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
807                         tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
808                         tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
809                         tcp_time_stamp,
810                         req->ts_recent,
811                         0,
812                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
813                                           AF_INET),
814                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
815                         ip_hdr(skb)->tos);
816 }
817
818 /*
819  *      Send a SYN-ACK after having received a SYN.
820  *      This still operates on a request_sock only, not on a big
821  *      socket.
822  */
823 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
824                               struct flowi *fl,
825                               struct request_sock *req,
826                               struct tcp_fastopen_cookie *foc,
827                               bool attach_req)
828 {
829         const struct inet_request_sock *ireq = inet_rsk(req);
830         struct flowi4 fl4;
831         int err = -1;
832         struct sk_buff *skb;
833
834         /* First, grab a route. */
835         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
836                 return -1;
837
838         skb = tcp_make_synack(sk, dst, req, foc, attach_req);
839
840         if (skb) {
841                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
842
843                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
844                                             ireq->ir_rmt_addr,
845                                             ireq->opt);
846                 err = net_xmit_eval(err);
847         }
848
849         return err;
850 }
851
852 /*
853  *      IPv4 request_sock destructor.
854  */
855 static void tcp_v4_reqsk_destructor(struct request_sock *req)
856 {
857         kfree(inet_rsk(req)->opt);
858 }
859
860
861 #ifdef CONFIG_TCP_MD5SIG
862 /*
863  * RFC2385 MD5 checksumming requires a mapping of
864  * IP address->MD5 Key.
865  * We need to maintain these in the sk structure.
866  */
867
868 /* Find the Key structure for an address.  */
869 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
870                                          const union tcp_md5_addr *addr,
871                                          int family)
872 {
873         const struct tcp_sock *tp = tcp_sk(sk);
874         struct tcp_md5sig_key *key;
875         unsigned int size = sizeof(struct in_addr);
876         const struct tcp_md5sig_info *md5sig;
877
878         /* caller either holds rcu_read_lock() or socket lock */
879         md5sig = rcu_dereference_check(tp->md5sig_info,
880                                        sock_owned_by_user(sk) ||
881                                        lockdep_is_held((spinlock_t *)&sk->sk_lock.slock));
882         if (!md5sig)
883                 return NULL;
884 #if IS_ENABLED(CONFIG_IPV6)
885         if (family == AF_INET6)
886                 size = sizeof(struct in6_addr);
887 #endif
888         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
889                 if (key->family != family)
890                         continue;
891                 if (!memcmp(&key->addr, addr, size))
892                         return key;
893         }
894         return NULL;
895 }
896 EXPORT_SYMBOL(tcp_md5_do_lookup);
897
898 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
899                                          const struct sock *addr_sk)
900 {
901         const union tcp_md5_addr *addr;
902
903         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
904         return tcp_md5_do_lookup(sk, addr, AF_INET);
905 }
906 EXPORT_SYMBOL(tcp_v4_md5_lookup);
907
908 /* This can be called on a newly created socket, from other files */
909 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
910                    int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
911 {
912         /* Add Key to the list */
913         struct tcp_md5sig_key *key;
914         struct tcp_sock *tp = tcp_sk(sk);
915         struct tcp_md5sig_info *md5sig;
916
917         key = tcp_md5_do_lookup(sk, addr, family);
918         if (key) {
919                 /* Pre-existing entry - just update that one. */
920                 memcpy(key->key, newkey, newkeylen);
921                 key->keylen = newkeylen;
922                 return 0;
923         }
924
925         md5sig = rcu_dereference_protected(tp->md5sig_info,
926                                            sock_owned_by_user(sk) ||
927                                            lockdep_is_held(&sk->sk_lock.slock));
928         if (!md5sig) {
929                 md5sig = kmalloc(sizeof(*md5sig), gfp);
930                 if (!md5sig)
931                         return -ENOMEM;
932
933                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
934                 INIT_HLIST_HEAD(&md5sig->head);
935                 rcu_assign_pointer(tp->md5sig_info, md5sig);
936         }
937
938         key = sock_kmalloc(sk, sizeof(*key), gfp);
939         if (!key)
940                 return -ENOMEM;
941         if (!tcp_alloc_md5sig_pool()) {
942                 sock_kfree_s(sk, key, sizeof(*key));
943                 return -ENOMEM;
944         }
945
946         memcpy(key->key, newkey, newkeylen);
947         key->keylen = newkeylen;
948         key->family = family;
949         memcpy(&key->addr, addr,
950                (family == AF_INET6) ? sizeof(struct in6_addr) :
951                                       sizeof(struct in_addr));
952         hlist_add_head_rcu(&key->node, &md5sig->head);
953         return 0;
954 }
955 EXPORT_SYMBOL(tcp_md5_do_add);
956
957 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
958 {
959         struct tcp_md5sig_key *key;
960
961         key = tcp_md5_do_lookup(sk, addr, family);
962         if (!key)
963                 return -ENOENT;
964         hlist_del_rcu(&key->node);
965         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
966         kfree_rcu(key, rcu);
967         return 0;
968 }
969 EXPORT_SYMBOL(tcp_md5_do_del);
970
971 static void tcp_clear_md5_list(struct sock *sk)
972 {
973         struct tcp_sock *tp = tcp_sk(sk);
974         struct tcp_md5sig_key *key;
975         struct hlist_node *n;
976         struct tcp_md5sig_info *md5sig;
977
978         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
979
980         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
981                 hlist_del_rcu(&key->node);
982                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
983                 kfree_rcu(key, rcu);
984         }
985 }
986
987 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
988                                  int optlen)
989 {
990         struct tcp_md5sig cmd;
991         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
992
993         if (optlen < sizeof(cmd))
994                 return -EINVAL;
995
996         if (copy_from_user(&cmd, optval, sizeof(cmd)))
997                 return -EFAULT;
998
999         if (sin->sin_family != AF_INET)
1000                 return -EINVAL;
1001
1002         if (!cmd.tcpm_keylen)
1003                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1004                                       AF_INET);
1005
1006         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1007                 return -EINVAL;
1008
1009         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1010                               AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1011                               GFP_KERNEL);
1012 }
1013
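/* The userspace half of tcp_v4_parse_md5_keys(): install an RFC 2385
 * key for one peer via setsockopt(TCP_MD5SIG).  struct tcp_md5sig and
 * the option name come from <netinet/tcp.h> on Linux; a tcpm_keylen of
 * zero deletes the key, matching the tcp_md5_do_del() branch above.
 * Sketch only.
 */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <string.h>
#include <sys/socket.h>

static int install_md5_key(int fd, const struct sockaddr_in *peer,
                           const void *key, int keylen)
{
        struct tcp_md5sig md5;

        if (keylen > TCP_MD5SIG_MAXKEYLEN)      /* same bound checked above */
                return -1;
        memset(&md5, 0, sizeof(md5));
        memcpy(&md5.tcpm_addr, peer, sizeof(*peer));    /* must be AF_INET */
        md5.tcpm_keylen = keylen;
        memcpy(md5.tcpm_key, key, keylen);
        return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
}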
1014 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1015                                         __be32 daddr, __be32 saddr, int nbytes)
1016 {
1017         struct tcp4_pseudohdr *bp;
1018         struct scatterlist sg;
1019
1020         bp = &hp->md5_blk.ip4;
1021
1022         /*
1023          * 1. the TCP pseudo-header (in the order: source IP address,
1024          * destination IP address, zero-padded protocol number, and
1025          * segment length)
1026          */
1027         bp->saddr = saddr;
1028         bp->daddr = daddr;
1029         bp->pad = 0;
1030         bp->protocol = IPPROTO_TCP;
1031         bp->len = cpu_to_be16(nbytes);
1032
1033         sg_init_one(&sg, bp, sizeof(*bp));
1034         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1035 }
1036
1037 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1038                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1039 {
1040         struct tcp_md5sig_pool *hp;
1041         struct hash_desc *desc;
1042
1043         hp = tcp_get_md5sig_pool();
1044         if (!hp)
1045                 goto clear_hash_noput;
1046         desc = &hp->md5_desc;
1047
1048         if (crypto_hash_init(desc))
1049                 goto clear_hash;
1050         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1051                 goto clear_hash;
1052         if (tcp_md5_hash_header(hp, th))
1053                 goto clear_hash;
1054         if (tcp_md5_hash_key(hp, key))
1055                 goto clear_hash;
1056         if (crypto_hash_final(desc, md5_hash))
1057                 goto clear_hash;
1058
1059         tcp_put_md5sig_pool();
1060         return 0;
1061
1062 clear_hash:
1063         tcp_put_md5sig_pool();
1064 clear_hash_noput:
1065         memset(md5_hash, 0, 16);
1066         return 1;
1067 }
1068
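/* The RFC 2385 digest order the two hashing helpers above follow
 * (pseudo-header, fixed TCP header with the checksum zeroed, segment
 * data, then the key), sketched with OpenSSL's legacy MD5_* API from
 * <openssl/md5.h>.  For the header-only variant used for RSTs and
 * ACKs, pass plen = 0.  Assumes the fixed 20-byte header with the
 * checksum at offset 16; illustration only.
 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <openssl/md5.h>

struct pseudo4 {                /* mirrors struct tcp4_pseudohdr */
        uint32_t saddr, daddr;
        uint8_t  pad, protocol;
        uint16_t len;
};

static void tcp_md5_digest(unsigned char out[16], const struct pseudo4 *ph,
                           const void *th, const void *payload, size_t plen,
                           const void *key, size_t klen)
{
        unsigned char hdr[20];
        MD5_CTX ctx;

        memcpy(hdr, th, sizeof(hdr));           /* fixed TCP header     */
        hdr[16] = hdr[17] = 0;                  /* zero the checksum    */
        MD5_Init(&ctx);
        MD5_Update(&ctx, ph, sizeof(*ph));      /* 1. pseudo-header     */
        MD5_Update(&ctx, hdr, sizeof(hdr));     /* 2. TCP header        */
        if (plen)
                MD5_Update(&ctx, payload, plen);/* 3. segment data      */
        MD5_Update(&ctx, key, klen);            /* 4. connection key    */
        MD5_Final(out, &ctx);
}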
1069 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1070                         const struct sock *sk,
1071                         const struct sk_buff *skb)
1072 {
1073         struct tcp_md5sig_pool *hp;
1074         struct hash_desc *desc;
1075         const struct tcphdr *th = tcp_hdr(skb);
1076         __be32 saddr, daddr;
1077
1078         if (sk) { /* valid for establish/request sockets */
1079                 saddr = sk->sk_rcv_saddr;
1080                 daddr = sk->sk_daddr;
1081         } else {
1082                 const struct iphdr *iph = ip_hdr(skb);
1083                 saddr = iph->saddr;
1084                 daddr = iph->daddr;
1085         }
1086
1087         hp = tcp_get_md5sig_pool();
1088         if (!hp)
1089                 goto clear_hash_noput;
1090         desc = &hp->md5_desc;
1091
1092         if (crypto_hash_init(desc))
1093                 goto clear_hash;
1094
1095         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1096                 goto clear_hash;
1097         if (tcp_md5_hash_header(hp, th))
1098                 goto clear_hash;
1099         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1100                 goto clear_hash;
1101         if (tcp_md5_hash_key(hp, key))
1102                 goto clear_hash;
1103         if (crypto_hash_final(desc, md5_hash))
1104                 goto clear_hash;
1105
1106         tcp_put_md5sig_pool();
1107         return 0;
1108
1109 clear_hash:
1110         tcp_put_md5sig_pool();
1111 clear_hash_noput:
1112         memset(md5_hash, 0, 16);
1113         return 1;
1114 }
1115 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1116
1117 #endif
1118
1119 /* Called with rcu_read_lock() */
1120 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1121                                     const struct sk_buff *skb)
1122 {
1123 #ifdef CONFIG_TCP_MD5SIG
1124         /*
1125          * This gets called for each TCP segment that arrives
1126          * so we want to be efficient.
1127          * We have 3 drop cases:
1128          * o No MD5 hash and one expected.
1129          * o MD5 hash and we're not expecting one.
1130          * o MD5 hash and it's wrong.
1131          */
1132         const __u8 *hash_location = NULL;
1133         struct tcp_md5sig_key *hash_expected;
1134         const struct iphdr *iph = ip_hdr(skb);
1135         const struct tcphdr *th = tcp_hdr(skb);
1136         int genhash;
1137         unsigned char newhash[16];
1138
1139         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1140                                           AF_INET);
1141         hash_location = tcp_parse_md5sig_option(th);
1142
1143         /* We've parsed the options - do we have a hash? */
1144         if (!hash_expected && !hash_location)
1145                 return false;
1146
1147         if (hash_expected && !hash_location) {
1148                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1149                 return true;
1150         }
1151
1152         if (!hash_expected && hash_location) {
1153                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1154                 return true;
1155         }
1156
1157         /* Okay, so this is hash_expected and hash_location -
1158          * so we need to calculate the checksum.
1159          */
1160         genhash = tcp_v4_md5_hash_skb(newhash,
1161                                       hash_expected,
1162                                       NULL, skb);
1163
1164         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1165                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1166                                      &iph->saddr, ntohs(th->source),
1167                                      &iph->daddr, ntohs(th->dest),
1168                                      genhash ? " tcp_v4_calc_md5_hash failed"
1169                                      : "");
1170                 return true;
1171         }
1172         return false;
1173 #endif
1174         return false;
1175 }
1176
1177 static void tcp_v4_init_req(struct request_sock *req,
1178                             const struct sock *sk_listener,
1179                             struct sk_buff *skb)
1180 {
1181         struct inet_request_sock *ireq = inet_rsk(req);
1182
1183         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1184         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1185         ireq->no_srccheck = inet_sk(sk_listener)->transparent;
1186         ireq->opt = tcp_v4_save_options(skb);
1187 }
1188
1189 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1190                                           struct flowi *fl,
1191                                           const struct request_sock *req,
1192                                           bool *strict)
1193 {
1194         struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1195
1196         if (strict) {
1197                 if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1198                         *strict = true;
1199                 else
1200                         *strict = false;
1201         }
1202
1203         return dst;
1204 }
1205
1206 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1207         .family         =       PF_INET,
1208         .obj_size       =       sizeof(struct tcp_request_sock),
1209         .rtx_syn_ack    =       tcp_rtx_synack,
1210         .send_ack       =       tcp_v4_reqsk_send_ack,
1211         .destructor     =       tcp_v4_reqsk_destructor,
1212         .send_reset     =       tcp_v4_send_reset,
1213         .syn_ack_timeout =      tcp_syn_ack_timeout,
1214 };
1215
1216 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1217         .mss_clamp      =       TCP_MSS_DEFAULT,
1218 #ifdef CONFIG_TCP_MD5SIG
1219         .req_md5_lookup =       tcp_v4_md5_lookup,
1220         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1221 #endif
1222         .init_req       =       tcp_v4_init_req,
1223 #ifdef CONFIG_SYN_COOKIES
1224         .cookie_init_seq =      cookie_v4_init_sequence,
1225 #endif
1226         .route_req      =       tcp_v4_route_req,
1227         .init_seq       =       tcp_v4_init_sequence,
1228         .send_synack    =       tcp_v4_send_synack,
1229 };
1230
1231 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1232 {
1233         /* Never answer SYNs sent to broadcast or multicast addresses */
1234         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1235                 goto drop;
1236
1237         return tcp_conn_request(&tcp_request_sock_ops,
1238                                 &tcp_request_sock_ipv4_ops, sk, skb);
1239
1240 drop:
1241         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1242         return 0;
1243 }
1244 EXPORT_SYMBOL(tcp_v4_conn_request);
1245
1246
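/* The userspace counterpart of the passive side: SYNs that reach
 * tcp_v4_conn_request() target a socket set up roughly as below.
 * Minimal sketch; port 8080 and the backlog of 16 are arbitrary.
 */
#include <arpa/inet.h>
#include <sys/socket.h>
#include <unistd.h>

static int tcp_listen_example(void)
{
        struct sockaddr_in sin = { .sin_family = AF_INET };
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd < 0)
                return -1;
        sin.sin_port = htons(8080);
        /* a unicast local address: the rt_flags check above drops SYNs
         * arriving via broadcast or multicast routes
         */
        sin.sin_addr.s_addr = htonl(INADDR_ANY);
        if (bind(fd, (struct sockaddr *)&sin, sizeof(sin)) < 0 ||
            listen(fd, 16) < 0) {
                close(fd);
                return -1;
        }
        return accept(fd, NULL, NULL);  /* completes the 3-way handshake */
}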
1247 /*
1248  * The three way handshake has completed - we got a valid synack -
1249  * now create the new socket.
1250  */
1251 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1252                                   struct request_sock *req,
1253                                   struct dst_entry *dst,
1254                                   struct request_sock *req_unhash,
1255                                   bool *own_req)
1256 {
1257         struct inet_request_sock *ireq;
1258         struct inet_sock *newinet;
1259         struct tcp_sock *newtp;
1260         struct sock *newsk;
1261 #ifdef CONFIG_TCP_MD5SIG
1262         struct tcp_md5sig_key *key;
1263 #endif
1264         struct ip_options_rcu *inet_opt;
1265
1266         if (sk_acceptq_is_full(sk))
1267                 goto exit_overflow;
1268
1269         newsk = tcp_create_openreq_child(sk, req, skb);
1270         if (!newsk)
1271                 goto exit_nonewsk;
1272
1273         newsk->sk_gso_type = SKB_GSO_TCPV4;
1274         inet_sk_rx_dst_set(newsk, skb);
1275
1276         newtp                 = tcp_sk(newsk);
1277         newinet               = inet_sk(newsk);
1278         ireq                  = inet_rsk(req);
1279         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1280         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1281         newsk->sk_bound_dev_if = ireq->ir_iif;
1282         newinet->inet_saddr           = ireq->ir_loc_addr;
1283         inet_opt              = ireq->opt;
1284         rcu_assign_pointer(newinet->inet_opt, inet_opt);
1285         ireq->opt             = NULL;
1286         newinet->mc_index     = inet_iif(skb);
1287         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1288         newinet->rcv_tos      = ip_hdr(skb)->tos;
1289         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1290         if (inet_opt)
1291                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1292         newinet->inet_id = newtp->write_seq ^ jiffies;
1293
1294         if (!dst) {
1295                 dst = inet_csk_route_child_sock(sk, newsk, req);
1296                 if (!dst)
1297                         goto put_and_exit;
1298         } else {
1299                 /* syncookie case : see end of cookie_v4_check() */
1300         }
1301         sk_setup_caps(newsk, dst);
1302
1303         tcp_ca_openreq_child(newsk, dst);
1304
1305         tcp_sync_mss(newsk, dst_mtu(dst));
1306         newtp->advmss = dst_metric_advmss(dst);
1307         if (tcp_sk(sk)->rx_opt.user_mss &&
1308             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1309                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1310
1311         tcp_initialize_rcv_mss(newsk);
1312
1313 #ifdef CONFIG_TCP_MD5SIG
1314         /* Copy over the MD5 key from the original socket */
1315         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1316                                 AF_INET);
1317         if (key) {
1318                 /*
1319                  * We're using one, so create a matching key
1320                  * on the newsk structure. If we fail to get
1321                  * memory, then we end up not copying the key
1322                  * across. Shucks.
1323                  */
1324                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1325                                AF_INET, key->key, key->keylen, GFP_ATOMIC);
1326                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1327         }
1328 #endif
1329
1330         if (__inet_inherit_port(sk, newsk) < 0)
1331                 goto put_and_exit;
1332         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1333         if (*own_req)
1334                 tcp_move_syn(newtp, req);
1335
1336         return newsk;
1337
1338 exit_overflow:
1339         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1340 exit_nonewsk:
1341         dst_release(dst);
1342 exit:
1343         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1344         return NULL;
1345 put_and_exit:
1346         inet_csk_prepare_forced_close(newsk);
1347         tcp_done(newsk);
1348         goto exit;
1349 }
1350 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1351
1352 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1353 {
1354 #ifdef CONFIG_SYN_COOKIES
1355         const struct tcphdr *th = tcp_hdr(skb);
1356
1357         if (!th->syn)
1358                 sk = cookie_v4_check(sk, skb);
1359 #endif
1360         return sk;
1361 }
1362
1363 /* The socket must have its spinlock held when we get
1364  * here, unless it is a TCP_LISTEN socket.
1365  *
1366  * We have a potential double-lock case here, so even when
1367  * doing backlog processing we use the BH locking scheme.
1368  * This is because we cannot sleep with the original spinlock
1369  * held.
1370  */
1371 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1372 {
1373         struct sock *rsk;
1374
1375         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1376                 struct dst_entry *dst = sk->sk_rx_dst;
1377
1378                 sock_rps_save_rxhash(sk, skb);
1379                 sk_mark_napi_id(sk, skb);
1380                 if (dst) {
1381                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1382                             !dst->ops->check(dst, 0)) {
1383                                 dst_release(dst);
1384                                 sk->sk_rx_dst = NULL;
1385                         }
1386                 }
1387                 tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1388                 return 0;
1389         }
1390
1391         if (tcp_checksum_complete(skb))
1392                 goto csum_err;
1393
1394         if (sk->sk_state == TCP_LISTEN) {
1395                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1396
1397                 if (!nsk)
1398                         goto discard;
1399                 if (nsk != sk) {
1400                         sock_rps_save_rxhash(nsk, skb);
1401                         sk_mark_napi_id(nsk, skb);
1402                         if (tcp_child_process(sk, nsk, skb)) {
1403                                 rsk = nsk;
1404                                 goto reset;
1405                         }
1406                         return 0;
1407                 }
1408         } else
1409                 sock_rps_save_rxhash(sk, skb);
1410
1411         if (tcp_rcv_state_process(sk, skb)) {
1412                 rsk = sk;
1413                 goto reset;
1414         }
1415         return 0;
1416
1417 reset:
1418         tcp_v4_send_reset(rsk, skb);
1419 discard:
1420         kfree_skb(skb);
1421         /* Be careful here. If this function gets more complicated and
1422          * gcc suffers from register pressure on the x86, sk (in %ebx)
1423          * might be destroyed here. This current version compiles correctly,
1424          * but you have been warned.
1425          */
1426         return 0;
1427
1428 csum_err:
1429         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
1430         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1431         goto discard;
1432 }
1433 EXPORT_SYMBOL(tcp_v4_do_rcv);
1434
1435 void tcp_v4_early_demux(struct sk_buff *skb)
1436 {
1437         const struct iphdr *iph;
1438         const struct tcphdr *th;
1439         struct sock *sk;
1440
1441         if (skb->pkt_type != PACKET_HOST)
1442                 return;
1443
1444         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1445                 return;
1446
1447         iph = ip_hdr(skb);
1448         th = tcp_hdr(skb);
1449
1450         if (th->doff < sizeof(struct tcphdr) / 4)
1451                 return;
1452
1453         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1454                                        iph->saddr, th->source,
1455                                        iph->daddr, ntohs(th->dest),
1456                                        skb->skb_iif);
1457         if (sk) {
1458                 skb->sk = sk;
1459                 skb->destructor = sock_edemux;
1460                 if (sk_fullsock(sk)) {
1461                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1462
1463                         if (dst)
1464                                 dst = dst_check(dst, 0);
1465                         if (dst &&
1466                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1467                                 skb_dst_set_noref(skb, dst);
1468                 }
1469         }
1470 }
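/* Editor's note: early demux runs from the IP input path, before
 * routing.  Hitting an established socket here lets the stack reuse the
 * rx dst cached on that socket (skb_dst_set_noref() above) and spares
 * tcp_v4_rcv() a second hash lookup; packets that match no local socket
 * simply pay one extra lookup.
 */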
1471
1472 /* Packet is added to VJ-style prequeue for processing in process
1473  * context, if a reader task is waiting. Apparently, this exciting
1474  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1475  * failed somewhere. Latency? Burstiness? Well, at least now we will
1476  * see why it failed. 8)8)                                --ANK
1477  *
1478  */
1479 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1480 {
1481         struct tcp_sock *tp = tcp_sk(sk);
1482
1483         if (sysctl_tcp_low_latency || !tp->ucopy.task)
1484                 return false;
1485
1486         if (skb->len <= tcp_hdrlen(skb) &&
1487             skb_queue_len(&tp->ucopy.prequeue) == 0)
1488                 return false;
1489
1490         /* Before escaping RCU protected region, we need to take care of skb
1491          * dst. Prequeue is only enabled for established sockets.
1492  * For such sockets, we might need the skb dst only to set sk->sk_rx_dst.
1493          * Instead of doing full sk_rx_dst validity here, let's perform
1494          * an optimistic check.
1495          */
1496         if (likely(sk->sk_rx_dst))
1497                 skb_dst_drop(skb);
1498         else
1499                 skb_dst_force_safe(skb);
1500
1501         __skb_queue_tail(&tp->ucopy.prequeue, skb);
1502         tp->ucopy.memory += skb->truesize;
1503         if (tp->ucopy.memory > sk->sk_rcvbuf) {
1504                 struct sk_buff *skb1;
1505
1506                 BUG_ON(sock_owned_by_user(sk));
1507
1508                 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1509                         sk_backlog_rcv(sk, skb1);
1510                         NET_INC_STATS_BH(sock_net(sk),
1511                                          LINUX_MIB_TCPPREQUEUEDROPPED);
1512                 }
1513
1514                 tp->ucopy.memory = 0;
1515         } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1516                 wake_up_interruptible_sync_poll(sk_sleep(sk),
1517                                            POLLIN | POLLRDNORM | POLLRDBAND);
1518                 if (!inet_csk_ack_scheduled(sk))
1519                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1520                                                   (3 * tcp_rto_min(sk)) / 4,
1521                                                   TCP_RTO_MAX);
1522         }
1523         return true;
1524 }
1525 EXPORT_SYMBOL(tcp_prequeue);
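/* Editor's summary of the prequeue policy above:
 *  - skipped entirely when sysctl_tcp_low_latency is set or when no
 *    reader is blocked in recvmsg() (tp->ucopy.task is NULL);
 *  - a pure ACK (skb->len <= tcp_hdrlen(skb)) is not queued to an
 *    otherwise empty prequeue;
 *  - once the queued truesize exceeds sk->sk_rcvbuf, the whole prequeue
 *    is drained through sk_backlog_rcv() right here in softirq context;
 *  - the first skb queued wakes the reader and, if no ACK is already
 *    scheduled, arms a delayed-ACK timer at 3/4 of the minimum RTO.
 */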
1526
1527 /*
1528  *      From tcp_input.c
1529  */
1530
1531 int tcp_v4_rcv(struct sk_buff *skb)
1532 {
1533         const struct iphdr *iph;
1534         const struct tcphdr *th;
1535         struct sock *sk;
1536         int ret;
1537         struct net *net = dev_net(skb->dev);
1538
1539         if (skb->pkt_type != PACKET_HOST)
1540                 goto discard_it;
1541
1542         /* Count it even if it's bad */
1543         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1544
1545         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1546                 goto discard_it;
1547
1548         th = tcp_hdr(skb);
1549
1550         if (th->doff < sizeof(struct tcphdr) / 4)
1551                 goto bad_packet;
1552         if (!pskb_may_pull(skb, th->doff * 4))
1553                 goto discard_it;
1554
1555         /* An explanation is required here, I think.
1556          * Packet length and doff are validated by header prediction,
1557          * provided the case of th->doff==0 is eliminated.
1558          * So, we defer the checks. */
1559
1560         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1561                 goto csum_error;
1562
1563         th = tcp_hdr(skb);
1564         iph = ip_hdr(skb);
1565         /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1566  * barrier() makes sure the compiler won't play fool^Waliasing games.
1567          */
1568         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1569                 sizeof(struct inet_skb_parm));
1570         barrier();
1571
1572         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1573         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1574                                     skb->len - th->doff * 4);
1575         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1576         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1577         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1578         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1579         TCP_SKB_CB(skb)->sacked  = 0;
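        /* Editor's illustration of the sequence-space arithmetic above:
         * SYN and FIN each consume one sequence number on top of the
         * payload (skb->len - th->doff * 4), so a data-less SYN with
         * seq = 100 gets end_seq = 100 + 1 + 0 + 0 = 101, and a 500-byte
         * data segment with seq = 1000 gets end_seq = 1000 + 500 = 1500.
         */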
1580
1581 lookup:
1582         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1583         if (!sk)
1584                 goto no_tcp_socket;
1585
1586 process:
1587         if (sk->sk_state == TCP_TIME_WAIT)
1588                 goto do_time_wait;
1589
1590         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1591                 struct request_sock *req = inet_reqsk(sk);
1592                 struct sock *nsk = NULL;
1593
1594                 sk = req->rsk_listener;
1595                 if (tcp_v4_inbound_md5_hash(sk, skb))
1596                         goto discard_and_relse;
1597                 if (likely(sk->sk_state == TCP_LISTEN)) {
1598                         nsk = tcp_check_req(sk, skb, req, false);
1599                 } else {
1600                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1601                         goto lookup;
1602                 }
1603                 if (!nsk) {
1604                         reqsk_put(req);
1605                         goto discard_it;
1606                 }
1607                 if (nsk == sk) {
1608                         sock_hold(sk);
1609                         reqsk_put(req);
1610                 } else if (tcp_child_process(sk, nsk, skb)) {
1611                         tcp_v4_send_reset(nsk, skb);
1612                         goto discard_it;
1613                 } else {
1614                         return 0;
1615                 }
1616         }
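        /* Editor's note on the block above: the lookup can return a
         * request sock (TCP_NEW_SYN_RECV) rather than a full socket.  We
         * switch to its listener, and tcp_check_req() either creates the
         * child, hands the packet back to the listener itself (nsk == sk,
         * e.g. for a retransmitted SYN), or rejects it.  If the listener
         * stopped listening in the meantime, the request is dropped and
         * the lookup is retried from scratch.
         */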
1617         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1618                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1619                 goto discard_and_relse;
1620         }
1621
1622         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1623                 goto discard_and_relse;
1624
1625         if (tcp_v4_inbound_md5_hash(sk, skb))
1626                 goto discard_and_relse;
1627
1628         nf_reset(skb);
1629
1630         if (sk_filter(sk, skb))
1631                 goto discard_and_relse;
1632
1633         skb->dev = NULL;
1634
1635         if (sk->sk_state == TCP_LISTEN) {
1636                 ret = tcp_v4_do_rcv(sk, skb);
1637                 goto put_and_return;
1638         }
1639
1640         sk_incoming_cpu_update(sk);
1641
1642         bh_lock_sock_nested(sk);
1643         tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
1644         ret = 0;
1645         if (!sock_owned_by_user(sk)) {
1646                 if (!tcp_prequeue(sk, skb))
1647                         ret = tcp_v4_do_rcv(sk, skb);
1648         } else if (unlikely(sk_add_backlog(sk, skb,
1649                                            sk->sk_rcvbuf + sk->sk_sndbuf))) {
1650                 bh_unlock_sock(sk);
1651                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1652                 goto discard_and_relse;
1653         }
1654         bh_unlock_sock(sk);
1655
1656 put_and_return:
1657         sock_put(sk);
1658
1659         return ret;
1660
1661 no_tcp_socket:
1662         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1663                 goto discard_it;
1664
1665         if (tcp_checksum_complete(skb)) {
1666 csum_error:
1667                 TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
1668 bad_packet:
1669                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1670         } else {
1671                 tcp_v4_send_reset(NULL, skb);
1672         }
1673
1674 discard_it:
1675         /* Discard frame. */
1676         kfree_skb(skb);
1677         return 0;
1678
1679 discard_and_relse:
1680         sock_put(sk);
1681         goto discard_it;
1682
1683 do_time_wait:
1684         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1685                 inet_twsk_put(inet_twsk(sk));
1686                 goto discard_it;
1687         }
1688
1689         if (tcp_checksum_complete(skb)) {
1690                 inet_twsk_put(inet_twsk(sk));
1691                 goto csum_error;
1692         }
1693         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1694         case TCP_TW_SYN: {
1695                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1696                                                         &tcp_hashinfo,
1697                                                         iph->saddr, th->source,
1698                                                         iph->daddr, th->dest,
1699                                                         inet_iif(skb));
1700                 if (sk2) {
1701                         inet_twsk_deschedule_put(inet_twsk(sk));
1702                         sk = sk2;
1703                         goto process;
1704                 }
1705                 /* Fall through to ACK */
1706         }
1707         case TCP_TW_ACK:
1708                 tcp_v4_timewait_ack(sk, skb);
1709                 break;
1710         case TCP_TW_RST:
1711                 tcp_v4_send_reset(sk, skb);
1712                 inet_twsk_deschedule_put(inet_twsk(sk));
1713                 goto discard_it;
1714         case TCP_TW_SUCCESS:;
1715         }
1716         goto discard_it;
1717 }
1718
1719 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1720         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1721         .twsk_unique    = tcp_twsk_unique,
1722         .twsk_destructor = tcp_twsk_destructor,
1723 };
1724
1725 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1726 {
1727         struct dst_entry *dst = skb_dst(skb);
1728
1729         if (dst && dst_hold_safe(dst)) {
1730                 sk->sk_rx_dst = dst;
1731                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1732         }
1733 }
1734 EXPORT_SYMBOL(inet_sk_rx_dst_set);
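/* Editor's note: this caches the input route on the socket once it is
 * known.  The fast path in tcp_v4_do_rcv() and tcp_v4_early_demux()
 * above both revalidate the cached dst against skb->skb_iif and a
 * dst_check() before using it, releasing it if the packet came in on a
 * different interface or the route has since been invalidated.
 */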
1735
1736 const struct inet_connection_sock_af_ops ipv4_specific = {
1737         .queue_xmit        = ip_queue_xmit,
1738         .send_check        = tcp_v4_send_check,
1739         .rebuild_header    = inet_sk_rebuild_header,
1740         .sk_rx_dst_set     = inet_sk_rx_dst_set,
1741         .conn_request      = tcp_v4_conn_request,
1742         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1743         .net_header_len    = sizeof(struct iphdr),
1744         .setsockopt        = ip_setsockopt,
1745         .getsockopt        = ip_getsockopt,
1746         .addr2sockaddr     = inet_csk_addr2sockaddr,
1747         .sockaddr_len      = sizeof(struct sockaddr_in),
1748         .bind_conflict     = inet_csk_bind_conflict,
1749 #ifdef CONFIG_COMPAT
1750         .compat_setsockopt = compat_ip_setsockopt,
1751         .compat_getsockopt = compat_ip_getsockopt,
1752 #endif
1753         .mtu_reduced       = tcp_v4_mtu_reduced,
1754 };
1755 EXPORT_SYMBOL(ipv4_specific);
1756
1757 #ifdef CONFIG_TCP_MD5SIG
1758 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1759         .md5_lookup             = tcp_v4_md5_lookup,
1760         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1761         .md5_parse              = tcp_v4_parse_md5_keys,
1762 };
1763 #endif
1764
1765 /* NOTE: A lot of things are set to zero explicitly by the call to
1766  *       sk_alloc(), so they need not be done here.
1767  */
1768 static int tcp_v4_init_sock(struct sock *sk)
1769 {
1770         struct inet_connection_sock *icsk = inet_csk(sk);
1771
1772         tcp_init_sock(sk);
1773
1774         icsk->icsk_af_ops = &ipv4_specific;
1775
1776 #ifdef CONFIG_TCP_MD5SIG
1777         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1778 #endif
1779
1780         return 0;
1781 }
1782
1783 void tcp_v4_destroy_sock(struct sock *sk)
1784 {
1785         struct tcp_sock *tp = tcp_sk(sk);
1786
1787         tcp_clear_xmit_timers(sk);
1788
1789         tcp_cleanup_congestion_control(sk);
1790
1791         /* Clean up the write buffer. */
1792         tcp_write_queue_purge(sk);
1793
1794         /* Cleans up our, hopefully empty, out_of_order_queue. */
1795         __skb_queue_purge(&tp->out_of_order_queue);
1796
1797 #ifdef CONFIG_TCP_MD5SIG
1798         /* Clean up the MD5 key list, if any */
1799         if (tp->md5sig_info) {
1800                 tcp_clear_md5_list(sk);
1801                 kfree_rcu(tp->md5sig_info, rcu);
1802                 tp->md5sig_info = NULL;
1803         }
1804 #endif
1805
1806         /* Clean up the prequeue; it really should be empty by now. */
1807         __skb_queue_purge(&tp->ucopy.prequeue);
1808
1809         /* Clean up a referenced TCP bind bucket. */
1810         if (inet_csk(sk)->icsk_bind_hash)
1811                 inet_put_port(sk);
1812
1813         BUG_ON(tp->fastopen_rsk);
1814
1815         /* If socket is aborted during connect operation */
1816         tcp_free_fastopen_req(tp);
1817         tcp_saved_syn_free(tp);
1818
1819         sk_sockets_allocated_dec(sk);
1820
1821         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
1822                 sock_release_memcg(sk);
1823 }
1824 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1825
1826 #ifdef CONFIG_PROC_FS
1827 /* Proc filesystem TCP sock list dumping. */
1828
1829 /*
1830  * Get the next listener socket following cur.  If cur is NULL, get the
1831  * first socket starting from the bucket given in st->bucket; when
1832  * st->bucket is zero the very first socket in the hash table is
1833  * returned.
1833  */
1834 static void *listening_get_next(struct seq_file *seq, void *cur)
1835 {
1837         struct hlist_nulls_node *node;
1838         struct sock *sk = cur;
1839         struct inet_listen_hashbucket *ilb;
1840         struct tcp_iter_state *st = seq->private;
1841         struct net *net = seq_file_net(seq);
1842
1843         if (!sk) {
1844                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1845                 spin_lock_bh(&ilb->lock);
1846                 sk = sk_nulls_head(&ilb->head);
1847                 st->offset = 0;
1848                 goto get_sk;
1849         }
1850         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1851         ++st->num;
1852         ++st->offset;
1853
1854         sk = sk_nulls_next(sk);
1855 get_sk:
1856         sk_nulls_for_each_from(sk, node) {
1857                 if (!net_eq(sock_net(sk), net))
1858                         continue;
1859                 if (sk->sk_family == st->family) {
1860                         cur = sk;
1861                         goto out;
1862                 }
1864         }
1865         spin_unlock_bh(&ilb->lock);
1866         st->offset = 0;
1867         if (++st->bucket < INET_LHTABLE_SIZE) {
1868                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1869                 spin_lock_bh(&ilb->lock);
1870                 sk = sk_nulls_head(&ilb->head);
1871                 goto get_sk;
1872         }
1873         cur = NULL;
1874 out:
1875         return cur;
1876 }
1877
1878 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1879 {
1880         struct tcp_iter_state *st = seq->private;
1881         void *rc;
1882
1883         st->bucket = 0;
1884         st->offset = 0;
1885         rc = listening_get_next(seq, NULL);
1886
1887         while (rc && *pos) {
1888                 rc = listening_get_next(seq, rc);
1889                 --*pos;
1890         }
1891         return rc;
1892 }
1893
1894 static inline bool empty_bucket(const struct tcp_iter_state *st)
1895 {
1896         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1897 }
1898
1899 /*
1900  * Get first established socket starting from bucket given in st->bucket.
1901  * If st->bucket is zero, the very first socket in the hash is returned.
1902  */
1903 static void *established_get_first(struct seq_file *seq)
1904 {
1905         struct tcp_iter_state *st = seq->private;
1906         struct net *net = seq_file_net(seq);
1907         void *rc = NULL;
1908
1909         st->offset = 0;
1910         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1911                 struct sock *sk;
1912                 struct hlist_nulls_node *node;
1913                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1914
1915                 /* Lockless fast path for the common case of empty buckets */
1916                 if (empty_bucket(st))
1917                         continue;
1918
1919                 spin_lock_bh(lock);
1920                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1921                         if (sk->sk_family != st->family ||
1922                             !net_eq(sock_net(sk), net)) {
1923                                 continue;
1924                         }
1925                         rc = sk;
1926                         goto out;
1927                 }
1928                 spin_unlock_bh(lock);
1929         }
1930 out:
1931         return rc;
1932 }
1933
1934 static void *established_get_next(struct seq_file *seq, void *cur)
1935 {
1936         struct sock *sk = cur;
1937         struct hlist_nulls_node *node;
1938         struct tcp_iter_state *st = seq->private;
1939         struct net *net = seq_file_net(seq);
1940
1941         ++st->num;
1942         ++st->offset;
1943
1944         sk = sk_nulls_next(sk);
1945
1946         sk_nulls_for_each_from(sk, node) {
1947                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1948                         return sk;
1949         }
1950
1951         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1952         ++st->bucket;
1953         return established_get_first(seq);
1954 }
1955
1956 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1957 {
1958         struct tcp_iter_state *st = seq->private;
1959         void *rc;
1960
1961         st->bucket = 0;
1962         rc = established_get_first(seq);
1963
1964         while (rc && pos) {
1965                 rc = established_get_next(seq, rc);
1966                 --pos;
1967         }
1968         return rc;
1969 }
1970
1971 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1972 {
1973         void *rc;
1974         struct tcp_iter_state *st = seq->private;
1975
1976         st->state = TCP_SEQ_STATE_LISTENING;
1977         rc        = listening_get_idx(seq, &pos);
1978
1979         if (!rc) {
1980                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1981                 rc        = established_get_idx(seq, pos);
1982         }
1983
1984         return rc;
1985 }
1986
1987 static void *tcp_seek_last_pos(struct seq_file *seq)
1988 {
1989         struct tcp_iter_state *st = seq->private;
1990         int offset = st->offset;
1991         int orig_num = st->num;
1992         void *rc = NULL;
1993
1994         switch (st->state) {
1995         case TCP_SEQ_STATE_LISTENING:
1996                 if (st->bucket >= INET_LHTABLE_SIZE)
1997                         break;
1998                 st->state = TCP_SEQ_STATE_LISTENING;
1999                 rc = listening_get_next(seq, NULL);
2000                 while (offset-- && rc)
2001                         rc = listening_get_next(seq, rc);
2002                 if (rc)
2003                         break;
2004                 st->bucket = 0;
2005                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2006                 /* Fallthrough */
2007         case TCP_SEQ_STATE_ESTABLISHED:
2008                 if (st->bucket > tcp_hashinfo.ehash_mask)
2009                         break;
2010                 rc = established_get_first(seq);
2011                 while (offset-- && rc)
2012                         rc = established_get_next(seq, rc);
2013         }
2014
2015         st->num = orig_num;
2016
2017         return rc;
2018 }
2019
2020 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2021 {
2022         struct tcp_iter_state *st = seq->private;
2023         void *rc;
2024
2025         if (*pos && *pos == st->last_pos) {
2026                 rc = tcp_seek_last_pos(seq);
2027                 if (rc)
2028                         goto out;
2029         }
2030
2031         st->state = TCP_SEQ_STATE_LISTENING;
2032         st->num = 0;
2033         st->bucket = 0;
2034         st->offset = 0;
2035         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2036
2037 out:
2038         st->last_pos = *pos;
2039         return rc;
2040 }
2041
2042 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2043 {
2044         struct tcp_iter_state *st = seq->private;
2045         void *rc = NULL;
2046
2047         if (v == SEQ_START_TOKEN) {
2048                 rc = tcp_get_idx(seq, 0);
2049                 goto out;
2050         }
2051
2052         switch (st->state) {
2053         case TCP_SEQ_STATE_LISTENING:
2054                 rc = listening_get_next(seq, v);
2055                 if (!rc) {
2056                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2057                         st->bucket = 0;
2058                         st->offset = 0;
2059                         rc        = established_get_first(seq);
2060                 }
2061                 break;
2062         case TCP_SEQ_STATE_ESTABLISHED:
2063                 rc = established_get_next(seq, v);
2064                 break;
2065         }
2066 out:
2067         ++*pos;
2068         st->last_pos = *pos;
2069         return rc;
2070 }
2071
2072 static void tcp_seq_stop(struct seq_file *seq, void *v)
2073 {
2074         struct tcp_iter_state *st = seq->private;
2075
2076         switch (st->state) {
2077         case TCP_SEQ_STATE_LISTENING:
2078                 if (v != SEQ_START_TOKEN)
2079                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2080                 break;
2081         case TCP_SEQ_STATE_ESTABLISHED:
2082                 if (v)
2083                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2084                 break;
2085         }
2086 }
2087
2088 int tcp_seq_open(struct inode *inode, struct file *file)
2089 {
2090         struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2091         struct tcp_iter_state *s;
2092         int err;
2093
2094         err = seq_open_net(inode, file, &afinfo->seq_ops,
2095                           sizeof(struct tcp_iter_state));
2096         if (err < 0)
2097                 return err;
2098
2099         s = ((struct seq_file *)file->private_data)->private;
2100         s->family               = afinfo->family;
2101         s->last_pos             = 0;
2102         return 0;
2103 }
2104 EXPORT_SYMBOL(tcp_seq_open);
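/* Editor's sketch of how the seq_file core drives the iterators wired
 * up below on every read of /proc/net/tcp (simplified, error handling
 * omitted):
 *
 *	p = start(seq, &pos);		   // tcp_seq_start
 *	while (p) {
 *		show(seq, p);		   // tcp4_seq_show
 *		p = next(seq, p, &pos);	   // tcp_seq_next
 *	}
 *	stop(seq, p);			   // tcp_seq_stop drops bucket locks
 *
 * st->last_pos lets the next read resume near the previous bucket via
 * tcp_seek_last_pos() instead of rescanning the hash tables from zero.
 */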
2105
2106 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2107 {
2108         int rc = 0;
2109         struct proc_dir_entry *p;
2110
2111         afinfo->seq_ops.start           = tcp_seq_start;
2112         afinfo->seq_ops.next            = tcp_seq_next;
2113         afinfo->seq_ops.stop            = tcp_seq_stop;
2114
2115         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2116                              afinfo->seq_fops, afinfo);
2117         if (!p)
2118                 rc = -ENOMEM;
2119         return rc;
2120 }
2121 EXPORT_SYMBOL(tcp_proc_register);
2122
2123 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2124 {
2125         remove_proc_entry(afinfo->name, net->proc_net);
2126 }
2127 EXPORT_SYMBOL(tcp_proc_unregister);
2128
2129 static void get_openreq4(const struct request_sock *req,
2130                          struct seq_file *f, int i)
2131 {
2132         const struct inet_request_sock *ireq = inet_rsk(req);
2133         long delta = req->rsk_timer.expires - jiffies;
2134
2135         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2136                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2137                 i,
2138                 ireq->ir_loc_addr,
2139                 ireq->ir_num,
2140                 ireq->ir_rmt_addr,
2141                 ntohs(ireq->ir_rmt_port),
2142                 TCP_SYN_RECV,
2143                 0, 0, /* could print option size, but that is af dependent. */
2144                 1,    /* timers active (only the expire timer) */
2145                 jiffies_delta_to_clock_t(delta),
2146                 req->num_timeout,
2147                 from_kuid_munged(seq_user_ns(f),
2148                                  sock_i_uid(req->rsk_listener)),
2149                 0,  /* non standard timer */
2150                 0, /* open_requests have no inode */
2151                 0,
2152                 req);
2153 }
2154
2155 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2156 {
2157         int timer_active;
2158         unsigned long timer_expires;
2159         const struct tcp_sock *tp = tcp_sk(sk);
2160         const struct inet_connection_sock *icsk = inet_csk(sk);
2161         const struct inet_sock *inet = inet_sk(sk);
2162         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2163         __be32 dest = inet->inet_daddr;
2164         __be32 src = inet->inet_rcv_saddr;
2165         __u16 destp = ntohs(inet->inet_dport);
2166         __u16 srcp = ntohs(inet->inet_sport);
2167         int rx_queue;
2168         int state;
2169
2170         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2171             icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2172             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2173                 timer_active    = 1;
2174                 timer_expires   = icsk->icsk_timeout;
2175         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2176                 timer_active    = 4;
2177                 timer_expires   = icsk->icsk_timeout;
2178         } else if (timer_pending(&sk->sk_timer)) {
2179                 timer_active    = 2;
2180                 timer_expires   = sk->sk_timer.expires;
2181         } else {
2182                 timer_active    = 0;
2183                 timer_expires = jiffies;
2184         }
2185
2186         state = sk_state_load(sk);
2187         if (state == TCP_LISTEN)
2188                 rx_queue = sk->sk_ack_backlog;
2189         else
2190                 /* Because we don't lock the socket,
2191                  * we might find a transient negative value.
2192                  */
2193                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2194
2195         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2196                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2197                 i, src, srcp, dest, destp, state,
2198                 tp->write_seq - tp->snd_una,
2199                 rx_queue,
2200                 timer_active,
2201                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2202                 icsk->icsk_retransmits,
2203                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2204                 icsk->icsk_probes_out,
2205                 sock_i_ino(sk),
2206                 atomic_read(&sk->sk_refcnt), sk,
2207                 jiffies_to_clock_t(icsk->icsk_rto),
2208                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2209                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2210                 tp->snd_cwnd,
2211                 state == TCP_LISTEN ?
2212                     fastopenq->max_qlen :
2213                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2214 }
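/* Editor's example of a line the function above emits (hypothetical
 * values, little-endian host):
 *
 *    0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 ...
 *
 * "0100007F:0016" is 127.0.0.1:22 -- the address is the raw __be32
 * printed with %08X, so its bytes appear reversed on little-endian
 * machines, while the port has already been byte-swapped by ntohs().
 * "0A" is the socket state, here TCP_LISTEN (10).
 */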
2215
2216 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2217                                struct seq_file *f, int i)
2218 {
2219         long delta = tw->tw_timer.expires - jiffies;
2220         __be32 dest, src;
2221         __u16 destp, srcp;
2222
2223         dest  = tw->tw_daddr;
2224         src   = tw->tw_rcv_saddr;
2225         destp = ntohs(tw->tw_dport);
2226         srcp  = ntohs(tw->tw_sport);
2227
2228         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2229                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2230                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2231                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2232                 atomic_read(&tw->tw_refcnt), tw);
2233 }
2234
2235 #define TMPSZ 150
2236
2237 static int tcp4_seq_show(struct seq_file *seq, void *v)
2238 {
2239         struct tcp_iter_state *st;
2240         struct sock *sk = v;
2241
2242         seq_setwidth(seq, TMPSZ - 1);
2243         if (v == SEQ_START_TOKEN) {
2244                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2245                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2246                            "inode");
2247                 goto out;
2248         }
2249         st = seq->private;
2250
2251         if (sk->sk_state == TCP_TIME_WAIT)
2252                 get_timewait4_sock(v, seq, st->num);
2253         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2254                 get_openreq4(v, seq, st->num);
2255         else
2256                 get_tcp4_sock(v, seq, st->num);
2257 out:
2258         seq_pad(seq, '\n');
2259         return 0;
2260 }
2261
2262 static const struct file_operations tcp_afinfo_seq_fops = {
2263         .owner   = THIS_MODULE,
2264         .open    = tcp_seq_open,
2265         .read    = seq_read,
2266         .llseek  = seq_lseek,
2267         .release = seq_release_net
2268 };
2269
2270 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2271         .name           = "tcp",
2272         .family         = AF_INET,
2273         .seq_fops       = &tcp_afinfo_seq_fops,
2274         .seq_ops        = {
2275                 .show           = tcp4_seq_show,
2276         },
2277 };
2278
2279 static int __net_init tcp4_proc_init_net(struct net *net)
2280 {
2281         return tcp_proc_register(net, &tcp4_seq_afinfo);
2282 }
2283
2284 static void __net_exit tcp4_proc_exit_net(struct net *net)
2285 {
2286         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2287 }
2288
2289 static struct pernet_operations tcp4_net_ops = {
2290         .init = tcp4_proc_init_net,
2291         .exit = tcp4_proc_exit_net,
2292 };
2293
2294 int __init tcp4_proc_init(void)
2295 {
2296         return register_pernet_subsys(&tcp4_net_ops);
2297 }
2298
2299 void tcp4_proc_exit(void)
2300 {
2301         unregister_pernet_subsys(&tcp4_net_ops);
2302 }
2303 #endif /* CONFIG_PROC_FS */
2304
2305 struct proto tcp_prot = {
2306         .name                   = "TCP",
2307         .owner                  = THIS_MODULE,
2308         .close                  = tcp_close,
2309         .connect                = tcp_v4_connect,
2310         .disconnect             = tcp_disconnect,
2311         .accept                 = inet_csk_accept,
2312         .ioctl                  = tcp_ioctl,
2313         .init                   = tcp_v4_init_sock,
2314         .destroy                = tcp_v4_destroy_sock,
2315         .shutdown               = tcp_shutdown,
2316         .setsockopt             = tcp_setsockopt,
2317         .getsockopt             = tcp_getsockopt,
2318         .recvmsg                = tcp_recvmsg,
2319         .sendmsg                = tcp_sendmsg,
2320         .sendpage               = tcp_sendpage,
2321         .backlog_rcv            = tcp_v4_do_rcv,
2322         .release_cb             = tcp_release_cb,
2323         .hash                   = inet_hash,
2324         .unhash                 = inet_unhash,
2325         .get_port               = inet_csk_get_port,
2326         .enter_memory_pressure  = tcp_enter_memory_pressure,
2327         .stream_memory_free     = tcp_stream_memory_free,
2328         .sockets_allocated      = &tcp_sockets_allocated,
2329         .orphan_count           = &tcp_orphan_count,
2330         .memory_allocated       = &tcp_memory_allocated,
2331         .memory_pressure        = &tcp_memory_pressure,
2332         .sysctl_mem             = sysctl_tcp_mem,
2333         .sysctl_wmem            = sysctl_tcp_wmem,
2334         .sysctl_rmem            = sysctl_tcp_rmem,
2335         .max_header             = MAX_TCP_HEADER,
2336         .obj_size               = sizeof(struct tcp_sock),
2337         .slab_flags             = SLAB_DESTROY_BY_RCU,
2338         .twsk_prot              = &tcp_timewait_sock_ops,
2339         .rsk_prot               = &tcp_request_sock_ops,
2340         .h.hashinfo             = &tcp_hashinfo,
2341         .no_autobind            = true,
2342 #ifdef CONFIG_COMPAT
2343         .compat_setsockopt      = compat_tcp_setsockopt,
2344         .compat_getsockopt      = compat_tcp_getsockopt,
2345 #endif
2346         .diag_destroy           = tcp_abort,
2347 };
2348 EXPORT_SYMBOL(tcp_prot);
2349
2350 static void __net_exit tcp_sk_exit(struct net *net)
2351 {
2352         int cpu;
2353
2354         for_each_possible_cpu(cpu)
2355                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2356         free_percpu(net->ipv4.tcp_sk);
2357 }
2358
2359 static int __net_init tcp_sk_init(struct net *net)
2360 {
2361         int res, cpu;
2362
2363         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2364         if (!net->ipv4.tcp_sk)
2365                 return -ENOMEM;
2366
2367         for_each_possible_cpu(cpu) {
2368                 struct sock *sk;
2369
2370                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2371                                            IPPROTO_TCP, net);
2372                 if (res)
2373                         goto fail;
2374                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2375         }
2376
2377         net->ipv4.sysctl_tcp_ecn = 2;
2378         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2379
2380         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2381         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2382         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2383
2384         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2385         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2386         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2387
2388         return 0;
2389 fail:
2390         tcp_sk_exit(net);
2391
2392         return res;
2393 }
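/* Editor's note: tcp_sk_init() runs once per network namespace.  The
 * per-cpu control sockets created above are what tcp_v4_send_reset()
 * and tcp_v4_send_ack() transmit RSTs and ACKs from without contending
 * on a shared socket, and the values seeded here back the per-namespace
 * sysctl knobs under /proc/sys/net/ipv4/.
 */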
2394
2395 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2396 {
2397         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2398 }
2399
2400 static struct pernet_operations __net_initdata tcp_sk_ops = {
2401        .init       = tcp_sk_init,
2402        .exit       = tcp_sk_exit,
2403        .exit_batch = tcp_sk_exit_batch,
2404 };
2405
2406 void __init tcp_v4_init(void)
2407 {
2408         inet_hashinfo_init(&tcp_hashinfo);
2409         if (register_pernet_subsys(&tcp_sk_ops))
2410                 panic("Failed to create the TCP control socket.\n");
2411 }