2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * IPv4 specific functions
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
16 * See tcp.c for author information
18 * This program is free software; you can redistribute it and/or
19 * modify it under the terms of the GNU General Public License
20 * as published by the Free Software Foundation; either version
21 * 2 of the License, or (at your option) any later version.
26 * David S. Miller : New socket lookup architecture.
27 * This code is dedicated to John Dyson.
28 * David S. Miller : Change semantics of established hash,
29 * half is devoted to TIME_WAIT sockets
30 * and the rest go in the other half.
31 * Andi Kleen : Add support for syncookies and fixed
32 * some bugs: ip options weren't passed to
33 * the TCP layer, missed a check for an ACK bit.
35 * Andi Kleen : Implemented fast path mtu discovery.
36 * Fixed many serious bugs in the
37 * request_sock handling and moved
38 * most of it into the af independent code.
39 * Added tail drop and some other bugfixes.
40 * Added new listen semantics.
41 * Mike McLagan : Routing by source
42 * Juan Jose Ciarlante: ip_dynaddr bits
43 * Andi Kleen: various fixes.
44 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
46 * Andi Kleen : Fix new listen.
47 * Andi Kleen : Fix accept error reporting.
48 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
49 * Alexey Kuznetsov allows both IPv4 and IPv6 sockets to bind
50 * a single port at the same time.
53 #define pr_fmt(fmt) "TCP: " fmt
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
66 #include <net/net_namespace.h>
68 #include <net/inet_hashtables.h>
70 #include <net/transp_v6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
75 #include <net/netdma.h>
76 #include <net/secure_seq.h>
77 #include <net/tcp_memcontrol.h>
79 #include <linux/inet.h>
80 #include <linux/ipv6.h>
81 #include <linux/stddef.h>
82 #include <linux/proc_fs.h>
83 #include <linux/seq_file.h>
85 #include <linux/crypto.h>
86 #include <linux/scatterlist.h>
88 int sysctl_tcp_tw_reuse __read_mostly;
89 int sysctl_tcp_low_latency __read_mostly;
90 EXPORT_SYMBOL(sysctl_tcp_low_latency);
93 #ifdef CONFIG_TCP_MD5SIG
94 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
95 __be32 daddr, __be32 saddr, const struct tcphdr *th);
98 struct inet_hashinfo tcp_hashinfo;
99 EXPORT_SYMBOL(tcp_hashinfo);
101 static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
103 return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
104 ip_hdr(skb)->saddr,
105 tcp_hdr(skb)->dest,
106 tcp_hdr(skb)->source);
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
111 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
112 struct tcp_sock *tp = tcp_sk(sk);
114 /* With PAWS, it is safe from the viewpoint
115 of data integrity. Even without PAWS it is safe provided sequence
116 spaces do not overlap, i.e. at data rates <= 80 Mbit/sec: at that rate
117 (10 Mbyte/sec) the 2^32-byte sequence space wraps only after ~429 sec,
118 comfortably longer than 2*MSL. Actually, the idea is close to VJ's one:
119 only the timestamp cache is held not per host, but per port pair, and
120 the TW bucket is used as state holder.
122 If the TW bucket has already been destroyed we fall back to VJ's scheme
123 and use the initial timestamp retrieved from the peer table.
125 if (tcptw->tw_ts_recent_stamp &&
126 (twp == NULL || (sysctl_tcp_tw_reuse &&
127 get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
128 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
129 if (tp->write_seq == 0)
130 tp->write_seq = 1;
131 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
132 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
139 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
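/*
 * Example (illustrative userspace sketch, not part of this file): the
 * reuse path above is only taken when the net.ipv4.tcp_tw_reuse sysctl
 * is enabled. A minimal way to flip it from C, assuming procfs is
 * mounted; the helper name is hypothetical:
 */
#if 0	/* compiled out: standalone example */
#include <stdio.h>

static int enable_tcp_tw_reuse(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_tw_reuse", "w");

	if (!f)
		return -1;
	fputs("1", f);			/* allow reuse of TIME-WAIT sockets */
	return fclose(f);
}
#endif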
141 static int tcp_repair_connect(struct sock *sk)
143 tcp_connect_init(sk);
144 tcp_finish_connect(sk, NULL);
149 /* This will initiate an outgoing connection. */
150 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
152 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
153 struct inet_sock *inet = inet_sk(sk);
154 struct tcp_sock *tp = tcp_sk(sk);
155 __be16 orig_sport, orig_dport;
156 __be32 daddr, nexthop;
160 struct ip_options_rcu *inet_opt;
162 if (addr_len < sizeof(struct sockaddr_in))
165 if (usin->sin_family != AF_INET)
166 return -EAFNOSUPPORT;
168 nexthop = daddr = usin->sin_addr.s_addr;
169 inet_opt = rcu_dereference_protected(inet->inet_opt,
170 sock_owned_by_user(sk));
171 if (inet_opt && inet_opt->opt.srr) {
174 nexthop = inet_opt->opt.faddr;
177 orig_sport = inet->inet_sport;
178 orig_dport = usin->sin_port;
179 fl4 = &inet->cork.fl.u.ip4;
180 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
181 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
183 orig_sport, orig_dport, sk, true);
186 if (err == -ENETUNREACH)
187 IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
191 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
196 if (!inet_opt || !inet_opt->opt.srr)
199 if (!inet->inet_saddr)
200 inet->inet_saddr = fl4->saddr;
201 inet->inet_rcv_saddr = inet->inet_saddr;
203 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
204 /* Reset inherited state */
205 tp->rx_opt.ts_recent = 0;
206 tp->rx_opt.ts_recent_stamp = 0;
207 if (likely(!tp->repair))
211 if (tcp_death_row.sysctl_tw_recycle &&
212 !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) {
213 struct inet_peer *peer = rt_get_peer(rt, fl4->daddr);
215 * VJ's idea. We save last timestamp seen from
216 * the destination in peer table, when entering state
217 * TIME-WAIT, and initialize rx_opt.ts_recent from it,
218 * when trying new connection.
221 inet_peer_refcheck(peer);
222 if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
223 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
224 tp->rx_opt.ts_recent = peer->tcp_ts;
229 inet->inet_dport = usin->sin_port;
230 inet->inet_daddr = daddr;
232 inet_csk(sk)->icsk_ext_hdr_len = 0;
233 if (inet_opt)
234 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
236 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
238 /* Socket identity is still unknown (sport may be zero).
239 * However we set state to SYN-SENT and, without releasing the socket
240 * lock, select a source port, enter ourselves into the hash tables and
241 * complete initialization after this.
243 tcp_set_state(sk, TCP_SYN_SENT);
244 err = inet_hash_connect(&tcp_death_row, sk);
248 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
249 inet->inet_sport, inet->inet_dport, sk);
255 /* OK, now commit destination to socket. */
256 sk->sk_gso_type = SKB_GSO_TCPV4;
257 sk_setup_caps(sk, &rt->dst);
259 if (!tp->write_seq && likely(!tp->repair))
260 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
265 inet->inet_id = tp->write_seq ^ jiffies;
267 if (likely(!tp->repair))
268 err = tcp_connect(sk);
270 err = tcp_repair_connect(sk);
280 * This unhashes the socket and releases the local port,
281 * if necessary.
283 tcp_set_state(sk, TCP_CLOSE);
285 sk->sk_route_caps = 0;
286 inet->inet_dport = 0;
289 EXPORT_SYMBOL(tcp_v4_connect);
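/*
 * Example (illustrative userspace sketch, not part of this file):
 * tcp_v4_connect() is the kernel half of an ordinary connect() on an
 * AF_INET stream socket. The address 192.0.2.1 is a documentation
 * (TEST-NET) address used purely for illustration:
 */
#if 0	/* compiled out: standalone example */
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in dst;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return 1;
	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;	/* anything else fails with EAFNOSUPPORT */
	dst.sin_port = htons(80);
	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
	/* Route lookup, source port selection, SYN-SENT and the first SYN
	 * all happen under this call, as implemented above. */
	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
		perror("connect");
	close(fd);
	return 0;
}
#endif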
292 * This routine does path mtu discovery as defined in RFC1191.
294 static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
296 struct dst_entry *dst;
297 struct inet_sock *inet = inet_sk(sk);
299 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
300 * sent out by Linux are always < 576 bytes, so they should go through
301 * unfragmented).
303 if (sk->sk_state == TCP_LISTEN)
306 /* We don't check in the dst entry if pmtu discovery is forbidden
307 * on this route. We just assume that no packet-too-big packets
308 * are sent back when pmtu discovery is not active.
309 * There is a small race when the user changes this flag in the
310 * route, but I think that's acceptable.
312 if ((dst = __sk_dst_check(sk, 0)) == NULL)
315 dst->ops->update_pmtu(dst, mtu);
317 /* Something is about to be wrong... Remember soft error
318 * for the case that this connection will not be able to recover.
320 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
321 sk->sk_err_soft = EMSGSIZE;
325 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
326 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
327 tcp_sync_mss(sk, mtu);
329 /* Resend the TCP packet because it's
330 * clear that the old packet has been
331 * dropped. This is the new "fast" path mtu
332 * discovery.
334 tcp_simple_retransmit(sk);
335 } /* else let the usual retransmit timer handle it */
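/*
 * Example (illustrative userspace sketch, not part of this file): the
 * per-socket side of PMTU discovery is steered with IP_MTU_DISCOVER,
 * and the value maintained by code like do_pmtu_discovery() can be read
 * back with IP_MTU on a connected socket. The helper name is hypothetical:
 */
#if 0	/* compiled out: standalone example */
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>

static void show_path_mtu(int connected_fd)
{
	int pmtudisc = IP_PMTUDISC_DO;	/* always set DF, honour ICMP feedback */
	int mtu;
	socklen_t len = sizeof(mtu);

	setsockopt(connected_fd, IPPROTO_IP, IP_MTU_DISCOVER,
		   &pmtudisc, sizeof(pmtudisc));
	if (getsockopt(connected_fd, IPPROTO_IP, IP_MTU, &mtu, &len) == 0)
		printf("current path MTU: %d\n", mtu);
}
#endif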
339 * This routine is called by the ICMP module when it gets some
340 * sort of error condition. If err < 0 then the socket should
341 * be closed and the error returned to the user. If err > 0
342 * it's just the icmp type << 8 | icmp code. After adjustment
343 * header points to the first 8 bytes of the tcp header. We need
344 * to find the appropriate port.
346 * The locking strategy used here is very "optimistic". When
347 * someone else accesses the socket the ICMP is just dropped
348 * and for some paths there is no check at all.
349 * A more general error queue to queue errors for later handling
350 * is probably better.
354 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
356 const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
357 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
358 struct inet_connection_sock *icsk;
360 struct inet_sock *inet;
361 const int type = icmp_hdr(icmp_skb)->type;
362 const int code = icmp_hdr(icmp_skb)->code;
368 struct net *net = dev_net(icmp_skb->dev);
370 if (icmp_skb->len < (iph->ihl << 2) + 8) {
371 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
375 sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
376 iph->saddr, th->source, inet_iif(icmp_skb));
378 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
381 if (sk->sk_state == TCP_TIME_WAIT) {
382 inet_twsk_put(inet_twsk(sk));
387 /* If too many ICMPs get dropped on busy
388 * servers this needs to be solved differently.
390 if (sock_owned_by_user(sk))
391 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
393 if (sk->sk_state == TCP_CLOSE)
396 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
397 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
403 seq = ntohl(th->seq);
404 if (sk->sk_state != TCP_LISTEN &&
405 !between(seq, tp->snd_una, tp->snd_nxt)) {
406 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
411 case ICMP_SOURCE_QUENCH:
412 /* Just silently ignore these. */
414 case ICMP_PARAMETERPROB:
417 case ICMP_DEST_UNREACH:
418 if (code > NR_ICMP_UNREACH)
421 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
422 if (!sock_owned_by_user(sk))
423 do_pmtu_discovery(sk, iph, info);
427 err = icmp_err_convert[code].errno;
428 /* check if icmp_skb allows revert of backoff
429 * (see draft-zimmermann-tcp-lcd) */
430 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
432 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
436 if (sock_owned_by_user(sk))
439 icsk->icsk_backoff--;
440 inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
441 TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
444 skb = tcp_write_queue_head(sk);
447 remaining = icsk->icsk_rto - min(icsk->icsk_rto,
448 tcp_time_stamp - TCP_SKB_CB(skb)->when);
451 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
452 remaining, TCP_RTO_MAX);
454 /* RTO revert clocked out retransmission.
455 * Will retransmit now */
456 tcp_retransmit_timer(sk);
460 case ICMP_TIME_EXCEEDED:
467 switch (sk->sk_state) {
468 struct request_sock *req, **prev;
470 if (sock_owned_by_user(sk))
473 req = inet_csk_search_req(sk, &prev, th->dest,
474 iph->daddr, iph->saddr);
478 /* ICMPs are not backlogged, hence we cannot get
479 an established socket here.
483 if (seq != tcp_rsk(req)->snt_isn) {
484 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
489 * Still in SYN_RECV, just remove it silently.
490 * There is no good way to pass the error to the newly
491 * created socket, and POSIX does not want network
492 * errors returned from accept().
494 inet_csk_reqsk_queue_drop(sk, req, prev);
498 case TCP_SYN_RECV: /* Cannot happen.
499 It can, for example, if SYNs crossed.
501 if (!sock_owned_by_user(sk)) {
504 sk->sk_error_report(sk);
508 sk->sk_err_soft = err;
513 /* If we've already connected we will keep trying
514 * until we time out, or the user gives up.
516 * RFC 1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH
517 * to be considered hard errors (well, FRAG_FAILED too,
518 * but it is obsoleted by pmtu discovery).
520 * Note that on the modern internet, where routing is unreliable
521 * and broken firewalls sit in every dark corner sending random
522 * errors ordered by their masters, even these two messages finally lose
523 * their original sense (even Linux sends invalid PORT_UNREACHs)
525 * Now we are in compliance with RFCs.
530 if (!sock_owned_by_user(sk) && inet->recverr) {
532 sk->sk_error_report(sk);
533 } else { /* Only an error on timeout */
534 sk->sk_err_soft = err;
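/*
 * Example (illustrative userspace sketch, not part of this file): when
 * inet->recverr is set via the IP_RECVERR socket option, the ICMP errors
 * handled above are queued for the application instead of being reported
 * only as soft errors. The helper name is hypothetical:
 */
#if 0	/* compiled out: standalone example */
#include <linux/errqueue.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

static void drain_error_queue(int fd)
{
	char cbuf[512], data[256];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *cm;
	int on = 1;

	setsockopt(fd, IPPROTO_IP, IP_RECVERR, &on, sizeof(on));
	if (recvmsg(fd, &msg, MSG_ERRQUEUE) < 0)
		return;
	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
		if (cm->cmsg_level == IPPROTO_IP && cm->cmsg_type == IP_RECVERR) {
			struct sock_extended_err *ee =
				(struct sock_extended_err *)CMSG_DATA(cm);

			fprintf(stderr, "queued error: %s (origin %u)\n",
				strerror(ee->ee_errno), ee->ee_origin);
		}
	}
}
#endif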
542 static void __tcp_v4_send_check(struct sk_buff *skb,
543 __be32 saddr, __be32 daddr)
545 struct tcphdr *th = tcp_hdr(skb);
547 if (skb->ip_summed == CHECKSUM_PARTIAL) {
548 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
549 skb->csum_start = skb_transport_header(skb) - skb->head;
550 skb->csum_offset = offsetof(struct tcphdr, check);
552 th->check = tcp_v4_check(skb->len, saddr, daddr,
559 /* This routine computes an IPv4 TCP checksum. */
560 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
562 const struct inet_sock *inet = inet_sk(sk);
564 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
566 EXPORT_SYMBOL(tcp_v4_send_check);
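/*
 * Example (illustrative sketch, not part of this file): the value folded
 * into th->check above is the standard ones'-complement sum over the IPv4
 * pseudo-header plus the TCP segment. A minimal host-order version, with
 * hypothetical naming:
 */
#if 0	/* compiled out: standalone example */
#include <stddef.h>
#include <stdint.h>

/* saddr/daddr and the result are in host byte order for clarity. */
static uint16_t tcp4_csum(uint32_t saddr, uint32_t daddr,
			  const uint8_t *seg, size_t len)
{
	uint32_t sum = 0;
	size_t i;

	sum += (saddr >> 16) + (saddr & 0xffff);	/* pseudo-header */
	sum += (daddr >> 16) + (daddr & 0xffff);
	sum += 6;					/* zero-padded IPPROTO_TCP */
	sum += (uint32_t)len;				/* TCP length */
	for (i = 0; i + 1 < len; i += 2)		/* segment, 16 bits at a time */
		sum += ((uint32_t)seg[i] << 8) | seg[i + 1];
	if (len & 1)					/* odd trailing byte */
		sum += (uint32_t)seg[len - 1] << 8;
	while (sum >> 16)				/* fold carries */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}
#endif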
568 int tcp_v4_gso_send_check(struct sk_buff *skb)
570 const struct iphdr *iph;
573 if (!pskb_may_pull(skb, sizeof(*th)))
580 skb->ip_summed = CHECKSUM_PARTIAL;
581 __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
586 * This routine will send an RST to the other tcp.
588 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)?
590 * Answer: if a packet caused an RST, it is not for a socket
591 * existing in our system; if it does match a socket,
592 * it is just a duplicate segment or a bug in the other side's TCP.
593 * So we build the reply based only on the parameters
594 * that arrived with the segment.
595 * Exception: precedence violation. We do not implement it in any case.
598 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
600 const struct tcphdr *th = tcp_hdr(skb);
603 #ifdef CONFIG_TCP_MD5SIG
604 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
607 struct ip_reply_arg arg;
608 #ifdef CONFIG_TCP_MD5SIG
609 struct tcp_md5sig_key *key;
610 const __u8 *hash_location = NULL;
611 unsigned char newhash[16];
613 struct sock *sk1 = NULL;
617 /* Never send a reset in response to a reset. */
621 if (skb_rtable(skb)->rt_type != RTN_LOCAL)
624 /* Swap the send and the receive. */
625 memset(&rep, 0, sizeof(rep));
626 rep.th.dest = th->source;
627 rep.th.source = th->dest;
628 rep.th.doff = sizeof(struct tcphdr) / 4;
632 rep.th.seq = th->ack_seq;
635 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
636 skb->len - (th->doff << 2));
639 memset(&arg, 0, sizeof(arg));
640 arg.iov[0].iov_base = (unsigned char *)&rep;
641 arg.iov[0].iov_len = sizeof(rep.th);
643 #ifdef CONFIG_TCP_MD5SIG
644 hash_location = tcp_parse_md5sig_option(th);
645 if (!sk && hash_location) {
646 /*
647 * The active side is lost. Try to find the listening socket through
648 * the source port, and then find the md5 key through the listening socket.
649 * We do not lose security here:
650 * the incoming packet is checked with the md5 hash of the found key, and
651 * no RST is generated if the md5 hash doesn't match.
653 sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
654 &tcp_hashinfo, ip_hdr(skb)->daddr,
655 ntohs(th->source), inet_iif(skb));
656 /* don't send rst if it can't find key */
660 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
661 &ip_hdr(skb)->saddr, AF_INET);
665 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
666 if (genhash || memcmp(hash_location, newhash, 16) != 0)
669 key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
675 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
677 (TCPOPT_MD5SIG << 8) |
679 /* Update length and the length the header thinks exists */
680 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
681 rep.th.doff = arg.iov[0].iov_len / 4;
683 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
684 key, ip_hdr(skb)->saddr,
685 ip_hdr(skb)->daddr, &rep.th);
688 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
689 ip_hdr(skb)->saddr, /* XXX */
690 arg.iov[0].iov_len, IPPROTO_TCP, 0);
691 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
692 arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
693 /* When the socket is gone, all binding information is lost;
694 * routing might fail in this case. Use iif for oif to
695 * make sure we can deliver the reply.
697 arg.bound_dev_if = sk ? sk->sk_bound_dev_if : inet_iif(skb);
699 net = dev_net(skb_dst(skb)->dev);
700 arg.tos = ip_hdr(skb)->tos;
701 ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
702 &arg, arg.iov[0].iov_len);
704 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
705 TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
707 #ifdef CONFIG_TCP_MD5SIG
716 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
717 outside socket context, is certainly ugly. What can I do?
720 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
721 u32 win, u32 ts, int oif,
722 struct tcp_md5sig_key *key,
723 int reply_flags, u8 tos)
725 const struct tcphdr *th = tcp_hdr(skb);
728 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
729 #ifdef CONFIG_TCP_MD5SIG
730 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
734 struct ip_reply_arg arg;
735 struct net *net = dev_net(skb_dst(skb)->dev);
737 memset(&rep.th, 0, sizeof(struct tcphdr));
738 memset(&arg, 0, sizeof(arg));
740 arg.iov[0].iov_base = (unsigned char *)&rep;
741 arg.iov[0].iov_len = sizeof(rep.th);
743 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
744 (TCPOPT_TIMESTAMP << 8) |
746 rep.opt[1] = htonl(tcp_time_stamp);
747 rep.opt[2] = htonl(ts);
748 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
751 /* Swap the send and the receive. */
752 rep.th.dest = th->source;
753 rep.th.source = th->dest;
754 rep.th.doff = arg.iov[0].iov_len / 4;
755 rep.th.seq = htonl(seq);
756 rep.th.ack_seq = htonl(ack);
758 rep.th.window = htons(win);
760 #ifdef CONFIG_TCP_MD5SIG
762 int offset = (ts) ? 3 : 0;
764 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
766 (TCPOPT_MD5SIG << 8) |
768 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
769 rep.th.doff = arg.iov[0].iov_len/4;
771 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
772 key, ip_hdr(skb)->saddr,
773 ip_hdr(skb)->daddr, &rep.th);
776 arg.flags = reply_flags;
777 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
778 ip_hdr(skb)->saddr, /* XXX */
779 arg.iov[0].iov_len, IPPROTO_TCP, 0);
780 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
782 arg.bound_dev_if = oif;
784 ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
785 &arg, arg.iov[0].iov_len);
787 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
790 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
792 struct inet_timewait_sock *tw = inet_twsk(sk);
793 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
795 tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
796 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
799 tcp_twsk_md5_key(tcptw),
800 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
807 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
808 struct request_sock *req)
810 tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
811 tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
814 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
816 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
821 * Send a SYN-ACK after having received a SYN.
822 * This still operates on a request_sock only, not on a big
823 * socket.
825 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
826 struct request_sock *req,
827 struct request_values *rvp,
830 const struct inet_request_sock *ireq = inet_rsk(req);
833 struct sk_buff * skb;
835 /* First, grab a route. */
836 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
839 skb = tcp_make_synack(sk, dst, req, rvp);
842 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
844 skb_set_queue_mapping(skb, queue_mapping);
845 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
848 err = net_xmit_eval(err);
854 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
855 struct request_values *rvp)
857 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
858 return tcp_v4_send_synack(sk, NULL, req, rvp, 0);
862 * IPv4 request_sock destructor.
864 static void tcp_v4_reqsk_destructor(struct request_sock *req)
866 kfree(inet_rsk(req)->opt);
870 * Return true if a syncookie should be sent
872 bool tcp_syn_flood_action(struct sock *sk,
873 const struct sk_buff *skb,
876 const char *msg = "Dropping request";
877 bool want_cookie = false;
878 struct listen_sock *lopt;
882 #ifdef CONFIG_SYN_COOKIES
883 if (sysctl_tcp_syncookies) {
884 msg = "Sending cookies";
886 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
889 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
891 lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
892 if (!lopt->synflood_warned) {
893 lopt->synflood_warned = 1;
894 pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
895 proto, ntohs(tcp_hdr(skb)->dest), msg);
899 EXPORT_SYMBOL(tcp_syn_flood_action);
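/*
 * Example (illustrative userspace sketch, not part of this file): the
 * "Sending cookies" branch above only fires when the
 * net.ipv4.tcp_syncookies sysctl is on. The helper name is hypothetical:
 */
#if 0	/* compiled out: standalone example */
#include <stdio.h>

static int set_tcp_syncookies(int on)
{
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_syncookies", "w");

	if (!f)
		return -1;
	fprintf(f, "%d", on ? 1 : 0);
	return fclose(f);
}
#endif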
902 * Save and compile IPv4 options into the request_sock if needed.
904 static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
907 const struct ip_options *opt = &(IPCB(skb)->opt);
908 struct ip_options_rcu *dopt = NULL;
910 if (opt && opt->optlen) {
911 int opt_size = sizeof(*dopt) + opt->optlen;
913 dopt = kmalloc(opt_size, GFP_ATOMIC);
915 if (ip_options_echo(&dopt->opt, skb)) {
924 #ifdef CONFIG_TCP_MD5SIG
926 * RFC2385 MD5 checksumming requires a mapping of
927 * IP address->MD5 Key.
928 * We need to maintain these in the sk structure.
931 /* Find the Key structure for an address. */
932 struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
933 const union tcp_md5_addr *addr,
936 struct tcp_sock *tp = tcp_sk(sk);
937 struct tcp_md5sig_key *key;
938 struct hlist_node *pos;
939 unsigned int size = sizeof(struct in_addr);
940 struct tcp_md5sig_info *md5sig;
942 /* caller either holds rcu_read_lock() or socket lock */
943 md5sig = rcu_dereference_check(tp->md5sig_info,
944 sock_owned_by_user(sk) ||
945 lockdep_is_held(&sk->sk_lock.slock));
948 #if IS_ENABLED(CONFIG_IPV6)
949 if (family == AF_INET6)
950 size = sizeof(struct in6_addr);
952 hlist_for_each_entry_rcu(key, pos, &md5sig->head, node) {
953 if (key->family != family)
955 if (!memcmp(&key->addr, addr, size))
960 EXPORT_SYMBOL(tcp_md5_do_lookup);
962 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
963 struct sock *addr_sk)
965 union tcp_md5_addr *addr;
967 addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
968 return tcp_md5_do_lookup(sk, addr, AF_INET);
970 EXPORT_SYMBOL(tcp_v4_md5_lookup);
972 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
973 struct request_sock *req)
975 union tcp_md5_addr *addr;
977 addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
978 return tcp_md5_do_lookup(sk, addr, AF_INET);
981 /* This can be called on a newly created socket, from other files */
982 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
983 int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
985 /* Add Key to the list */
986 struct tcp_md5sig_key *key;
987 struct tcp_sock *tp = tcp_sk(sk);
988 struct tcp_md5sig_info *md5sig;
990 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&addr, AF_INET);
992 /* Pre-existing entry - just update that one. */
993 memcpy(key->key, newkey, newkeylen);
994 key->keylen = newkeylen;
998 md5sig = rcu_dereference_protected(tp->md5sig_info,
999 sock_owned_by_user(sk));
1001 md5sig = kmalloc(sizeof(*md5sig), gfp);
1005 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1006 INIT_HLIST_HEAD(&md5sig->head);
1007 rcu_assign_pointer(tp->md5sig_info, md5sig);
1010 key = sock_kmalloc(sk, sizeof(*key), gfp);
1013 if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) {
1014 sock_kfree_s(sk, key, sizeof(*key));
1018 memcpy(key->key, newkey, newkeylen);
1019 key->keylen = newkeylen;
1020 key->family = family;
1021 memcpy(&key->addr, addr,
1022 (family == AF_INET6) ? sizeof(struct in6_addr) :
1023 sizeof(struct in_addr));
1024 hlist_add_head_rcu(&key->node, &md5sig->head);
1027 EXPORT_SYMBOL(tcp_md5_do_add);
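/*
 * Example (illustrative userspace sketch, not part of this file): entries
 * reach tcp_md5_do_add() via the TCP_MD5SIG socket option (RFC 2385),
 * typically on BGP sockets. The helper name is hypothetical:
 */
#if 0	/* compiled out: standalone example */
#include <linux/tcp.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

static int add_md5_peer_key(int fd, const struct sockaddr_in *peer,
			    const void *key, int keylen)
{
	struct tcp_md5sig md5;

	if (keylen > TCP_MD5SIG_MAXKEYLEN)
		return -1;
	memset(&md5, 0, sizeof(md5));
	memcpy(&md5.tcpm_addr, peer, sizeof(*peer));
	md5.tcpm_keylen = keylen;
	memcpy(md5.tcpm_key, key, keylen);
	return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
}
#endif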
1029 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
1031 struct tcp_sock *tp = tcp_sk(sk);
1032 struct tcp_md5sig_key *key;
1033 struct tcp_md5sig_info *md5sig;
1035 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&addr, AF_INET);
1038 hlist_del_rcu(&key->node);
1039 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1040 kfree_rcu(key, rcu);
1041 md5sig = rcu_dereference_protected(tp->md5sig_info,
1042 sock_owned_by_user(sk));
1043 if (hlist_empty(&md5sig->head))
1044 tcp_free_md5sig_pool();
1047 EXPORT_SYMBOL(tcp_md5_do_del);
1049 void tcp_clear_md5_list(struct sock *sk)
1051 struct tcp_sock *tp = tcp_sk(sk);
1052 struct tcp_md5sig_key *key;
1053 struct hlist_node *pos, *n;
1054 struct tcp_md5sig_info *md5sig;
1056 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1058 if (!hlist_empty(&md5sig->head))
1059 tcp_free_md5sig_pool();
1060 hlist_for_each_entry_safe(key, pos, n, &md5sig->head, node) {
1061 hlist_del_rcu(&key->node);
1062 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1063 kfree_rcu(key, rcu);
1067 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1070 struct tcp_md5sig cmd;
1071 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1073 if (optlen < sizeof(cmd))
1076 if (copy_from_user(&cmd, optval, sizeof(cmd)))
1079 if (sin->sin_family != AF_INET)
1082 if (!cmd.tcpm_key || !cmd.tcpm_keylen)
1083 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1086 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1089 return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1090 AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1094 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1095 __be32 daddr, __be32 saddr, int nbytes)
1097 struct tcp4_pseudohdr *bp;
1098 struct scatterlist sg;
1100 bp = &hp->md5_blk.ip4;
1103 * 1. the TCP pseudo-header (in the order: source IP address,
1104 * destination IP address, zero-padded protocol number, and
1105 * segment length)
1107 bp->saddr = saddr;
1108 bp->daddr = daddr;
1109 bp->pad = 0;
1110 bp->protocol = IPPROTO_TCP;
1111 bp->len = cpu_to_be16(nbytes);
1113 sg_init_one(&sg, bp, sizeof(*bp));
1114 return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1117 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1118 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1120 struct tcp_md5sig_pool *hp;
1121 struct hash_desc *desc;
1123 hp = tcp_get_md5sig_pool();
1125 goto clear_hash_noput;
1126 desc = &hp->md5_desc;
1128 if (crypto_hash_init(desc))
1130 if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1132 if (tcp_md5_hash_header(hp, th))
1134 if (tcp_md5_hash_key(hp, key))
1136 if (crypto_hash_final(desc, md5_hash))
1139 tcp_put_md5sig_pool();
1143 tcp_put_md5sig_pool();
1145 memset(md5_hash, 0, 16);
1149 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1150 const struct sock *sk, const struct request_sock *req,
1151 const struct sk_buff *skb)
1153 struct tcp_md5sig_pool *hp;
1154 struct hash_desc *desc;
1155 const struct tcphdr *th = tcp_hdr(skb);
1156 __be32 saddr, daddr;
1159 saddr = inet_sk(sk)->inet_saddr;
1160 daddr = inet_sk(sk)->inet_daddr;
1162 saddr = inet_rsk(req)->loc_addr;
1163 daddr = inet_rsk(req)->rmt_addr;
1165 const struct iphdr *iph = ip_hdr(skb);
1170 hp = tcp_get_md5sig_pool();
1172 goto clear_hash_noput;
1173 desc = &hp->md5_desc;
1175 if (crypto_hash_init(desc))
1178 if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1180 if (tcp_md5_hash_header(hp, th))
1182 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1184 if (tcp_md5_hash_key(hp, key))
1186 if (crypto_hash_final(desc, md5_hash))
1189 tcp_put_md5sig_pool();
1193 tcp_put_md5sig_pool();
1195 memset(md5_hash, 0, 16);
1198 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1200 static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1203 * This gets called for each TCP segment that arrives
1204 * so we want to be efficient.
1205 * We have 3 drop cases:
1206 * o No MD5 hash and one expected.
1207 * o MD5 hash and we're not expecting one.
1208 * o MD5 hash and it's wrong.
1210 const __u8 *hash_location = NULL;
1211 struct tcp_md5sig_key *hash_expected;
1212 const struct iphdr *iph = ip_hdr(skb);
1213 const struct tcphdr *th = tcp_hdr(skb);
1215 unsigned char newhash[16];
1217 hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1219 hash_location = tcp_parse_md5sig_option(th);
1221 /* We've parsed the options - do we have a hash? */
1222 if (!hash_expected && !hash_location)
1225 if (hash_expected && !hash_location) {
1226 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1230 if (!hash_expected && hash_location) {
1231 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1235 /* Okay, so this is hash_expected and hash_location -
1236 * so we need to calculate the checksum.
1238 genhash = tcp_v4_md5_hash_skb(newhash,
1242 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1243 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1244 &iph->saddr, ntohs(th->source),
1245 &iph->daddr, ntohs(th->dest),
1246 genhash ? " tcp_v4_calc_md5_hash failed"
1255 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1257 .obj_size = sizeof(struct tcp_request_sock),
1258 .rtx_syn_ack = tcp_v4_rtx_synack,
1259 .send_ack = tcp_v4_reqsk_send_ack,
1260 .destructor = tcp_v4_reqsk_destructor,
1261 .send_reset = tcp_v4_send_reset,
1262 .syn_ack_timeout = tcp_syn_ack_timeout,
1265 #ifdef CONFIG_TCP_MD5SIG
1266 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1267 .md5_lookup = tcp_v4_reqsk_md5_lookup,
1268 .calc_md5_hash = tcp_v4_md5_hash_skb,
1272 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1274 struct tcp_extend_values tmp_ext;
1275 struct tcp_options_received tmp_opt;
1276 const u8 *hash_location;
1277 struct request_sock *req;
1278 struct inet_request_sock *ireq;
1279 struct tcp_sock *tp = tcp_sk(sk);
1280 struct dst_entry *dst = NULL;
1281 __be32 saddr = ip_hdr(skb)->saddr;
1282 __be32 daddr = ip_hdr(skb)->daddr;
1283 __u32 isn = TCP_SKB_CB(skb)->when;
1284 bool want_cookie = false;
1286 /* Never answer SYNs sent to broadcast or multicast */
1287 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1290 /* TW buckets are converted to open requests without
1291 * limitations: they conserve resources, and the peer is
1292 * evidently a real one.
1294 if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1295 want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
1300 /* Accept backlog is full. If we have already queued enough
1301 * warm entries in the syn queue, drop the request. It is better than
1302 * clogging the syn queue with openreqs with exponentially increasing
1303 * timeout.
1305 if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1308 req = inet_reqsk_alloc(&tcp_request_sock_ops);
1312 #ifdef CONFIG_TCP_MD5SIG
1313 tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1316 tcp_clear_options(&tmp_opt);
1317 tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1318 tmp_opt.user_mss = tp->rx_opt.user_mss;
1319 tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
1321 if (tmp_opt.cookie_plus > 0 &&
1322 tmp_opt.saw_tstamp &&
1323 !tp->rx_opt.cookie_out_never &&
1324 (sysctl_tcp_cookie_size > 0 ||
1325 (tp->cookie_values != NULL &&
1326 tp->cookie_values->cookie_desired > 0))) {
1328 u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1329 int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1331 if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1332 goto drop_and_release;
1334 /* Secret recipe starts with IP addresses */
1335 *mess++ ^= (__force u32)daddr;
1336 *mess++ ^= (__force u32)saddr;
1338 /* plus variable length Initiator Cookie */
1341 *c++ ^= *hash_location++;
1343 want_cookie = false; /* not our kind of cookie */
1344 tmp_ext.cookie_out_never = 0; /* false */
1345 tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1346 } else if (!tp->rx_opt.cookie_in_always) {
1347 /* redundant indications, but ensure initialization. */
1348 tmp_ext.cookie_out_never = 1; /* true */
1349 tmp_ext.cookie_plus = 0;
1351 goto drop_and_release;
1353 tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1355 if (want_cookie && !tmp_opt.saw_tstamp)
1356 tcp_clear_options(&tmp_opt);
1358 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1359 tcp_openreq_init(req, &tmp_opt, skb);
1361 ireq = inet_rsk(req);
1362 ireq->loc_addr = daddr;
1363 ireq->rmt_addr = saddr;
1364 ireq->no_srccheck = inet_sk(sk)->transparent;
1365 ireq->opt = tcp_v4_save_options(sk, skb);
1367 if (security_inet_conn_request(sk, skb, req))
1370 if (!want_cookie || tmp_opt.tstamp_ok)
1371 TCP_ECN_create_request(req, skb);
1374 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1375 req->cookie_ts = tmp_opt.tstamp_ok;
1377 struct inet_peer *peer = NULL;
1380 /* VJ's idea. We save last timestamp seen
1381 * from the destination in peer table, when entering
1382 * state TIME-WAIT, and check against it before
1383 * accepting new connection request.
1385 * If "isn" is not zero, this request hit alive
1386 * timewait bucket, so that all the necessary checks
1387 * are made in the function processing timewait state.
1389 if (tmp_opt.saw_tstamp &&
1390 tcp_death_row.sysctl_tw_recycle &&
1391 (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1392 fl4.daddr == saddr &&
1393 (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
1394 inet_peer_refcheck(peer);
1395 if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1396 (s32)(peer->tcp_ts - req->ts_recent) >
1398 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1399 goto drop_and_release;
1402 /* Kill the following clause, if you dislike this way. */
1403 else if (!sysctl_tcp_syncookies &&
1404 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1405 (sysctl_max_syn_backlog >> 2)) &&
1406 (!peer || !peer->tcp_ts_stamp) &&
1407 (!dst || !dst_metric(dst, RTAX_RTT))) {
1408 /* Without syncookies the last quarter of
1409 * the backlog is filled only with destinations
1410 * proven to be alive.
1411 * It means that we continue to communicate
1412 * with destinations already remembered
1413 * as of the moment of the synflood.
1415 LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
1416 &saddr, ntohs(tcp_hdr(skb)->source));
1417 goto drop_and_release;
1420 isn = tcp_v4_init_sequence(skb);
1422 tcp_rsk(req)->snt_isn = isn;
1423 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1425 if (tcp_v4_send_synack(sk, dst, req,
1426 (struct request_values *)&tmp_ext,
1427 skb_get_queue_mapping(skb)) ||
1431 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1441 EXPORT_SYMBOL(tcp_v4_conn_request);
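/*
 * Example (illustrative userspace sketch, not part of this file): the
 * SYN-queue and accept-queue limits checked above are sized from the
 * backlog a server passes to listen(), clamped by net.core.somaxconn.
 * The helper name is hypothetical:
 */
#if 0	/* compiled out: standalone example */
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static int make_listener(unsigned short port, int backlog)
{
	struct sockaddr_in a;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return -1;
	memset(&a, 0, sizeof(a));
	a.sin_family = AF_INET;
	a.sin_port = htons(port);
	a.sin_addr.s_addr = htonl(INADDR_ANY);
	if (bind(fd, (struct sockaddr *)&a, sizeof(a)) < 0 ||
	    listen(fd, backlog) < 0) {	/* sizes the queues policed above */
		close(fd);
		return -1;
	}
	return fd;
}
#endif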
1445 * The three-way handshake has completed - we got a valid synack -
1446 * now create the new socket.
1448 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1449 struct request_sock *req,
1450 struct dst_entry *dst)
1452 struct inet_request_sock *ireq;
1453 struct inet_sock *newinet;
1454 struct tcp_sock *newtp;
1456 #ifdef CONFIG_TCP_MD5SIG
1457 struct tcp_md5sig_key *key;
1459 struct ip_options_rcu *inet_opt;
1461 if (sk_acceptq_is_full(sk))
1464 newsk = tcp_create_openreq_child(sk, req, skb);
1468 newsk->sk_gso_type = SKB_GSO_TCPV4;
1470 newtp = tcp_sk(newsk);
1471 newinet = inet_sk(newsk);
1472 ireq = inet_rsk(req);
1473 newinet->inet_daddr = ireq->rmt_addr;
1474 newinet->inet_rcv_saddr = ireq->loc_addr;
1475 newinet->inet_saddr = ireq->loc_addr;
1476 inet_opt = ireq->opt;
1477 rcu_assign_pointer(newinet->inet_opt, inet_opt);
1479 newinet->mc_index = inet_iif(skb);
1480 newinet->mc_ttl = ip_hdr(skb)->ttl;
1481 newinet->rcv_tos = ip_hdr(skb)->tos;
1482 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1483 if (inet_opt)
1484 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1485 newinet->inet_id = newtp->write_seq ^ jiffies;
1488 dst = inet_csk_route_child_sock(sk, newsk, req);
1492 /* syncookie case : see end of cookie_v4_check() */
1494 sk_setup_caps(newsk, dst);
1496 tcp_mtup_init(newsk);
1497 tcp_sync_mss(newsk, dst_mtu(dst));
1498 newtp->advmss = dst_metric_advmss(dst);
1499 if (tcp_sk(sk)->rx_opt.user_mss &&
1500 tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1501 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1503 tcp_initialize_rcv_mss(newsk);
1504 if (tcp_rsk(req)->snt_synack)
1505 tcp_valid_rtt_meas(newsk,
1506 tcp_time_stamp - tcp_rsk(req)->snt_synack);
1507 newtp->total_retrans = req->retrans;
1509 #ifdef CONFIG_TCP_MD5SIG
1510 /* Copy over the MD5 key from the original socket */
1511 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1515 * We're using one, so create a matching key
1516 * on the newsk structure. If we fail to get
1517 * memory, then we end up not copying the key
1518 * across. Shucks.
1520 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1521 AF_INET, key->key, key->keylen, GFP_ATOMIC);
1522 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1526 if (__inet_inherit_port(sk, newsk) < 0)
1528 __inet_hash_nolisten(newsk, NULL);
1533 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1537 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1540 tcp_clear_xmit_timers(newsk);
1541 tcp_cleanup_congestion_control(newsk);
1542 bh_unlock_sock(newsk);
1546 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
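/*
 * Example (illustrative userspace sketch, not part of this file): the
 * child socket built by tcp_v4_syn_recv_sock() is what a server finally
 * dequeues with accept(). The helper name is hypothetical:
 */
#if 0	/* compiled out: standalone example */
#include <netinet/in.h>
#include <sys/socket.h>

static int accept_one(int listen_fd, struct sockaddr_in *peer)
{
	socklen_t len = sizeof(*peer);

	return accept(listen_fd, (struct sockaddr *)peer, &len);
}
#endif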
1548 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1550 struct tcphdr *th = tcp_hdr(skb);
1551 const struct iphdr *iph = ip_hdr(skb);
1553 struct request_sock **prev;
1554 /* Find possible connection requests. */
1555 struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1556 iph->saddr, iph->daddr);
1558 return tcp_check_req(sk, skb, req, prev);
1560 nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1561 th->source, iph->daddr, th->dest, inet_iif(skb));
1564 if (nsk->sk_state != TCP_TIME_WAIT) {
1568 inet_twsk_put(inet_twsk(nsk));
1572 #ifdef CONFIG_SYN_COOKIES
1574 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1579 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1581 const struct iphdr *iph = ip_hdr(skb);
1583 if (skb->ip_summed == CHECKSUM_COMPLETE) {
1584 if (!tcp_v4_check(skb->len, iph->saddr,
1585 iph->daddr, skb->csum)) {
1586 skb->ip_summed = CHECKSUM_UNNECESSARY;
1591 skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1592 skb->len, IPPROTO_TCP, 0);
1594 if (skb->len <= 76) {
1595 return __skb_checksum_complete(skb);
1601 /* The socket must have its spinlock held when we get
1602 * here.
1604 * We have a potential double-lock case here, so even when
1605 * doing backlog processing we use the BH locking scheme.
1606 * This is because we cannot sleep with the original spinlock
1607 * held.
1609 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1612 #ifdef CONFIG_TCP_MD5SIG
1613 /*
1614 * We really want to reject the packet as early as possible
1615 * if:
1616 * o We're expecting an MD5'd packet and there is no MD5 tcp option
1617 * o There is an MD5 option and we're not expecting one
1619 if (tcp_v4_inbound_md5_hash(sk, skb))
1623 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1624 sock_rps_save_rxhash(sk, skb);
1625 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1632 if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1635 if (sk->sk_state == TCP_LISTEN) {
1636 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1641 sock_rps_save_rxhash(nsk, skb);
1642 if (tcp_child_process(sk, nsk, skb)) {
1649 sock_rps_save_rxhash(sk, skb);
1651 if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1658 tcp_v4_send_reset(rsk, skb);
1661 /* Be careful here. If this function gets more complicated and
1662 * gcc suffers from register pressure on the x86, sk (in %ebx)
1663 * might be destroyed here. This current version compiles correctly,
1664 * but you have been warned.
1669 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1672 EXPORT_SYMBOL(tcp_v4_do_rcv);
1678 int tcp_v4_rcv(struct sk_buff *skb)
1680 const struct iphdr *iph;
1681 const struct tcphdr *th;
1684 struct net *net = dev_net(skb->dev);
1686 if (skb->pkt_type != PACKET_HOST)
1689 /* Count it even if it's bad */
1690 TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1692 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1697 if (th->doff < sizeof(struct tcphdr) / 4)
1699 if (!pskb_may_pull(skb, th->doff * 4))
1702 /* An explanation is required here, I think.
1703 * Packet length and doff are validated by header prediction,
1704 * provided case of th->doff==0 is eliminated.
1705 * So, we defer the checks. */
1706 if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1711 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1712 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1713 skb->len - th->doff * 4);
1714 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1715 TCP_SKB_CB(skb)->when = 0;
1716 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1717 TCP_SKB_CB(skb)->sacked = 0;
1719 sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1724 if (sk->sk_state == TCP_TIME_WAIT)
1727 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1728 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1729 goto discard_and_relse;
1732 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1733 goto discard_and_relse;
1736 if (sk_filter(sk, skb))
1737 goto discard_and_relse;
1741 bh_lock_sock_nested(sk);
1743 if (!sock_owned_by_user(sk)) {
1744 #ifdef CONFIG_NET_DMA
1745 struct tcp_sock *tp = tcp_sk(sk);
1746 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1747 tp->ucopy.dma_chan = net_dma_find_channel();
1748 if (tp->ucopy.dma_chan)
1749 ret = tcp_v4_do_rcv(sk, skb);
1753 if (!tcp_prequeue(sk, skb))
1754 ret = tcp_v4_do_rcv(sk, skb);
1756 } else if (unlikely(sk_add_backlog(sk, skb,
1757 sk->sk_rcvbuf + sk->sk_sndbuf))) {
1759 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1760 goto discard_and_relse;
1769 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1772 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1774 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1776 tcp_v4_send_reset(NULL, skb);
1780 /* Discard frame. */
1789 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1790 inet_twsk_put(inet_twsk(sk));
1794 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1795 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1796 inet_twsk_put(inet_twsk(sk));
1799 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1801 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1803 iph->daddr, th->dest,
1806 inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1807 inet_twsk_put(inet_twsk(sk));
1811 /* Fall through to ACK */
1814 tcp_v4_timewait_ack(sk, skb);
1818 case TCP_TW_SUCCESS:;
1823 struct inet_peer *tcp_v4_get_peer(struct sock *sk)
1825 struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1826 struct inet_sock *inet = inet_sk(sk);
1828 /* If we don't have a valid cached route, or we're doing IP
1829 * options which make the IPv4 header destination address
1830 * different from our peer's, do not bother with this.
1832 if (!rt || inet->cork.fl.u.ip4.daddr != inet->inet_daddr)
1834 return rt_get_peer_create(rt, inet->inet_daddr);
1836 EXPORT_SYMBOL(tcp_v4_get_peer);
1838 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1839 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1840 .twsk_unique = tcp_twsk_unique,
1841 .twsk_destructor= tcp_twsk_destructor,
1844 const struct inet_connection_sock_af_ops ipv4_specific = {
1845 .queue_xmit = ip_queue_xmit,
1846 .send_check = tcp_v4_send_check,
1847 .rebuild_header = inet_sk_rebuild_header,
1848 .conn_request = tcp_v4_conn_request,
1849 .syn_recv_sock = tcp_v4_syn_recv_sock,
1850 .get_peer = tcp_v4_get_peer,
1851 .net_header_len = sizeof(struct iphdr),
1852 .setsockopt = ip_setsockopt,
1853 .getsockopt = ip_getsockopt,
1854 .addr2sockaddr = inet_csk_addr2sockaddr,
1855 .sockaddr_len = sizeof(struct sockaddr_in),
1856 .bind_conflict = inet_csk_bind_conflict,
1857 #ifdef CONFIG_COMPAT
1858 .compat_setsockopt = compat_ip_setsockopt,
1859 .compat_getsockopt = compat_ip_getsockopt,
1862 EXPORT_SYMBOL(ipv4_specific);
1864 #ifdef CONFIG_TCP_MD5SIG
1865 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1866 .md5_lookup = tcp_v4_md5_lookup,
1867 .calc_md5_hash = tcp_v4_md5_hash_skb,
1868 .md5_parse = tcp_v4_parse_md5_keys,
1872 /* NOTE: A lot of things are set to zero explicitly by the call to
1873 * sk_alloc(), so they need not be done here.
1875 static int tcp_v4_init_sock(struct sock *sk)
1877 struct inet_connection_sock *icsk = inet_csk(sk);
1881 icsk->icsk_af_ops = &ipv4_specific;
1883 #ifdef CONFIG_TCP_MD5SIG
1884 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1890 void tcp_v4_destroy_sock(struct sock *sk)
1892 struct tcp_sock *tp = tcp_sk(sk);
1894 tcp_clear_xmit_timers(sk);
1896 tcp_cleanup_congestion_control(sk);
1898 /* Clean up the write buffer. */
1899 tcp_write_queue_purge(sk);
1901 /* Cleans up our, hopefully empty, out_of_order_queue. */
1902 __skb_queue_purge(&tp->out_of_order_queue);
1904 #ifdef CONFIG_TCP_MD5SIG
1905 /* Clean up the MD5 key list, if any */
1906 if (tp->md5sig_info) {
1907 tcp_clear_md5_list(sk);
1908 kfree_rcu(tp->md5sig_info, rcu);
1909 tp->md5sig_info = NULL;
1913 #ifdef CONFIG_NET_DMA
1914 /* Cleans up our sk_async_wait_queue */
1915 __skb_queue_purge(&sk->sk_async_wait_queue);
1918 /* Clean the prequeue; it must really be empty. */
1919 __skb_queue_purge(&tp->ucopy.prequeue);
1921 /* Clean up a referenced TCP bind bucket. */
1922 if (inet_csk(sk)->icsk_bind_hash)
1926 * If sendmsg cached page exists, toss it.
1928 if (sk->sk_sndmsg_page) {
1929 __free_page(sk->sk_sndmsg_page);
1930 sk->sk_sndmsg_page = NULL;
1933 /* TCP Cookie Transactions */
1934 if (tp->cookie_values != NULL) {
1935 kref_put(&tp->cookie_values->kref,
1936 tcp_cookie_values_release);
1937 tp->cookie_values = NULL;
1940 sk_sockets_allocated_dec(sk);
1941 sock_release_memcg(sk);
1943 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1945 #ifdef CONFIG_PROC_FS
1946 /* Proc filesystem TCP sock list dumping. */
1948 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1950 return hlist_nulls_empty(head) ? NULL :
1951 list_entry(head->first, struct inet_timewait_sock, tw_node);
1954 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1956 return !is_a_nulls(tw->tw_node.next) ?
1957 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1961 * Get the next listener socket following cur. If cur is NULL, get the first socket
1962 * starting from bucket given in st->bucket; when st->bucket is zero the
1963 * very first socket in the hash table is returned.
1965 static void *listening_get_next(struct seq_file *seq, void *cur)
1967 struct inet_connection_sock *icsk;
1968 struct hlist_nulls_node *node;
1969 struct sock *sk = cur;
1970 struct inet_listen_hashbucket *ilb;
1971 struct tcp_iter_state *st = seq->private;
1972 struct net *net = seq_file_net(seq);
1975 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1976 spin_lock_bh(&ilb->lock);
1977 sk = sk_nulls_head(&ilb->head);
1981 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1985 if (st->state == TCP_SEQ_STATE_OPENREQ) {
1986 struct request_sock *req = cur;
1988 icsk = inet_csk(st->syn_wait_sk);
1992 if (req->rsk_ops->family == st->family) {
1998 if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2001 req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2003 sk = sk_nulls_next(st->syn_wait_sk);
2004 st->state = TCP_SEQ_STATE_LISTENING;
2005 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2007 icsk = inet_csk(sk);
2008 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2009 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2011 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2012 sk = sk_nulls_next(sk);
2015 sk_nulls_for_each_from(sk, node) {
2016 if (!net_eq(sock_net(sk), net))
2018 if (sk->sk_family == st->family) {
2022 icsk = inet_csk(sk);
2023 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2024 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2026 st->uid = sock_i_uid(sk);
2027 st->syn_wait_sk = sk;
2028 st->state = TCP_SEQ_STATE_OPENREQ;
2032 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2034 spin_unlock_bh(&ilb->lock);
2036 if (++st->bucket < INET_LHTABLE_SIZE) {
2037 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2038 spin_lock_bh(&ilb->lock);
2039 sk = sk_nulls_head(&ilb->head);
2047 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2049 struct tcp_iter_state *st = seq->private;
2054 rc = listening_get_next(seq, NULL);
2056 while (rc && *pos) {
2057 rc = listening_get_next(seq, rc);
2063 static inline bool empty_bucket(struct tcp_iter_state *st)
2065 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2066 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2070 * Get first established socket starting from bucket given in st->bucket.
2071 * If st->bucket is zero, the very first socket in the hash is returned.
2073 static void *established_get_first(struct seq_file *seq)
2075 struct tcp_iter_state *st = seq->private;
2076 struct net *net = seq_file_net(seq);
2080 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2082 struct hlist_nulls_node *node;
2083 struct inet_timewait_sock *tw;
2084 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2086 /* Lockless fast path for the common case of empty buckets */
2087 if (empty_bucket(st))
2091 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2092 if (sk->sk_family != st->family ||
2093 !net_eq(sock_net(sk), net)) {
2099 st->state = TCP_SEQ_STATE_TIME_WAIT;
2100 inet_twsk_for_each(tw, node,
2101 &tcp_hashinfo.ehash[st->bucket].twchain) {
2102 if (tw->tw_family != st->family ||
2103 !net_eq(twsk_net(tw), net)) {
2109 spin_unlock_bh(lock);
2110 st->state = TCP_SEQ_STATE_ESTABLISHED;
2116 static void *established_get_next(struct seq_file *seq, void *cur)
2118 struct sock *sk = cur;
2119 struct inet_timewait_sock *tw;
2120 struct hlist_nulls_node *node;
2121 struct tcp_iter_state *st = seq->private;
2122 struct net *net = seq_file_net(seq);
2127 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2131 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2138 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2139 st->state = TCP_SEQ_STATE_ESTABLISHED;
2141 /* Look for next non-empty bucket */
2143 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2146 if (st->bucket > tcp_hashinfo.ehash_mask)
2149 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2150 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2152 sk = sk_nulls_next(sk);
2154 sk_nulls_for_each_from(sk, node) {
2155 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2159 st->state = TCP_SEQ_STATE_TIME_WAIT;
2160 tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2168 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2170 struct tcp_iter_state *st = seq->private;
2174 rc = established_get_first(seq);
2177 rc = established_get_next(seq, rc);
2183 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2186 struct tcp_iter_state *st = seq->private;
2188 st->state = TCP_SEQ_STATE_LISTENING;
2189 rc = listening_get_idx(seq, &pos);
2192 st->state = TCP_SEQ_STATE_ESTABLISHED;
2193 rc = established_get_idx(seq, pos);
2199 static void *tcp_seek_last_pos(struct seq_file *seq)
2201 struct tcp_iter_state *st = seq->private;
2202 int offset = st->offset;
2203 int orig_num = st->num;
2206 switch (st->state) {
2207 case TCP_SEQ_STATE_OPENREQ:
2208 case TCP_SEQ_STATE_LISTENING:
2209 if (st->bucket >= INET_LHTABLE_SIZE)
2211 st->state = TCP_SEQ_STATE_LISTENING;
2212 rc = listening_get_next(seq, NULL);
2213 while (offset-- && rc)
2214 rc = listening_get_next(seq, rc);
2219 case TCP_SEQ_STATE_ESTABLISHED:
2220 case TCP_SEQ_STATE_TIME_WAIT:
2221 st->state = TCP_SEQ_STATE_ESTABLISHED;
2222 if (st->bucket > tcp_hashinfo.ehash_mask)
2224 rc = established_get_first(seq);
2225 while (offset-- && rc)
2226 rc = established_get_next(seq, rc);
2234 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2236 struct tcp_iter_state *st = seq->private;
2239 if (*pos && *pos == st->last_pos) {
2240 rc = tcp_seek_last_pos(seq);
2245 st->state = TCP_SEQ_STATE_LISTENING;
2249 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2252 st->last_pos = *pos;
2256 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2258 struct tcp_iter_state *st = seq->private;
2261 if (v == SEQ_START_TOKEN) {
2262 rc = tcp_get_idx(seq, 0);
2266 switch (st->state) {
2267 case TCP_SEQ_STATE_OPENREQ:
2268 case TCP_SEQ_STATE_LISTENING:
2269 rc = listening_get_next(seq, v);
2271 st->state = TCP_SEQ_STATE_ESTABLISHED;
2274 rc = established_get_first(seq);
2277 case TCP_SEQ_STATE_ESTABLISHED:
2278 case TCP_SEQ_STATE_TIME_WAIT:
2279 rc = established_get_next(seq, v);
2284 st->last_pos = *pos;
2288 static void tcp_seq_stop(struct seq_file *seq, void *v)
2290 struct tcp_iter_state *st = seq->private;
2292 switch (st->state) {
2293 case TCP_SEQ_STATE_OPENREQ:
2295 struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2296 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2298 case TCP_SEQ_STATE_LISTENING:
2299 if (v != SEQ_START_TOKEN)
2300 spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2302 case TCP_SEQ_STATE_TIME_WAIT:
2303 case TCP_SEQ_STATE_ESTABLISHED:
2305 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2310 int tcp_seq_open(struct inode *inode, struct file *file)
2312 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2313 struct tcp_iter_state *s;
2316 err = seq_open_net(inode, file, &afinfo->seq_ops,
2317 sizeof(struct tcp_iter_state));
2321 s = ((struct seq_file *)file->private_data)->private;
2322 s->family = afinfo->family;
2326 EXPORT_SYMBOL(tcp_seq_open);
2328 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2331 struct proc_dir_entry *p;
2333 afinfo->seq_ops.start = tcp_seq_start;
2334 afinfo->seq_ops.next = tcp_seq_next;
2335 afinfo->seq_ops.stop = tcp_seq_stop;
2337 p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2338 afinfo->seq_fops, afinfo);
2343 EXPORT_SYMBOL(tcp_proc_register);
2345 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2347 proc_net_remove(net, afinfo->name);
2349 EXPORT_SYMBOL(tcp_proc_unregister);
2351 static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2352 struct seq_file *f, int i, int uid, int *len)
2354 const struct inet_request_sock *ireq = inet_rsk(req);
2355 int ttd = req->expires - jiffies;
2357 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2358 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2361 ntohs(inet_sk(sk)->inet_sport),
2363 ntohs(ireq->rmt_port),
2365 0, 0, /* could print option size, but that is af dependent. */
2366 1, /* timers active (only the expire timer) */
2367 jiffies_to_clock_t(ttd),
2370 0, /* non standard timer */
2371 0, /* open_requests have no inode */
2372 atomic_read(&sk->sk_refcnt),
2377 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2380 unsigned long timer_expires;
2381 const struct tcp_sock *tp = tcp_sk(sk);
2382 const struct inet_connection_sock *icsk = inet_csk(sk);
2383 const struct inet_sock *inet = inet_sk(sk);
2384 __be32 dest = inet->inet_daddr;
2385 __be32 src = inet->inet_rcv_saddr;
2386 __u16 destp = ntohs(inet->inet_dport);
2387 __u16 srcp = ntohs(inet->inet_sport);
2390 if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2392 timer_expires = icsk->icsk_timeout;
2393 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2395 timer_expires = icsk->icsk_timeout;
2396 } else if (timer_pending(&sk->sk_timer)) {
2398 timer_expires = sk->sk_timer.expires;
2401 timer_expires = jiffies;
2404 if (sk->sk_state == TCP_LISTEN)
2405 rx_queue = sk->sk_ack_backlog;
2408 * because we don't lock the socket, we might find a transient negative value
2410 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2412 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2413 "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
2414 i, src, srcp, dest, destp, sk->sk_state,
2415 tp->write_seq - tp->snd_una,
2418 jiffies_to_clock_t(timer_expires - jiffies),
2419 icsk->icsk_retransmits,
2421 icsk->icsk_probes_out,
2423 atomic_read(&sk->sk_refcnt), sk,
2424 jiffies_to_clock_t(icsk->icsk_rto),
2425 jiffies_to_clock_t(icsk->icsk_ack.ato),
2426 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2428 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2432 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2433 struct seq_file *f, int i, int *len)
2437 int ttd = tw->tw_ttd - jiffies;
2442 dest = tw->tw_daddr;
2443 src = tw->tw_rcv_saddr;
2444 destp = ntohs(tw->tw_dport);
2445 srcp = ntohs(tw->tw_sport);
2447 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2448 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2449 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2450 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2451 atomic_read(&tw->tw_refcnt), tw, len);
2456 static int tcp4_seq_show(struct seq_file *seq, void *v)
2458 struct tcp_iter_state *st;
2461 if (v == SEQ_START_TOKEN) {
2462 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2463 " sl local_address rem_address st tx_queue "
2464 "rx_queue tr tm->when retrnsmt uid timeout "
2470 switch (st->state) {
2471 case TCP_SEQ_STATE_LISTENING:
2472 case TCP_SEQ_STATE_ESTABLISHED:
2473 get_tcp4_sock(v, seq, st->num, &len);
2475 case TCP_SEQ_STATE_OPENREQ:
2476 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2478 case TCP_SEQ_STATE_TIME_WAIT:
2479 get_timewait4_sock(v, seq, st->num, &len);
2482 seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
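/*
 * Example (illustrative userspace sketch, not part of this file): one way
 * to consume the record format printed above from /proc/net/tcp; fields
 * are hexadecimal, addresses in network byte order. The helper name is
 * hypothetical:
 */
#if 0	/* compiled out: standalone example */
#include <stdio.h>

static void dump_proc_net_tcp(void)
{
	unsigned int laddr, lport, raddr, rport, state;
	char line[512];
	FILE *f = fopen("/proc/net/tcp", "r");

	if (!f)
		return;
	if (!fgets(line, sizeof(line), f)) {	/* skip the header line */
		fclose(f);
		return;
	}
	while (fgets(line, sizeof(line), f)) {
		if (sscanf(line, "%*d: %x:%x %x:%x %x",
			   &laddr, &lport, &raddr, &rport, &state) == 5)
			printf("%08X:%04X -> %08X:%04X st=%02X\n",
			       laddr, lport, raddr, rport, state);
	}
	fclose(f);
}
#endif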
2487 static const struct file_operations tcp_afinfo_seq_fops = {
2488 .owner = THIS_MODULE,
2489 .open = tcp_seq_open,
2491 .llseek = seq_lseek,
2492 .release = seq_release_net
2495 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2498 .seq_fops = &tcp_afinfo_seq_fops,
2500 .show = tcp4_seq_show,
2504 static int __net_init tcp4_proc_init_net(struct net *net)
2506 return tcp_proc_register(net, &tcp4_seq_afinfo);
2509 static void __net_exit tcp4_proc_exit_net(struct net *net)
2511 tcp_proc_unregister(net, &tcp4_seq_afinfo);
2514 static struct pernet_operations tcp4_net_ops = {
2515 .init = tcp4_proc_init_net,
2516 .exit = tcp4_proc_exit_net,
2519 int __init tcp4_proc_init(void)
2521 return register_pernet_subsys(&tcp4_net_ops);
2524 void tcp4_proc_exit(void)
2526 unregister_pernet_subsys(&tcp4_net_ops);
2528 #endif /* CONFIG_PROC_FS */
2530 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2532 const struct iphdr *iph = skb_gro_network_header(skb);
2534 switch (skb->ip_summed) {
2535 case CHECKSUM_COMPLETE:
2536 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2538 skb->ip_summed = CHECKSUM_UNNECESSARY;
2544 NAPI_GRO_CB(skb)->flush = 1;
2548 return tcp_gro_receive(head, skb);
2551 int tcp4_gro_complete(struct sk_buff *skb)
2553 const struct iphdr *iph = ip_hdr(skb);
2554 struct tcphdr *th = tcp_hdr(skb);
2556 th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2557 iph->saddr, iph->daddr, 0);
2558 skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2560 return tcp_gro_complete(skb);
2563 struct proto tcp_prot = {
2565 .owner = THIS_MODULE,
2567 .connect = tcp_v4_connect,
2568 .disconnect = tcp_disconnect,
2569 .accept = inet_csk_accept,
2571 .init = tcp_v4_init_sock,
2572 .destroy = tcp_v4_destroy_sock,
2573 .shutdown = tcp_shutdown,
2574 .setsockopt = tcp_setsockopt,
2575 .getsockopt = tcp_getsockopt,
2576 .recvmsg = tcp_recvmsg,
2577 .sendmsg = tcp_sendmsg,
2578 .sendpage = tcp_sendpage,
2579 .backlog_rcv = tcp_v4_do_rcv,
2581 .unhash = inet_unhash,
2582 .get_port = inet_csk_get_port,
2583 .enter_memory_pressure = tcp_enter_memory_pressure,
2584 .sockets_allocated = &tcp_sockets_allocated,
2585 .orphan_count = &tcp_orphan_count,
2586 .memory_allocated = &tcp_memory_allocated,
2587 .memory_pressure = &tcp_memory_pressure,
2588 .sysctl_wmem = sysctl_tcp_wmem,
2589 .sysctl_rmem = sysctl_tcp_rmem,
2590 .max_header = MAX_TCP_HEADER,
2591 .obj_size = sizeof(struct tcp_sock),
2592 .slab_flags = SLAB_DESTROY_BY_RCU,
2593 .twsk_prot = &tcp_timewait_sock_ops,
2594 .rsk_prot = &tcp_request_sock_ops,
2595 .h.hashinfo = &tcp_hashinfo,
2596 .no_autobind = true,
2597 #ifdef CONFIG_COMPAT
2598 .compat_setsockopt = compat_tcp_setsockopt,
2599 .compat_getsockopt = compat_tcp_getsockopt,
2601 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
2602 .init_cgroup = tcp_init_cgroup,
2603 .destroy_cgroup = tcp_destroy_cgroup,
2604 .proto_cgroup = tcp_proto_cgroup,
2607 EXPORT_SYMBOL(tcp_prot);
2609 static int __net_init tcp_sk_init(struct net *net)
2611 return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2612 PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2615 static void __net_exit tcp_sk_exit(struct net *net)
2617 inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2620 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2622 inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2625 static struct pernet_operations __net_initdata tcp_sk_ops = {
2626 .init = tcp_sk_init,
2627 .exit = tcp_sk_exit,
2628 .exit_batch = tcp_sk_exit_batch,
2631 void __init tcp_v4_init(void)
2633 inet_hashinfo_init(&tcp_hashinfo);
2634 if (register_pernet_subsys(&tcp_sk_ops))
2635 panic("Failed to create the TCP control socket.\n");