/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol (TCP).
 *
 *		IPv4 specific functions
 *
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */
#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;
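
/* Both knobs above are exported via /proc/sys/net/ipv4/ as tcp_tw_reuse
 * and tcp_low_latency.
 */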
#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
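
/* The ISN for a new connection is derived from the 4-tuple with a keyed
 * hash (RFC 6528 style), so it is hard for an off-path attacker to guess.
 */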
static __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
{
	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
					  ip_hdr(skb)->saddr,
					  tcp_hdr(skb)->dest,
					  tcp_hdr(skb)->source);
}
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap, i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only the timestamp cache is
	   held not per host, but per port pair and the TW bucket is used as
	   state holder.

	   If the TW bucket has already been destroyed we fall back to VJ's
	   scheme and use the initial timestamp retrieved from the peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (sysctl_tcp_tw_reuse &&
			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq	   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
		tcp_fetch_timewait_stamp(sk, &rt->dst);

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables
	 * and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket. */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);

	if (!tp->write_seq && likely(!tp->repair))
		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
							   inet->inet_daddr,
							   inet->inet_sport,
							   usin->sin_port);

	inet->inet_id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);

	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
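
/* Userspace sketch (not part of this file): a plain connect() on a TCP
 * socket is what ultimately lands in tcp_v4_connect() above, e.g.:
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 */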
/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);
	u32 mtu = tcp_sk(sk)->mtu_info;

	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to go wrong... Remember the soft error
	 * for the case this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}

/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */
void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *fastopen;
	__u32 seq, snd_una;
	__u32 remaining;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(icmp_skb));
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV)
		return tcp_req_err(sk, seq,
				   type == ICMP_PARAMETERPROB ||
				   type == ICMP_TIME_EXCEEDED ||
				   (type == ICMP_DEST_UNREACH &&
				    (code == ICMP_NET_UNREACH ||
				     code == ICMP_HOST_UNREACH)));

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = tp->fastopen_rsk;
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576 bytes, so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff || fastopen)
			break;

		if (sock_owned_by_user(sk))
			break;

		icsk->icsk_backoff--;
		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
					       TCP_TIMEOUT_INIT;
		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

		skb = tcp_write_queue_head(sk);
		BUG_ON(!skb);

		remaining = icsk->icsk_rto -
			    min(icsk->icsk_rto,
				tcp_time_stamp - tcp_skb_timestamp(skb));

		if (remaining) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now. */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket
		 * is already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows us to treat as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note that in the modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters, even these two messages finally
	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
	 *
	 * Now we are in compliance with RFCs.
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);
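
/* Two cases above: with CHECKSUM_PARTIAL only the pseudo-header checksum
 * is stored in th->check and the device folds in the header and payload;
 * otherwise the full checksum is computed in software via csum_partial().
 */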
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just a duplicate segment or a bug in the other side's
 *		TCP. So we build the reply based only on the parameters that
 *		arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}
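
	/* Note on the ack_seq computation above: a segment with no ACK is
	 * acknowledged for everything it occupied in sequence space (SYN
	 * and FIN each count for one), so the peer accepts this RST under
	 * RFC 793's acceptability rules.
	 */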
	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
	} else if (hash_location) {
		/*
		 * The active side is lost. Try to find the listening socket
		 * through the source port, and then find the md5 key through
		 * the listening socket. We do not lose security here:
		 * the incoming packet is checked against the md5 hash of the
		 * found key; no RST is generated if the md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb));
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto out;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When the socket is gone, all binding information is lost.
	 * Routing might fail in this case. No choice here: if we choose to
	 * force the input interface, we will misroute in case of an
	 * asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	local_bh_disable();
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();
#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(struct net *net,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct ip_reply_arg arg;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}
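
	/* The block built above when tsecr is set is the canonical aligned
	 * timestamp option: NOP, NOP, TIMESTAMP, length 10, then TSval and
	 * TSecr, 12 bytes in total.
	 */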
	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	local_bh_disable();
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sock_net(sk), skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
	tcp_v4_send_ack(sock_net(sk), skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_time_stamp,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
					  AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}
/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
	}

	return err;
}
/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
					 const union tcp_md5_addr *addr,
					 int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;
		if (!memcmp(&key->addr, addr, size))
			return key;
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);
struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;

	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}
static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
				 int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				      AF_INET);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
			      GFP_KERNEL);
}
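
/* Userspace sketch (not part of this file): keys are installed with
 * setsockopt(TCP_MD5SIG), using the uapi struct tcp_md5sig that the
 * parser above copies in. A minimal example:
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *a = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	a->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &a->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * A zero tcpm_keylen deletes the key for that peer, matching the
 * tcp_md5_do_del() branch in tcp_v4_parse_md5_keys() above.
 */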
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
}
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);

		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
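
/* The hashing order above follows RFC 2385: pseudo-header first, then the
 * TCP header with its checksum field zeroed, then the segment data, and
 * finally the key itself.
 */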
#endif

/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
				    const struct sk_buff *skb)
{
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
					  AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
				     : "");
		return true;
	}
#endif
	return false;
}
static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	ireq->opt = tcp_v4_save_options(skb);
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct flowi *fl,
					  const struct request_sock *req,
					  bool *strict)
{
	struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);

	if (strict) {
		if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
			*strict = true;
		else
			*strict = false;
	}

	return dst;
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
	.init_req	=	tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_sequence,
	.send_synack	=	tcp_v4_send_synack,
};
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = ireq->opt;
	rcu_assign_pointer(newinet->inet_opt, inet_opt);
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric_advmss(dst);
	if (tcp_sk(sk)->rx_opt.user_mss &&
	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
	if (*own_req)
		tcp_move_syn(newtp, req);

	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}

/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    !dst->ops->check(dst, 0)) {
				dst_release(dst);
				sk->sk_rx_dst = NULL;
			}
		}
		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			sock_rps_save_rxhash(nsk, skb);
			sk_mark_napi_id(nsk, skb);
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
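
/* Early demux: invoked from the IP input path before routing. If the
 * segment belongs to an established socket we can attach that socket's
 * cached dst to the skb and skip a full route lookup.
 */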
void tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return;
	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);
	if (th->doff < sizeof(struct tcphdr) / 4)
		return;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif);
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
}
/* Packet is added to VJ-style prequeue for processing in process
 * context, if a reader task is waiting. Apparently, this exciting
 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
 * failed somewhere. Latency? Burstiness? Well, at least now we will
 * see, why it failed. 8)8)				  --ANK
 */
bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (sysctl_tcp_low_latency || !tp->ucopy.task)
		return false;

	if (skb->len <= tcp_hdrlen(skb) &&
	    skb_queue_len(&tp->ucopy.prequeue) == 0)
		return false;

	/* Before escaping RCU protected region, we need to take care of skb
	 * dst. Prequeue is only enabled for established sockets.
	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst.
	 * Instead of doing a full sk_rx_dst validity check here, let's perform
	 * an optimistic one.
	 */
	if (likely(sk->sk_rx_dst))
		skb_dst_drop(skb);
	else
		skb_dst_force_safe(skb);

	__skb_queue_tail(&tp->ucopy.prequeue, skb);
	tp->ucopy.memory += skb->truesize;
	if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
	    tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
		struct sk_buff *skb1;

		BUG_ON(sock_owned_by_user(sk));
		__NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
				skb_queue_len(&tp->ucopy.prequeue));

		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
			sk_backlog_rcv(sk, skb1);

		tp->ucopy.memory = 0;
	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
		wake_up_interruptible_sync_poll(sk_sleep(sk),
					   POLLIN | POLLRDNORM | POLLRDBAND);
		if (!inet_csk_ack_scheduled(sk))
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
						  (3 * tcp_rto_min(sk)) / 4,
						  TCP_RTO_MAX);
	}
	return true;
}
EXPORT_SYMBOL(tcp_prequeue);
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;

	/* Only the socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Only a few sockets' backlogs are likely to be non-empty
	 * concurrently.
	 */
	limit += 64*1024;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	if (!skb->data_len)
		skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));

	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		bh_unlock_sock(sk);
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		return true;
	}
	return false;
}
EXPORT_SYMBOL(tcp_add_backlog);
int tcp_filter(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = (struct tcphdr *)skb->data;
	unsigned int eaten = skb->len;
	int err;

	err = sk_filter_trim_cap(sk, skb, th->doff * 4);
	if (!err) {
		eaten -= skb->len;
		TCP_SKB_CB(skb)->end_seq -= eaten;
	}
	return err;
}
EXPORT_SYMBOL(tcp_filter);
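
/* Main IPv4 receive entry point: called from the IP layer for every
 * TCP segment addressed to this host, before any socket lookup.
 */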
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	const struct iphdr *iph;
	const struct tcphdr *th;
	bool refcounted;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler wont play fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;

lookup:
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
			       th->dest, &refcounted);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		struct sock *nsk;

		sk = req->rsk_listener;
		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			inet_csk_reqsk_queue_drop_and_put(sk, req);
			goto lookup;
		}
		/* We own a reference on the listener, increase it again
		 * as we might lose it too soon.
		 */
		sock_hold(sk);
		refcounted = true;
		nsk = tcp_check_req(sk, skb, req, false);
		if (!nsk) {
			reqsk_put(req);
			goto discard_and_relse;
		}
		if (nsk == sk) {
			reqsk_put(req);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			sock_put(sk);
			return 0;
		}
	}
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard_and_relse;

	nf_reset(skb);

	if (tcp_filter(sk, skb))
		goto discard_and_relse;
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);

	skb->dev = NULL;

	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		if (!tcp_prequeue(sk, skb))
			ret = tcp_v4_do_rcv(sk, skb);
	} else if (tcp_add_backlog(sk, skb)) {
		goto discard_and_relse;
	}
	bh_unlock_sock(sk);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (tcp_checksum_complete(skb)) {
csum_error:
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo, skb,
							__tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb));
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			refcounted = false;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};

void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst && dst_hold_safe(dst)) {
		sk->sk_rx_dst = dst;
		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
	}
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);

const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.bind_conflict	   = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);
#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif

/* NOTE: A lot of things are set to zero explicitly by the call to
 *       sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(tp->md5sig_info, rcu);
		tp->md5sig_info = NULL;
	}
#endif

	/* Clean prequeue, it must be empty really */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(tp->fastopen_rsk);

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_saved_syn_free(tp);

	local_bh_disable();
	sk_sockets_allocated_dec(sk);
	local_bh_enable();
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

/*
 * Get next listener socket following cur.  If cur is NULL, get the first
 * socket starting from the bucket given in st->bucket; when st->bucket is
 * zero the very first socket in the hash table is returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	struct inet_listen_hashbucket *ilb;
	struct sock *sk = cur;

	if (!sk) {
get_head:
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_head(&ilb->head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	sk = sk_next(sk);
get_sk:
	sk_for_each_from(sk) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == st->family)
			return sk;
	}
	spin_unlock_bh(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE)
		goto get_head;
	return NULL;
}

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static inline bool empty_bucket(const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}
/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		spin_unlock_bh(lock);
	}
out:
	return rc;
}

static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}

static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		/* Fallthrough */
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}
static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}

static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}

static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
	struct tcp_iter_state *s;
	int err;

	err = seq_open_net(inode, file, &afinfo->seq_ops,
			   sizeof(struct tcp_iter_state));
	if (err < 0)
		return err;

	s = ((struct seq_file *)file->private_data)->private;
	s->family		= afinfo->family;
	s->last_pos		= 0;
	return 0;
}
EXPORT_SYMBOL(tcp_seq_open);

int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	afinfo->seq_ops.start		= tcp_seq_start;
	afinfo->seq_ops.next		= tcp_seq_next;
	afinfo->seq_ops.stop		= tcp_seq_stop;

	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
			     afinfo->seq_fops, afinfo);
	if (!p)
		rc = -ENOMEM;
	return rc;
}
EXPORT_SYMBOL(tcp_proc_register);

void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	remove_proc_entry(afinfo->name, net->proc_net);
}
EXPORT_SYMBOL(tcp_proc_unregister);
static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	state = sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = sk->sk_ack_backlog;
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		tp->write_seq - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		atomic_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}
static const struct file_operations tcp_afinfo_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = tcp_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.name		= "tcp",
	.family		= AF_INET,
	.seq_fops	= &tcp_afinfo_seq_fops,
	.seq_ops	= {
		.show		= tcp4_seq_show,
	},
};

static int __net_init tcp4_proc_init_net(struct net *net)
{
	return tcp_proc_register(net, &tcp4_seq_afinfo);
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	tcp_proc_unregister(net, &tcp4_seq_afinfo);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_DESTROY_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
static void __net_exit tcp_sk_exit(struct net *net)
{
	int cpu;

	for_each_possible_cpu(cpu)
		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
	free_percpu(net->ipv4.tcp_sk);
}
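
/* Per-netns setup: one control socket per possible CPU. These are the
 * sockets ip_send_unicast_reply() uses above to emit RSTs and ACKs
 * outside of any user socket context.
 */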
static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}

	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;

	return 0;
fail:
	tcp_sk_exit(net);

	return res;
}
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
       .init	   = tcp_sk_init,
       .exit	   = tcp_sk_exit,
       .exit_batch = tcp_sk_exit_batch,
};

void __init tcp_v4_init(void)
{
	inet_hashinfo_init(&tcp_hashinfo);
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}