/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */
#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>
int sysctl_tcp_low_latency __read_mostly;

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
static u32 tcp_v4_init_seq_and_tsoff(const struct sk_buff *skb, u32 *tsoff)
{
	return secure_tcp_seq_and_tsoff(ip_hdr(skb)->daddr,
					ip_hdr(skb)->saddr,
					tcp_hdr(skb)->dest,
					tcp_hdr(skb)->source, tsoff);
}
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
		      get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
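
/* The "+ 65535 + 2" above pushes the new incarnation's write_seq more
 * than one maximum (unscaled) window past the old tw_snd_nxt, so old
 * duplicates cannot land inside the new sequence space even when
 * timestamps are unusable; zero is patched to 1 because a zero
 * write_seq means "unset, pick a fresh ISN" in tcp_v4_connect().
 */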
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	u32 seq;
	struct ip_options_rcu *inet_opt;
	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq	   = 0;
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		seq = secure_tcp_seq_and_tsoff(inet->inet_saddr,
					       inet->inet_daddr,
					       inet->inet_sport,
					       usin->sin_port,
					       &tp->tsoffset);
		if (!tp->write_seq)
			tp->write_seq = seq;
	}

	inet->inet_id = tp->write_seq ^ jiffies;

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
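
/* For orientation: userspace reaches tcp_v4_connect() through the
 * generic connect(2) path (inet_stream_connect() -> sk->sk_prot->connect,
 * wired up via tcp_prot below). A minimal, illustrative caller:
 *
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 */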
/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = tcp_sk(sk)->mtu_info;
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);
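
/* When the socket was owned by user context at ICMP time, tcp_v4_err()
 * only records tp->mtu_info and sets TCP_MTU_REDUCED_DEFERRED; the call
 * above then happens later from tcp_release_cb() once the socket lock
 * is released, exactly as the comment preceding this function says.
 */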
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}
/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *fastopen;
	__u32 seq, snd_una;
	__u32 remaining;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(icmp_skb));
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV)
		return tcp_req_err(sk, seq,
				   type == ICMP_PARAMETERPROB ||
				   type == ICMP_TIME_EXCEEDED ||
				   (type == ICMP_DEST_UNREACH &&
				    (code == ICMP_NET_UNREACH ||
				     code == ICMP_HOST_UNREACH)));

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = tp->fastopen_rsk;
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff || fastopen)
			break;

		if (sock_owned_by_user(sk))
			break;

		icsk->icsk_backoff--;
		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
					       TCP_TIMEOUT_INIT;
		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

		skb = tcp_write_queue_head(sk);
		BUG_ON(!skb);

		remaining = icsk->icsk_rto -
			    min(icsk->icsk_rto,
				tcp_time_stamp - tcp_skb_timestamp(skb));

		if (remaining) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket
		 * is already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters, even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);
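
/* With CHECKSUM_PARTIAL, only the pseudo-header sum is stored in
 * th->check, and csum_start/csum_offset tell the NIC (or the software
 * fallback in skb_checksum_help()) where to fold in the payload sum;
 * the else branch above computes the complete checksum in software.
 */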
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *	So we build the reply based only on parameters that
 *	arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */
static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk is not NULL, it means we did a successful lookup and the incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
	} else if (hash_location) {
		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we are not losing security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb));
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto out;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}
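
/* The seq/ack selection above follows the RFC 793 reset generation
 * rules: if the offending segment carried an ACK, the RST is sent with
 * SEQ = SEG.ACK and no ACK bit; otherwise SEQ is zero and the ACK field
 * is set to SEG.SEQ + SEG.LEN so the peer can validate it.
 */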
/* The code following below, sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is certainly ugly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_time_stamp + tcp_rsk(req)->ts_off,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
					  AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}
/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
	}

	return err;
}
/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}
#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
					 const union tcp_md5_addr *addr,
					 int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;
		if (!memcmp(&key->addr, addr, size))
			return key;
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);
struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;

	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);
/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);
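
/* Note that allocating the first key above also disables GSO on the
 * socket (sk_nocaps_add(NETIF_F_GSO_MASK)): the MD5 signature covers
 * each segment individually, so segments must be built by the stack
 * rather than split later in the NIC.
 */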
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);
static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}
static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
				 int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				      AF_INET);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
			      GFP_KERNEL);
}
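
/* A minimal userspace sketch of installing a key through this path
 * (illustrative only; error handling omitted):
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */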
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
}
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif
/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
				    const struct sk_buff *skb)
{
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
					  AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
				     : "");
		return true;
	}
#endif
	return false;
}
static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	ireq->opt = tcp_v4_save_options(skb);
}
static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct flowi *fl,
					  const struct request_sock *req)
{
	return inet_csk_route_req(sk, &fl->u.ip4, req);
}
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};
static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
	.init_req	=	tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq_tsoff	=	tcp_v4_init_seq_and_tsoff,
	.send_synack	=	tcp_v4_send_synack,
};
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = ireq->opt;
	rcu_assign_pointer(newinet->inet_opt, inet_opt);
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
	if (*own_req)
		tcp_move_syn(newtp, req);

	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
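
/* Error-path note: once tcp_create_openreq_child() has produced newsk,
 * failures go through inet_csk_prepare_forced_close() + tcp_done()
 * (put_and_exit above) rather than a bare sk_free(), since newsk may
 * already be partially hashed and charged.
 */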
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    !dst->ops->check(dst, 0)) {
				dst_release(dst);
				sk->sk_rx_dst = NULL;
			}
		}
		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
void tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif);
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
}
/* Packet is added to VJ-style prequeue for processing in process
 * context, if a reader task is waiting. Apparently, this exciting
 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
 * failed somewhere. Latency? Burstiness? Well, at least now we will
 * see, why it failed. 8)8)				  --ANK
 *
 */
bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (sysctl_tcp_low_latency || !tp->ucopy.task)
		return false;

	if (skb->len <= tcp_hdrlen(skb) &&
	    skb_queue_len(&tp->ucopy.prequeue) == 0)
		return false;

	/* Before escaping RCU protected region, we need to take care of skb
	 * dst. Prequeue is only enabled for established sockets.
	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
	 * Instead of doing full sk_rx_dst validity here, let's perform
	 * an optimistic check.
	 */
	if (likely(sk->sk_rx_dst))
		skb_dst_drop(skb);
	else
		skb_dst_force_safe(skb);

	__skb_queue_tail(&tp->ucopy.prequeue, skb);
	tp->ucopy.memory += skb->truesize;
	if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
	    tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
		struct sk_buff *skb1;

		BUG_ON(sock_owned_by_user(sk));
		__NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
				skb_queue_len(&tp->ucopy.prequeue));

		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
			sk_backlog_rcv(sk, skb1);

		tp->ucopy.memory = 0;
	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
		wake_up_interruptible_sync_poll(sk_sleep(sk),
					   POLLIN | POLLRDNORM | POLLRDBAND);
		if (!inet_csk_ack_scheduled(sk))
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
						  (3 * tcp_rto_min(sk)) / 4,
						  TCP_RTO_MAX);
	}
	return true;
}
EXPORT_SYMBOL(tcp_prequeue);
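
/* The flush threshold above (32 queued skbs, or prequeue memory plus
 * receive-queue memory exceeding sk_rcvbuf) bounds how much unprocessed
 * data a sleeping reader can accumulate before segments are pushed
 * through sk_backlog_rcv() in softirq context instead.
 */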
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;

	/* Only socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Only a few sockets' backlogs are likely to be non-empty
	 * concurrently.
	 */
	limit += 64*1024;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	skb_condense(skb);

	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		bh_unlock_sock(sk);
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		return true;
	}
	return false;
}
EXPORT_SYMBOL(tcp_add_backlog);
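
/* The backlog limit is deliberately generous (rcvbuf + sndbuf plus
 * headroom): only the socket owner can collapse or prune the receive
 * queues, so dropping here would be premature while the owner is busy.
 */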
int tcp_filter(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = (struct tcphdr *)skb->data;
	unsigned int eaten = skb->len;
	int err;

	err = sk_filter_trim_cap(sk, skb, th->doff * 4);
	if (!err) {
		eaten -= skb->len;
		TCP_SKB_CB(skb)->end_seq -= eaten;
	}
	return err;
}
EXPORT_SYMBOL(tcp_filter);
/*
 *	From tcp_input.c
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	const struct iphdr *iph;
	const struct tcphdr *th;
	bool refcounted;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler wont play fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;

lookup:
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
			       th->dest, &refcounted);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		struct sock *nsk;

		sk = req->rsk_listener;
		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			inet_csk_reqsk_queue_drop_and_put(sk, req);
			goto lookup;
		}
		/* We own a reference on the listener, increase it again
		 * as we might lose it too soon.
		 */
		sock_hold(sk);
		refcounted = true;
		nsk = tcp_check_req(sk, skb, req, false);
		if (!nsk) {
			reqsk_put(req);
			goto discard_and_relse;
		}
		if (nsk == sk) {
			reqsk_put(req);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			sock_put(sk);
			return 0;
		}
	}
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard_and_relse;

	nf_reset(skb);

	if (tcp_filter(sk, skb))
		goto discard_and_relse;
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);

	skb->dev = NULL;

	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		if (!tcp_prequeue(sk, skb))
			ret = tcp_v4_do_rcv(sk, skb);
	} else if (tcp_add_backlog(sk, skb)) {
		goto discard_and_relse;
	}
	bh_unlock_sock(sk);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (tcp_checksum_complete(skb)) {
csum_error:
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo, skb,
							__tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb));
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			refcounted = false;
			goto process;
		}
	}
		/* Fall through to ACK */
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};
void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst && dst_hold_safe(dst)) {
		sk->sk_rx_dst = dst;
		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
	}
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);
const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);
#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif
/* NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	/* Cleanup up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(tp->md5sig_info, rcu);
		tp->md5sig_info = NULL;
	}
#endif

	/* Clean prequeue, it must be empty really */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(tp->fastopen_rsk);

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

/*
 * Get next listener socket, following cur.  If cur is NULL, get first socket
 * starting from bucket given in st->bucket; when st->bucket is zero the
 * very first socket in the hash table is returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	struct inet_listen_hashbucket *ilb;
	struct sock *sk = cur;

	if (!sk) {
get_head:
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock(&ilb->lock);
		sk = sk_head(&ilb->head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	sk = sk_next(sk);
get_sk:
	sk_for_each_from(sk) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == st->family)
			return sk;
	}
	spin_unlock(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE)
		goto get_head;
	return NULL;
}
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}
static inline bool empty_bucket(const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}
/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		spin_unlock_bh(lock);
	}
out:
	return rc;
}
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}
static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		/* Fallthrough */
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}
static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}
static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}
static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
	struct tcp_iter_state *s;
	int err;

	err = seq_open_net(inode, file, &afinfo->seq_ops,
			   sizeof(struct tcp_iter_state));
	if (err < 0)
		return err;

	s = ((struct seq_file *)file->private_data)->private;
	s->family		= afinfo->family;
	s->last_pos		= 0;
	return 0;
}
EXPORT_SYMBOL(tcp_seq_open);
int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	afinfo->seq_ops.start		= tcp_seq_start;
	afinfo->seq_ops.next		= tcp_seq_next;
	afinfo->seq_ops.stop		= tcp_seq_stop;

	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
			     afinfo->seq_fops, afinfo);
	if (!p)
		rc = -ENOMEM;
	return rc;
}
EXPORT_SYMBOL(tcp_proc_register);

void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	remove_proc_entry(afinfo->name, net->proc_net);
}
EXPORT_SYMBOL(tcp_proc_unregister);
static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	state = sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = sk->sk_ack_backlog;
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		tp->write_seq - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		atomic_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw);
}
#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}
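
/* For orientation, a typical /proc/net/tcp line produced above (a
 * listener on port 22; values are illustrative, fields match the
 * header printed from tcp4_seq_show()):
 *
 *   0: 00000000:0016 00000000:0000 0A 00000000:00000000 00:00000000
 *      00000000     0        0 12345 1 0000000000000000 100 0 0 10 0
 *
 * Addresses and ports are hexadecimal; state 0A is TCP_LISTEN.
 */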
static const struct file_operations tcp_afinfo_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = tcp_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.name		= "tcp",
	.family		= AF_INET,
	.seq_fops	= &tcp_afinfo_seq_fops,
	.seq_ops	= {
		.show		= tcp4_seq_show,
	},
};
static int __net_init tcp4_proc_init_net(struct net *net)
{
	return tcp_proc_register(net, &tcp4_seq_afinfo);
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	tcp_proc_unregister(net, &tcp4_seq_afinfo);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_DESTROY_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
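
/* tcp_prot is registered from inet_init() in af_inet.c (proto_register()
 * plus the inetsw protocol table), which is how the BSD socket layer maps
 * socket(AF_INET, SOCK_STREAM, 0) onto the handlers above.
 */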
static void __net_exit tcp_sk_exit(struct net *net)
{
	int cpu;

	for_each_possible_cpu(cpu)
		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
	free_percpu(net->ipv4.tcp_sk);
}
static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu, cnt;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}

	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 0;

	cnt = tcp_hashinfo.ehash_mask + 1;
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;

	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);

	return 0;
fail:
	tcp_sk_exit(net);

	return res;
}
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	inet_twsk_purge(&tcp_hashinfo, AF_INET);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
       .init	   = tcp_sk_init,
       .exit	   = tcp_sk_exit,
       .exit_batch = tcp_sk_exit_batch,
};

void __init tcp_v4_init(void)
{
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}