net/ipv4/tcp_ipv4.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  *              IPv4 specific functions
   9  *
  10  *
  11  *              code split from:
  12  *              linux/ipv4/tcp.c
  13  *              linux/ipv4/tcp_input.c
  14  *              linux/ipv4/tcp_output.c
  15  *
  16  *              See tcp.c for author information
  17  *
  18  *      This program is free software; you can redistribute it and/or
  19  *      modify it under the terms of the GNU General Public License
  20  *      as published by the Free Software Foundation; either version
  21  *      2 of the License, or (at your option) any later version.
  22  */
  23
  24 /*
  25  * Changes:
  26  *              David S. Miller :       New socket lookup architecture.
  27  *                                      This code is dedicated to John Dyson.
  28  *              David S. Miller :       Change semantics of established hash,
  29  *                                      half is devoted to TIME_WAIT sockets
  30  *                                      and the rest go in the other half.
  31  *              Andi Kleen :            Add support for syncookies and fixed
  32  *                                      some bugs: ip options weren't passed to
  33  *                                      the TCP layer, missed a check for an
  34  *                                      ACK bit.
  35  *              Andi Kleen :            Implemented fast path mtu discovery.
  36  *                                      Fixed many serious bugs in the
  37  *                                      request_sock handling and moved
  38  *                                      most of it into the af independent code.
  39  *                                      Added tail drop and some other bugfixes.
  40  *                                      Added new listen semantics.
  41  *              Mike McLagan    :       Routing by source
  42  *      Juan Jose Ciarlante:            ip_dynaddr bits
  43  *              Andi Kleen:             various fixes.
  44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  45  *                                      coma.
  46  *      Andi Kleen              :       Fix new listen.
  47  *      Andi Kleen              :       Fix accept error reporting.
  48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  50  *                                      a single port at the same time.
  51  */
  52
  53 #define pr_fmt(fmt) "TCP: " fmt
  54
  55 #include <linux/bottom_half.h>
  56 #include <linux/types.h>
  57 #include <linux/fcntl.h>
  58 #include <linux/module.h>
  59 #include <linux/random.h>
  60 #include <linux/cache.h>
  61 #include <linux/jhash.h>
  62 #include <linux/init.h>
  63 #include <linux/times.h>
  64 #include <linux/slab.h>
  65
  66 #include <net/net_namespace.h>
  67 #include <net/icmp.h>
  68 #include <net/inet_hashtables.h>
  69 #include <net/tcp.h>
  70 #include <net/transp_v6.h>
  71 #include <net/ipv6.h>
  72 #include <net/inet_common.h>
  73 #include <net/timewait_sock.h>
  74 #include <net/xfrm.h>
  75 #include <net/secure_seq.h>
  76 #include <net/busy_poll.h>
  77
  78 #include <linux/inet.h>
  79 #include <linux/ipv6.h>
  80 #include <linux/stddef.h>
  81 #include <linux/proc_fs.h>
  82 #include <linux/seq_file.h>
  83
  84 #include <crypto/hash.h>
  85 #include <linux/scatterlist.h>
  86
  87 int sysctl_tcp_low_latency __read_mostly;
  88
  89 #ifdef CONFIG_TCP_MD5SIG
  90 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
  91                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
  92 #endif
  93
  94 struct inet_hashinfo tcp_hashinfo;
  95 EXPORT_SYMBOL(tcp_hashinfo);
  96
  97 static u32 tcp_v4_init_sequence(const struct sk_buff *skb, u32 *tsoff)
  98 {
  99         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
 100                                           ip_hdr(skb)->saddr,
 101                                           tcp_hdr(skb)->dest,
 102                                           tcp_hdr(skb)->source, tsoff);
 103 }
 104
 105 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 106 {
 107         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 108         struct tcp_sock *tp = tcp_sk(sk);
 109
 110         /* With PAWS, it is safe from the viewpoint
 111            of data integrity. Even without PAWS it is safe provided sequence
 112            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 113
 114            Actually, the idea is close to VJ's one, only timestamp cache is
 115            held not per host, but per port pair and TW bucket is used as state
 116            holder.
 117
 118            If TW bucket has been already destroyed we fall back to VJ's scheme
 119            and use initial timestamp retrieved from peer table.
 120          */
 121         if (tcptw->tw_ts_recent_stamp &&
 122             (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
 123                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 124                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 125                 if (tp->write_seq == 0)
 126                         tp->write_seq = 1;
 127                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 128                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 129                 sock_hold(sktw);
 130                 return 1;
 131         }
 132
 133         return 0;
 134 }
 135 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 136
 137 /* This will initiate an outgoing connection. */
 138 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 139 {
 140         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 141         struct inet_sock *inet = inet_sk(sk);
 142         struct tcp_sock *tp = tcp_sk(sk);
 143         __be16 orig_sport, orig_dport;
 144         __be32 daddr, nexthop;
 145         struct flowi4 *fl4;
 146         struct rtable *rt;
 147         int err;
 148         u32 seq;
 149         struct ip_options_rcu *inet_opt;
 150         struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
 151
 152         if (addr_len < sizeof(struct sockaddr_in))
 153                 return -EINVAL;
 154
 155         if (usin->sin_family != AF_INET)
 156                 return -EAFNOSUPPORT;
 157
 158         nexthop = daddr = usin->sin_addr.s_addr;
 159         inet_opt = rcu_dereference_protected(inet->inet_opt,
 160                                              lockdep_sock_is_held(sk));
 161         if (inet_opt && inet_opt->opt.srr) {
 162                 if (!daddr)
 163                         return -EINVAL;
 164                 nexthop = inet_opt->opt.faddr;
 165         }
 166
 167         orig_sport = inet->inet_sport;
 168         orig_dport = usin->sin_port;
 169         fl4 = &inet->cork.fl.u.ip4;
 170         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 171                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 172                               IPPROTO_TCP,
 173                               orig_sport, orig_dport, sk);
 174         if (IS_ERR(rt)) {
 175                 err = PTR_ERR(rt);
 176                 if (err == -ENETUNREACH)
 177                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 178                 return err;
 179         }
 180
 181         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 182                 ip_rt_put(rt);
 183                 return -ENETUNREACH;
 184         }
 185
 186         if (!inet_opt || !inet_opt->opt.srr)
 187                 daddr = fl4->daddr;
 188
 189         if (!inet->inet_saddr)
 190                 inet->inet_saddr = fl4->saddr;
 191         sk_rcv_saddr_set(sk, inet->inet_saddr);
 192
 193         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 194                 /* Reset inherited state */
 195                 tp->rx_opt.ts_recent       = 0;
 196                 tp->rx_opt.ts_recent_stamp = 0;
 197                 if (likely(!tp->repair))
 198                         tp->write_seq      = 0;
 199         }
 200
 201         if (tcp_death_row->sysctl_tw_recycle &&
 202             !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
 203                 tcp_fetch_timewait_stamp(sk, &rt->dst);
 204
 205         inet->inet_dport = usin->sin_port;
 206         sk_daddr_set(sk, daddr);
 207
 208         inet_csk(sk)->icsk_ext_hdr_len = 0;
 209         if (inet_opt)
 210                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 211
 212         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 213
 214         /* Socket identity is still unknown (sport may be zero).
 215          * However we set state to SYN-SENT and not releasing socket
 216          * lock select source port, enter ourselves into the hash tables and
 217          * complete initialization after this.
 218          */
 219         tcp_set_state(sk, TCP_SYN_SENT);
 220         err = inet_hash_connect(tcp_death_row, sk);
 221         if (err)
 222                 goto failure;
 223
 224         sk_set_txhash(sk);
 225
 226         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 227                                inet->inet_sport, inet->inet_dport, sk);
 228         if (IS_ERR(rt)) {
 229                 err = PTR_ERR(rt);
 230                 rt = NULL;
 231                 goto failure;
 232         }
 233         /* OK, now commit destination to socket.  */
 234         sk->sk_gso_type = SKB_GSO_TCPV4;
 235         sk_setup_caps(sk, &rt->dst);
 236         rt = NULL;
 237
 238         if (likely(!tp->repair)) {
 239                 seq = secure_tcp_sequence_number(inet->inet_saddr,
 240                                                  inet->inet_daddr,
 241                                                  inet->inet_sport,
 242                                                  usin->sin_port,
 243                                                  &tp->tsoffset);
 244                 if (!tp->write_seq)
 245                         tp->write_seq = seq;
 246         }
 247
 248         inet->inet_id = tp->write_seq ^ jiffies;
 249
 250         if (tcp_fastopen_defer_connect(sk, &err))
 251                 return err;
 252         if (err)
 253                 goto failure;
 254
 255         err = tcp_connect(sk);
 256
 257         if (err)
 258                 goto failure;
 259
 260         return 0;
 261
 262 failure:
 263         /*
 264          * This unhashes the socket and releases the local port,
 265          * if necessary.
 266          */
 267         tcp_set_state(sk, TCP_CLOSE);
 268         ip_rt_put(rt);
 269         sk->sk_route_caps = 0;
 270         inet->inet_dport = 0;
 271         return err;
 272 }
 273 EXPORT_SYMBOL(tcp_v4_connect);
 274
 275 /*
 276  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 277  * It can be called through tcp_release_cb() if socket was owned by user
 278  * at the time tcp_v4_err() was called to handle ICMP message.
 279  */
 280 void tcp_v4_mtu_reduced(struct sock *sk)
 281 {
 282         struct dst_entry *dst;
 283         struct inet_sock *inet = inet_sk(sk);
 284         u32 mtu = tcp_sk(sk)->mtu_info;
 285
 286         dst = inet_csk_update_pmtu(sk, mtu);
 287         if (!dst)
 288                 return;
 289
 290         /* Something is about to be wrong... Remember soft error
 291          * for the case, if this connection will not able to recover.
 292          */
 293         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 294                 sk->sk_err_soft = EMSGSIZE;
 295
 296         mtu = dst_mtu(dst);
 297
 298         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 299             ip_sk_accept_pmtu(sk) &&
 300             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 301                 tcp_sync_mss(sk, mtu);
 302
 303                 /* Resend the TCP packet because it's
 304                  * clear that the old packet has been
 305                  * dropped. This is the new "fast" path mtu
 306                  * discovery.
 307                  */
 308                 tcp_simple_retransmit(sk);
 309         } /* else let the usual retransmit timer handle it */
 310 }
 311 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
 312
 313 static void do_redirect(struct sk_buff *skb, struct sock *sk)
 314 {
 315         struct dst_entry *dst = __sk_dst_check(sk, 0);
 316
 317         if (dst)
 318                 dst->ops->redirect(dst, sk, skb);
 319 }
 320
 321
 322 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
 323 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
 324 {
 325         struct request_sock *req = inet_reqsk(sk);
 326         struct net *net = sock_net(sk);
 327
 328         /* ICMPs are not backlogged, hence we cannot get
 329          * an established socket here.
 330          */
 331         if (seq != tcp_rsk(req)->snt_isn) {
 332                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 333         } else if (abort) {
 334                 /*
 335                  * Still in SYN_RECV, just remove it silently.
 336                  * There is no good way to pass the error to the newly
 337                  * created socket, and POSIX does not want network
 338                  * errors returned from accept().
 339                  */
 340                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
 341                 tcp_listendrop(req->rsk_listener);
 342         }
 343         reqsk_put(req);
 344 }
 345 EXPORT_SYMBOL(tcp_req_err);
 346
 347 /*
 348  * This routine is called by the ICMP module when it gets some
 349  * sort of error condition.  If err < 0 then the socket should
 350  * be closed and the error returned to the user.  If err > 0
 351  * it's just the icmp type << 8 | icmp code.  After adjustment
 352  * header points to the first 8 bytes of the tcp header.  We need
 353  * to find the appropriate port.
 354  *
 355  * The locking strategy used here is very "optimistic". When
 356  * someone else accesses the socket the ICMP is just dropped
 357  * and for some paths there is no check at all.
 358  * A more general error queue to queue errors for later handling
 359  * is probably better.
 360  *
 361  */
 362
 363 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 364 {
 365         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
 366         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 367         struct inet_connection_sock *icsk;
 368         struct tcp_sock *tp;
 369         struct inet_sock *inet;
 370         const int type = icmp_hdr(icmp_skb)->type;
 371         const int code = icmp_hdr(icmp_skb)->code;
 372         struct sock *sk;
 373         struct sk_buff *skb;
 374         struct request_sock *fastopen;
 375         __u32 seq, snd_una;
 376         __u32 remaining;
 377         int err;
 378         struct net *net = dev_net(icmp_skb->dev);
 379
 380         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
 381                                        th->dest, iph->saddr, ntohs(th->source),
 382                                        inet_iif(icmp_skb));
 383         if (!sk) {
 384                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
 385                 return;
 386         }
 387         if (sk->sk_state == TCP_TIME_WAIT) {
 388                 inet_twsk_put(inet_twsk(sk));
 389                 return;
 390         }
 391         seq = ntohl(th->seq);
 392         if (sk->sk_state == TCP_NEW_SYN_RECV)
 393                 return tcp_req_err(sk, seq,
 394                                   type == ICMP_PARAMETERPROB ||
 395                                   type == ICMP_TIME_EXCEEDED ||
 396                                   (type == ICMP_DEST_UNREACH &&
 397                                    (code == ICMP_NET_UNREACH ||
 398                                     code == ICMP_HOST_UNREACH)));
 399
 400         bh_lock_sock(sk);
 401         /* If too many ICMPs get dropped on busy
 402          * servers this needs to be solved differently.
 403          * We do take care of PMTU discovery (RFC1191) special case :
 404          * we can receive locally generated ICMP messages while socket is held.
 405          */
 406         if (sock_owned_by_user(sk)) {
 407                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
 408                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
 409         }
 410         if (sk->sk_state == TCP_CLOSE)
 411                 goto out;
 412
 413         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 414                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
 415                 goto out;
 416         }
 417
 418         icsk = inet_csk(sk);
 419         tp = tcp_sk(sk);
 420         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
 421         fastopen = tp->fastopen_rsk;
 422         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
 423         if (sk->sk_state != TCP_LISTEN &&
 424             !between(seq, snd_una, tp->snd_nxt)) {
 425                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 426                 goto out;
 427         }
 428
 429         switch (type) {
 430         case ICMP_REDIRECT:
 431                 do_redirect(icmp_skb, sk);
 432                 goto out;
 433         case ICMP_SOURCE_QUENCH:
 434                 /* Just silently ignore these. */
 435                 goto out;
 436         case ICMP_PARAMETERPROB:
 437                 err = EPROTO;
 438                 break;
 439         case ICMP_DEST_UNREACH:
 440                 if (code > NR_ICMP_UNREACH)
 441                         goto out;
 442
 443                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 444                         /* We are not interested in TCP_LISTEN and open_requests
 445                          * (SYN-ACKs send out by Linux are always <576bytes so
 446                          * they should go through unfragmented).
 447                          */
 448                         if (sk->sk_state == TCP_LISTEN)
 449                                 goto out;
 450
 451                         tp->mtu_info = info;
 452                         if (!sock_owned_by_user(sk)) {
 453                                 tcp_v4_mtu_reduced(sk);
 454                         } else {
 455                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
 456                                         sock_hold(sk);
 457                         }
 458                         goto out;
 459                 }
 460
 461                 err = icmp_err_convert[code].errno;
 462                 /* check if icmp_skb allows revert of backoff
 463                  * (see draft-zimmermann-tcp-lcd) */
 464                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 465                         break;
 466                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 467                     !icsk->icsk_backoff || fastopen)
 468                         break;
 469
 470                 if (sock_owned_by_user(sk))
 471                         break;
 472
 473                 icsk->icsk_backoff--;
 474                 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
 475                                                TCP_TIMEOUT_INIT;
 476                 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
 477
 478                 skb = tcp_write_queue_head(sk);
 479                 BUG_ON(!skb);
 480
 481                 remaining = icsk->icsk_rto -
 482                             min(icsk->icsk_rto,
 483                                 tcp_time_stamp - tcp_skb_timestamp(skb));
 484
 485                 if (remaining) {
 486                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 487                                                   remaining, TCP_RTO_MAX);
 488                 } else {
 489                         /* RTO revert clocked out retransmission.
 490                          * Will retransmit now */
 491                         tcp_retransmit_timer(sk);
 492                 }
 493
 494                 break;
 495         case ICMP_TIME_EXCEEDED:
 496                 err = EHOSTUNREACH;
 497                 break;
 498         default:
 499                 goto out;
 500         }
 501
 502         switch (sk->sk_state) {
 503         case TCP_SYN_SENT:
 504         case TCP_SYN_RECV:
 505                 /* Only in fast or simultaneous open. If a fast open socket is
 506                  * is already accepted it is treated as a connected one below.
 507                  */
 508                 if (fastopen && !fastopen->sk)
 509                         break;
 510
 511                 if (!sock_owned_by_user(sk)) {
 512                         sk->sk_err = err;
 513
 514                         sk->sk_error_report(sk);
 515
 516                         tcp_done(sk);
 517                 } else {
 518                         sk->sk_err_soft = err;
 519                 }
 520                 goto out;
 521         }
 522
 523         /* If we've already connected we will keep trying
 524          * until we time out, or the user gives up.
 525          *
 526          * rfc1122 4.2.3.9 allows to consider as hard errors
 527          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 528          * but it is obsoleted by pmtu discovery).
 529          *
 530          * Note, that in modern internet, where routing is unreliable
 531          * and in each dark corner broken firewalls sit, sending random
 532          * errors ordered by their masters even this two messages finally lose
 533          * their original sense (even Linux sends invalid PORT_UNREACHs)
 534          *
 535          * Now we are in compliance with RFCs.
 536          *                                                      --ANK (980905)
 537          */
 538
 539         inet = inet_sk(sk);
 540         if (!sock_owned_by_user(sk) && inet->recverr) {
 541                 sk->sk_err = err;
 542                 sk->sk_error_report(sk);
 543         } else  { /* Only an error on timeout */
 544                 sk->sk_err_soft = err;
 545         }
 546
 547 out:
 548         bh_unlock_sock(sk);
 549         sock_put(sk);
 550 }
 551
 552 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 553 {
 554         struct tcphdr *th = tcp_hdr(skb);
 555
 556         if (skb->ip_summed == CHECKSUM_PARTIAL) {
 557                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 558                 skb->csum_start = skb_transport_header(skb) - skb->head;
 559                 skb->csum_offset = offsetof(struct tcphdr, check);
 560         } else {
 561                 th->check = tcp_v4_check(skb->len, saddr, daddr,
 562                                          csum_partial(th,
 563                                                       th->doff << 2,
 564                                                       skb->csum));
 565         }
 566 }
 567
 568 /* This routine computes an IPv4 TCP checksum. */
 569 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 570 {
 571         const struct inet_sock *inet = inet_sk(sk);
 572
 573         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 574 }
 575 EXPORT_SYMBOL(tcp_v4_send_check);
 576
 577 /*
 578  *      This routine will send an RST to the other tcp.
 579  *
 580  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 581  *                    for reset.
 582  *      Answer: if a packet caused RST, it is not for a socket
 583  *              existing in our system, if it is matched to a socket,
 584  *              it is just duplicate segment or bug in other side's TCP.
 585  *              So that we build reply only basing on parameters
 586  *              arrived with segment.
 587  *      Exception: precedence violation. We do not implement it in any case.
 588  */
 589
 590 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 591 {
 592         const struct tcphdr *th = tcp_hdr(skb);
 593         struct {
 594                 struct tcphdr th;
 595 #ifdef CONFIG_TCP_MD5SIG
 596                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 597 #endif
 598         } rep;
 599         struct ip_reply_arg arg;
 600 #ifdef CONFIG_TCP_MD5SIG
 601         struct tcp_md5sig_key *key = NULL;
 602         const __u8 *hash_location = NULL;
 603         unsigned char newhash[16];
 604         int genhash;
 605         struct sock *sk1 = NULL;
 606 #endif
 607         struct net *net;
 608
 609         /* Never send a reset in response to a reset. */
 610         if (th->rst)
 611                 return;
 612
 613         /* If sk not NULL, it means we did a successful lookup and incoming
 614          * route had to be correct. prequeue might have dropped our dst.
 615          */
 616         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
 617                 return;
 618
 619         /* Swap the send and the receive. */
 620         memset(&rep, 0, sizeof(rep));
 621         rep.th.dest   = th->source;
 622         rep.th.source = th->dest;
 623         rep.th.doff   = sizeof(struct tcphdr) / 4;
 624         rep.th.rst    = 1;
 625
 626         if (th->ack) {
 627                 rep.th.seq = th->ack_seq;
 628         } else {
 629                 rep.th.ack = 1;
 630                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 631                                        skb->len - (th->doff << 2));
 632         }
 633
 634         memset(&arg, 0, sizeof(arg));
 635         arg.iov[0].iov_base = (unsigned char *)&rep;
 636         arg.iov[0].iov_len  = sizeof(rep.th);
 637
 638         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
 639 #ifdef CONFIG_TCP_MD5SIG
 640         rcu_read_lock();
 641         hash_location = tcp_parse_md5sig_option(th);
 642         if (sk && sk_fullsock(sk)) {
 643                 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
 644                                         &ip_hdr(skb)->saddr, AF_INET);
 645         } else if (hash_location) {
 646                 /*
 647                  * active side is lost. Try to find listening socket through
 648                  * source port, and then find md5 key through listening socket.
 649                  * we are not loose security here:
 650                  * Incoming packet is checked with md5 hash with finding key,
 651                  * no RST generated if md5 hash doesn't match.
 652                  */
 653                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
 654                                              ip_hdr(skb)->saddr,
 655                                              th->source, ip_hdr(skb)->daddr,
 656                                              ntohs(th->source), inet_iif(skb));
 657                 /* don't send rst if it can't find key */
 658                 if (!sk1)
 659                         goto out;
 660
 661                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
 662                                         &ip_hdr(skb)->saddr, AF_INET);
 663                 if (!key)
 664                         goto out;
 665
 666
 667                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
 668                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
 669                         goto out;
 670
 671         }
 672
 673         if (key) {
 674                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 675                                    (TCPOPT_NOP << 16) |
 676                                    (TCPOPT_MD5SIG << 8) |
 677                                    TCPOLEN_MD5SIG);
 678                 /* Update length and the length the header thinks exists */
 679                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 680                 rep.th.doff = arg.iov[0].iov_len / 4;
 681
 682                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 683                                      key, ip_hdr(skb)->saddr,
 684                                      ip_hdr(skb)->daddr, &rep.th);
 685         }
 686 #endif
 687         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 688                                       ip_hdr(skb)->saddr, /* XXX */
 689                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 690         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 691         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 692
 693         /* When socket is gone, all binding information is lost.
 694          * routing might fail in this case. No choice here, if we choose to force
 695          * input interface, we will misroute in case of asymmetric route.
 696          */
 697         if (sk)
 698                 arg.bound_dev_if = sk->sk_bound_dev_if;
 699
 700         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 701                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
 702
 703         arg.tos = ip_hdr(skb)->tos;
 704         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
 705         local_bh_disable();
 706         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
 707                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
 708                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 709                               &arg, arg.iov[0].iov_len);
 710
 711         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 712         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
 713         local_bh_enable();
 714
 715 #ifdef CONFIG_TCP_MD5SIG
 716 out:
 717         rcu_read_unlock();
 718 #endif
 719 }
 720
 721 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 722    outside socket context is ugly, certainly. What can I do?
 723  */
 724
 725 static void tcp_v4_send_ack(const struct sock *sk,
 726                             struct sk_buff *skb, u32 seq, u32 ack,
 727                             u32 win, u32 tsval, u32 tsecr, int oif,
 728                             struct tcp_md5sig_key *key,
 729                             int reply_flags, u8 tos)
 730 {
 731         const struct tcphdr *th = tcp_hdr(skb);
 732         struct {
 733                 struct tcphdr th;
 734                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 735 #ifdef CONFIG_TCP_MD5SIG
 736                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 737 #endif
 738                         ];
 739         } rep;
 740         struct net *net = sock_net(sk);
 741         struct ip_reply_arg arg;
 742
 743         memset(&rep.th, 0, sizeof(struct tcphdr));
 744         memset(&arg, 0, sizeof(arg));
 745
 746         arg.iov[0].iov_base = (unsigned char *)&rep;
 747         arg.iov[0].iov_len  = sizeof(rep.th);
 748         if (tsecr) {
 749                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 750                                    (TCPOPT_TIMESTAMP << 8) |
 751                                    TCPOLEN_TIMESTAMP);
 752                 rep.opt[1] = htonl(tsval);
 753                 rep.opt[2] = htonl(tsecr);
 754                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 755         }
 756
 757         /* Swap the send and the receive. */
 758         rep.th.dest    = th->source;
 759         rep.th.source  = th->dest;
 760         rep.th.doff    = arg.iov[0].iov_len / 4;
 761         rep.th.seq     = htonl(seq);
 762         rep.th.ack_seq = htonl(ack);
 763         rep.th.ack     = 1;
 764         rep.th.window  = htons(win);
 765
 766 #ifdef CONFIG_TCP_MD5SIG
 767         if (key) {
 768                 int offset = (tsecr) ? 3 : 0;
 769
 770                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 771                                           (TCPOPT_NOP << 16) |
 772                                           (TCPOPT_MD5SIG << 8) |
 773                                           TCPOLEN_MD5SIG);
 774                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 775                 rep.th.doff = arg.iov[0].iov_len/4;
 776
 777                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 778                                     key, ip_hdr(skb)->saddr,
 779                                     ip_hdr(skb)->daddr, &rep.th);
 780         }
 781 #endif
 782         arg.flags = reply_flags;
 783         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 784                                       ip_hdr(skb)->saddr, /* XXX */
 785                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 786         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 787         if (oif)
 788                 arg.bound_dev_if = oif;
 789         arg.tos = tos;
 790         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
 791         local_bh_disable();
 792         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
 793                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
 794                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 795                               &arg, arg.iov[0].iov_len);
 796
 797         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 798         local_bh_enable();
 799 }
 800
 801 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 802 {
 803         struct inet_timewait_sock *tw = inet_twsk(sk);
 804         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 805
 806         tcp_v4_send_ack(sk, skb,
 807                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 808                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 809                         tcp_time_stamp + tcptw->tw_ts_offset,
 810                         tcptw->tw_ts_recent,
 811                         tw->tw_bound_dev_if,
 812                         tcp_twsk_md5_key(tcptw),
 813                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
 814                         tw->tw_tos
 815                         );
 816
 817         inet_twsk_put(tw);
 818 }
 819
 820 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 821                                   struct request_sock *req)
 822 {
 823         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 824          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
 825          */
 826         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
 827                                              tcp_sk(sk)->snd_nxt;
 828
 829         /* RFC 7323 2.3
 830          * The window field (SEG.WND) of every outgoing segment, with the
 831          * exception of <SYN> segments, MUST be right-shifted by
 832          * Rcv.Wind.Shift bits:
 833          */
 834         tcp_v4_send_ack(sk, skb, seq,
 835                         tcp_rsk(req)->rcv_nxt,
 836                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
 837                         tcp_time_stamp + tcp_rsk(req)->ts_off,
 838                         req->ts_recent,
 839                         0,
 840                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
 841                                           AF_INET),
 842                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 843                         ip_hdr(skb)->tos);
 844 }
 845
 846 /*
 847  *      Send a SYN-ACK after having received a SYN.
 848  *      This still operates on a request_sock only, not on a big
 849  *      socket.
 850  */
 851 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
 852                               struct flowi *fl,
 853                               struct request_sock *req,
 854                               struct tcp_fastopen_cookie *foc,
 855                               enum tcp_synack_type synack_type)
 856 {
 857         const struct inet_request_sock *ireq = inet_rsk(req);
 858         struct flowi4 fl4;
 859         int err = -1;
 860         struct sk_buff *skb;
 861
 862         /* First, grab a route. */
 863         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 864                 return -1;
 865
 866         skb = tcp_make_synack(sk, dst, req, foc, synack_type);
 867
 868         if (skb) {
 869                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
 870
 871                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
 872                                             ireq->ir_rmt_addr,
 873                                             ireq->opt);
 874                 err = net_xmit_eval(err);
 875         }
 876
 877         return err;
 878 }
 879
 880 /*
 881  *      IPv4 request_sock destructor.
 882  */
 883 static void tcp_v4_reqsk_destructor(struct request_sock *req)
 884 {
 885         kfree(inet_rsk(req)->opt);
 886 }
 887
 888 #ifdef CONFIG_TCP_MD5SIG
 889 /*
 890  * RFC2385 MD5 checksumming requires a mapping of
 891  * IP address->MD5 Key.
 892  * We need to maintain these in the sk structure.
 893  */
 894
 895 /* Find the Key structure for an address.  */
 896 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
 897                                          const union tcp_md5_addr *addr,
 898                                          int family)
 899 {
 900         const struct tcp_sock *tp = tcp_sk(sk);
 901         struct tcp_md5sig_key *key;
 902         unsigned int size = sizeof(struct in_addr);
 903         const struct tcp_md5sig_info *md5sig;
 904
 905         /* caller either holds rcu_read_lock() or socket lock */
 906         md5sig = rcu_dereference_check(tp->md5sig_info,
 907                                        lockdep_sock_is_held(sk));
 908         if (!md5sig)
 909                 return NULL;
 910 #if IS_ENABLED(CONFIG_IPV6)
 911         if (family == AF_INET6)
 912                 size = sizeof(struct in6_addr);
 913 #endif
 914         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
 915                 if (key->family != family)
 916                         continue;
 917                 if (!memcmp(&key->addr, addr, size))
 918                         return key;
 919         }
 920         return NULL;
 921 }
 922 EXPORT_SYMBOL(tcp_md5_do_lookup);
 923
 924 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
 925                                          const struct sock *addr_sk)
 926 {
 927         const union tcp_md5_addr *addr;
 928
 929         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
 930         return tcp_md5_do_lookup(sk, addr, AF_INET);
 931 }
 932 EXPORT_SYMBOL(tcp_v4_md5_lookup);
 933
 934 /* This can be called on a newly created socket, from other files */
 935 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
 936                    int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
 937 {
 938         /* Add Key to the list */
 939         struct tcp_md5sig_key *key;
 940         struct tcp_sock *tp = tcp_sk(sk);
 941         struct tcp_md5sig_info *md5sig;
 942
 943         key = tcp_md5_do_lookup(sk, addr, family);
 944         if (key) {
 945                 /* Pre-existing entry - just update that one. */
 946                 memcpy(key->key, newkey, newkeylen);
 947                 key->keylen = newkeylen;
 948                 return 0;
 949         }
 950
 951         md5sig = rcu_dereference_protected(tp->md5sig_info,
 952                                            lockdep_sock_is_held(sk));
 953         if (!md5sig) {
 954                 md5sig = kmalloc(sizeof(*md5sig), gfp);
 955                 if (!md5sig)
 956                         return -ENOMEM;
 957
 958                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
 959                 INIT_HLIST_HEAD(&md5sig->head);
 960                 rcu_assign_pointer(tp->md5sig_info, md5sig);
 961         }
 962
 963         key = sock_kmalloc(sk, sizeof(*key), gfp);
 964         if (!key)
 965                 return -ENOMEM;
 966         if (!tcp_alloc_md5sig_pool()) {
 967                 sock_kfree_s(sk, key, sizeof(*key));
 968                 return -ENOMEM;
 969         }
 970
 971         memcpy(key->key, newkey, newkeylen);
 972         key->keylen = newkeylen;
 973         key->family = family;
 974         memcpy(&key->addr, addr,
 975                (family == AF_INET6) ? sizeof(struct in6_addr) :
 976                                       sizeof(struct in_addr));
 977         hlist_add_head_rcu(&key->node, &md5sig->head);
 978         return 0;
 979 }
 980 EXPORT_SYMBOL(tcp_md5_do_add);
 981
 982 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
 983 {
 984         struct tcp_md5sig_key *key;
 985
 986         key = tcp_md5_do_lookup(sk, addr, family);
 987         if (!key)
 988                 return -ENOENT;
 989         hlist_del_rcu(&key->node);
 990         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
 991         kfree_rcu(key, rcu);
 992         return 0;
 993 }
 994 EXPORT_SYMBOL(tcp_md5_do_del);
 995
 996 static void tcp_clear_md5_list(struct sock *sk)
 997 {
 998         struct tcp_sock *tp = tcp_sk(sk);
 999         struct tcp_md5sig_key *key;
1000         struct hlist_node *n;
1001         struct tcp_md5sig_info *md5sig;
1002
1003         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1004
1005         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1006                 hlist_del_rcu(&key->node);
1007                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1008                 kfree_rcu(key, rcu);
1009         }
1010 }
1011
1012 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1013                                  int optlen)
1014 {
1015         struct tcp_md5sig cmd;
1016         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1017
1018         if (optlen < sizeof(cmd))
1019                 return -EINVAL;
1020
1021         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1022                 return -EFAULT;
1023
1024         if (sin->sin_family != AF_INET)
1025                 return -EINVAL;
1026
1027         if (!cmd.tcpm_keylen)
1028                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1029                                       AF_INET);
1030
1031         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1032                 return -EINVAL;
1033
1034         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1035                               AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1036                               GFP_KERNEL);
1037 }
1038
1039 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1040                                    __be32 daddr, __be32 saddr,
1041                                    const struct tcphdr *th, int nbytes)
1042 {
1043         struct tcp4_pseudohdr *bp;
1044         struct scatterlist sg;
1045         struct tcphdr *_th;
1046
1047         bp = hp->scratch;
1048         bp->saddr = saddr;
1049         bp->daddr = daddr;
1050         bp->pad = 0;
1051         bp->protocol = IPPROTO_TCP;
1052         bp->len = cpu_to_be16(nbytes);
1053
1054         _th = (struct tcphdr *)(bp + 1);
1055         memcpy(_th, th, sizeof(*th));
1056         _th->check = 0;
1057
1058         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1059         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1060                                 sizeof(*bp) + sizeof(*th));
1061         return crypto_ahash_update(hp->md5_req);
1062 }
1063
1064 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1065                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1066 {
1067         struct tcp_md5sig_pool *hp;
1068         struct ahash_request *req;
1069
1070         hp = tcp_get_md5sig_pool();
1071         if (!hp)
1072                 goto clear_hash_noput;
1073         req = hp->md5_req;
1074
1075         if (crypto_ahash_init(req))
1076                 goto clear_hash;
1077         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1078                 goto clear_hash;
1079         if (tcp_md5_hash_key(hp, key))
1080                 goto clear_hash;
1081         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1082         if (crypto_ahash_final(req))
1083                 goto clear_hash;
1084
1085         tcp_put_md5sig_pool();
1086         return 0;
1087
1088 clear_hash:
1089         tcp_put_md5sig_pool();
1090 clear_hash_noput:
1091         memset(md5_hash, 0, 16);
1092         return 1;
1093 }
1094
1095 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1096                         const struct sock *sk,
1097                         const struct sk_buff *skb)
1098 {
1099         struct tcp_md5sig_pool *hp;
1100         struct ahash_request *req;
1101         const struct tcphdr *th = tcp_hdr(skb);
1102         __be32 saddr, daddr;
1103
1104         if (sk) { /* valid for establish/request sockets */
1105                 saddr = sk->sk_rcv_saddr;
1106                 daddr = sk->sk_daddr;
1107         } else {
1108                 const struct iphdr *iph = ip_hdr(skb);
1109                 saddr = iph->saddr;
1110                 daddr = iph->daddr;
1111         }
1112
1113         hp = tcp_get_md5sig_pool();
1114         if (!hp)
1115                 goto clear_hash_noput;
1116         req = hp->md5_req;
1117
1118         if (crypto_ahash_init(req))
1119                 goto clear_hash;
1120
1121         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1122                 goto clear_hash;
1123         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1124                 goto clear_hash;
1125         if (tcp_md5_hash_key(hp, key))
1126                 goto clear_hash;
1127         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1128         if (crypto_ahash_final(req))
1129                 goto clear_hash;
1130
1131         tcp_put_md5sig_pool();
1132         return 0;
1133
1134 clear_hash:
1135         tcp_put_md5sig_pool();
1136 clear_hash_noput:
1137         memset(md5_hash, 0, 16);
1138         return 1;
1139 }
1140 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1141
1142 #endif
1143
1144 /* Called with rcu_read_lock() */
1145 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1146                                     const struct sk_buff *skb)
1147 {
1148 #ifdef CONFIG_TCP_MD5SIG
1149         /*
1150          * This gets called for each TCP segment that arrives
1151          * so we want to be efficient.
1152          * We have 3 drop cases:
1153          * o No MD5 hash and one expected.
1154          * o MD5 hash and we're not expecting one.
1155          * o MD5 hash and its wrong.
1156          */
1157         const __u8 *hash_location = NULL;
1158         struct tcp_md5sig_key *hash_expected;
1159         const struct iphdr *iph = ip_hdr(skb);
1160         const struct tcphdr *th = tcp_hdr(skb);
1161         int genhash;
1162         unsigned char newhash[16];
1163
1164         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1165                                           AF_INET);
1166         hash_location = tcp_parse_md5sig_option(th);
1167
1168         /* We've parsed the options - do we have a hash? */
1169         if (!hash_expected && !hash_location)
1170                 return false;
1171
1172         if (hash_expected && !hash_location) {
1173                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1174                 return true;
1175         }
1176
1177         if (!hash_expected && hash_location) {
1178                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1179                 return true;
1180         }
1181
1182         /* Okay, so this is hash_expected and hash_location -
1183          * so we need to calculate the checksum.
1184          */
1185         genhash = tcp_v4_md5_hash_skb(newhash,
1186                                       hash_expected,
1187                                       NULL, skb);
1188
1189         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1190                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1191                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1192                                      &iph->saddr, ntohs(th->source),
1193                                      &iph->daddr, ntohs(th->dest),
1194                                      genhash ? " tcp_v4_calc_md5_hash failed"
1195                                      : "");
1196                 return true;
1197         }
1198         return false;
1199 #endif
1200         return false;
1201 }
1202
1203 static void tcp_v4_init_req(struct request_sock *req,
1204                             const struct sock *sk_listener,
1205                             struct sk_buff *skb)
1206 {
1207         struct inet_request_sock *ireq = inet_rsk(req);
1208
1209         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1210         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1211         ireq->opt = tcp_v4_save_options(skb);
1212 }
1213
1214 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1215                                           struct flowi *fl,
1216                                           const struct request_sock *req,
1217                                           bool *strict)
1218 {
1219         struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1220
1221         if (strict) {
1222                 if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1223                         *strict = true;
1224                 else
1225                         *strict = false;
1226         }
1227
1228         return dst;
1229 }
1230
1231 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1232         .family         =       PF_INET,
1233         .obj_size       =       sizeof(struct tcp_request_sock),
1234         .rtx_syn_ack    =       tcp_rtx_synack,
1235         .send_ack       =       tcp_v4_reqsk_send_ack,
1236         .destructor     =       tcp_v4_reqsk_destructor,
1237         .send_reset     =       tcp_v4_send_reset,
1238         .syn_ack_timeout =      tcp_syn_ack_timeout,
1239 };
1240
1241 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1242         .mss_clamp      =       TCP_MSS_DEFAULT,
1243 #ifdef CONFIG_TCP_MD5SIG
1244         .req_md5_lookup =       tcp_v4_md5_lookup,
1245         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1246 #endif
1247         .init_req       =       tcp_v4_init_req,
1248 #ifdef CONFIG_SYN_COOKIES
1249         .cookie_init_seq =      cookie_v4_init_sequence,
1250 #endif
1251         .route_req      =       tcp_v4_route_req,
1252         .init_seq       =       tcp_v4_init_sequence,
1253         .send_synack    =       tcp_v4_send_synack,
1254 };
1255
1256 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1257 {
1258         /* Never answer to SYNs send to broadcast or multicast */
1259         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1260                 goto drop;
1261
1262         return tcp_conn_request(&tcp_request_sock_ops,
1263                                 &tcp_request_sock_ipv4_ops, sk, skb);
1264
1265 drop:
1266         tcp_listendrop(sk);
1267         return 0;
1268 }
1269 EXPORT_SYMBOL(tcp_v4_conn_request);
1270
1271
1272 /*
1273  * The three way handshake has completed - we got a valid synack -
1274  * now create the new socket.
1275  */
1276 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1277                                   struct request_sock *req,
1278                                   struct dst_entry *dst,
1279                                   struct request_sock *req_unhash,
1280                                   bool *own_req)
1281 {
1282         struct inet_request_sock *ireq;
1283         struct inet_sock *newinet;
1284         struct tcp_sock *newtp;
1285         struct sock *newsk;
1286 #ifdef CONFIG_TCP_MD5SIG
1287         struct tcp_md5sig_key *key;
1288 #endif
1289         struct ip_options_rcu *inet_opt;
1290
1291         if (sk_acceptq_is_full(sk))
1292                 goto exit_overflow;
1293
1294         newsk = tcp_create_openreq_child(sk, req, skb);
1295         if (!newsk)
1296                 goto exit_nonewsk;
1297
1298         newsk->sk_gso_type = SKB_GSO_TCPV4;
1299         inet_sk_rx_dst_set(newsk, skb);
1300
1301         newtp                 = tcp_sk(newsk);
1302         newinet               = inet_sk(newsk);
1303         ireq                  = inet_rsk(req);
1304         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1305         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1306         newsk->sk_bound_dev_if = ireq->ir_iif;
1307         newinet->inet_saddr           = ireq->ir_loc_addr;
1308         inet_opt              = ireq->opt;
1309         rcu_assign_pointer(newinet->inet_opt, inet_opt);
1310         ireq->opt             = NULL;
1311         newinet->mc_index     = inet_iif(skb);
1312         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1313         newinet->rcv_tos      = ip_hdr(skb)->tos;
1314         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1315         if (inet_opt)
1316                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1317         newinet->inet_id = newtp->write_seq ^ jiffies;
1318
1319         if (!dst) {
1320                 dst = inet_csk_route_child_sock(sk, newsk, req);
1321                 if (!dst)
1322                         goto put_and_exit;
1323         } else {
1324                 /* syncookie case : see end of cookie_v4_check() */
1325         }
1326         sk_setup_caps(newsk, dst);
1327
1328         tcp_ca_openreq_child(newsk, dst);
1329
1330         tcp_sync_mss(newsk, dst_mtu(dst));
1331         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1332
1333         tcp_initialize_rcv_mss(newsk);
1334
1335 #ifdef CONFIG_TCP_MD5SIG
1336         /* Copy over the MD5 key from the original socket */
1337         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1338                                 AF_INET);
1339         if (key) {
1340                 /*
1341                  * We're using one, so create a matching key
1342                  * on the newsk structure. If we fail to get
1343                  * memory, then we end up not copying the key
1344                  * across. Shucks.
1345                  */
1346                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1347                                AF_INET, key->key, key->keylen, GFP_ATOMIC);
1348                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1349         }
1350 #endif
1351
1352         if (__inet_inherit_port(sk, newsk) < 0)
1353                 goto put_and_exit;
1354         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1355         if (*own_req)
1356                 tcp_move_syn(newtp, req);
1357
1358         return newsk;
1359
1360 exit_overflow:
1361         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1362 exit_nonewsk:
1363         dst_release(dst);
1364 exit:
1365         tcp_listendrop(sk);
1366         return NULL;
1367 put_and_exit:
1368         inet_csk_prepare_forced_close(newsk);
1369         tcp_done(newsk);
1370         goto exit;
1371 }
1372 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1373
1374 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1375 {
1376 #ifdef CONFIG_SYN_COOKIES
1377         const struct tcphdr *th = tcp_hdr(skb);
1378
1379         if (!th->syn)
1380                 sk = cookie_v4_check(sk, skb);
1381 #endif
1382         return sk;
1383 }
1384
1385 /* The socket must have it's spinlock held when we get
1386  * here, unless it is a TCP_LISTEN socket.
1387  *
1388  * We have a potential double-lock case here, so even when
1389  * doing backlog processing we use the BH locking scheme.
1390  * This is because we cannot sleep with the original spinlock
1391  * held.
1392  */
1393 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1394 {
1395         struct sock *rsk;
1396
1397         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1398                 struct dst_entry *dst = sk->sk_rx_dst;
1399
1400                 sock_rps_save_rxhash(sk, skb);
1401                 sk_mark_napi_id(sk, skb);
1402                 if (dst) {
1403                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1404                             !dst->ops->check(dst, 0)) {
1405                                 dst_release(dst);
1406                                 sk->sk_rx_dst = NULL;
1407                         }
1408                 }
1409                 tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1410                 return 0;
1411         }
1412
1413         if (tcp_checksum_complete(skb))
1414                 goto csum_err;
1415
1416         if (sk->sk_state == TCP_LISTEN) {
1417                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1418
1419                 if (!nsk)
1420                         goto discard;
1421                 if (nsk != sk) {
1422                         sock_rps_save_rxhash(nsk, skb);
1423                         sk_mark_napi_id(nsk, skb);
1424                         if (tcp_child_process(sk, nsk, skb)) {
1425                                 rsk = nsk;
1426                                 goto reset;
1427                         }
1428                         return 0;
1429                 }
1430         } else
1431                 sock_rps_save_rxhash(sk, skb);
1432
1433         if (tcp_rcv_state_process(sk, skb)) {
1434                 rsk = sk;
1435                 goto reset;
1436         }
1437         return 0;
1438
1439 reset:
1440         tcp_v4_send_reset(rsk, skb);
1441 discard:
1442         kfree_skb(skb);
1443         /* Be careful here. If this function gets more complicated and
1444          * gcc suffers from register pressure on the x86, sk (in %ebx)
1445          * might be destroyed here. This current version compiles correctly,
1446          * but you have been warned.
1447          */
1448         return 0;
1449
1450 csum_err:
1451         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1452         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1453         goto discard;
1454 }
1455 EXPORT_SYMBOL(tcp_v4_do_rcv);
1456
1457 void tcp_v4_early_demux(struct sk_buff *skb)
1458 {
1459         const struct iphdr *iph;
1460         const struct tcphdr *th;
1461         struct sock *sk;
1462
1463         if (skb->pkt_type != PACKET_HOST)
1464                 return;
1465
1466         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1467                 return;
1468
1469         iph = ip_hdr(skb);
1470         th = tcp_hdr(skb);
1471
1472         if (th->doff < sizeof(struct tcphdr) / 4)
1473                 return;
1474
1475         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1476                                        iph->saddr, th->source,
1477                                        iph->daddr, ntohs(th->dest),
1478                                        skb->skb_iif);
1479         if (sk) {
1480                 skb->sk = sk;
1481                 skb->destructor = sock_edemux;
1482                 if (sk_fullsock(sk)) {
1483                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1484
1485                         if (dst)
1486                                 dst = dst_check(dst, 0);
1487                         if (dst &&
1488                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1489                                 skb_dst_set_noref(skb, dst);
1490                 }
1491         }
1492 }
1493
1494 /* Packet is added to VJ-style prequeue for processing in process
1495  * context, if a reader task is waiting. Apparently, this exciting
1496  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1497  * failed somewhere. Latency? Burstiness? Well, at least now we will
1498  * see, why it failed. 8)8)                               --ANK
1499  *
1500  */
1501 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1502 {
1503         struct tcp_sock *tp = tcp_sk(sk);
1504
1505         if (sysctl_tcp_low_latency || !tp->ucopy.task)
1506                 return false;
1507
1508         if (skb->len <= tcp_hdrlen(skb) &&
1509             skb_queue_len(&tp->ucopy.prequeue) == 0)
1510                 return false;
1511
1512         /* Before escaping RCU protected region, we need to take care of skb
1513          * dst. Prequeue is only enabled for established sockets.
1514          * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
1515          * Instead of doing full sk_rx_dst validity here, let's perform
1516          * an optimistic check.
1517          */
1518         if (likely(sk->sk_rx_dst))
1519                 skb_dst_drop(skb);
1520         else
1521                 skb_dst_force_safe(skb);
1522
1523         __skb_queue_tail(&tp->ucopy.prequeue, skb);
1524         tp->ucopy.memory += skb->truesize;
1525         if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
1526             tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
1527                 struct sk_buff *skb1;
1528
1529                 BUG_ON(sock_owned_by_user(sk));
1530                 __NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
1531                                 skb_queue_len(&tp->ucopy.prequeue));
1532
1533                 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1534                         sk_backlog_rcv(sk, skb1);
1535
1536                 tp->ucopy.memory = 0;
1537         } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1538                 wake_up_interruptible_sync_poll(sk_sleep(sk),
1539                                            POLLIN | POLLRDNORM | POLLRDBAND);
1540                 if (!inet_csk_ack_scheduled(sk))
1541                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1542                                                   (3 * tcp_rto_min(sk)) / 4,
1543                                                   TCP_RTO_MAX);
1544         }
1545         return true;
1546 }
1547 EXPORT_SYMBOL(tcp_prequeue);
1548
1549 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1550 {
1551         u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1552
1553         /* Only socket owner can try to collapse/prune rx queues
1554          * to reduce memory overhead, so add a little headroom here.
1555          * Few sockets backlog are possibly concurrently non empty.
1556          */
1557         limit += 64*1024;
1558
1559         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1560          * we can fix skb->truesize to its real value to avoid future drops.
1561          * This is valid because skb is not yet charged to the socket.
1562          * It has been noticed pure SACK packets were sometimes dropped
1563          * (if cooked by drivers without copybreak feature).
1564          */
1565         skb_condense(skb);
1566
1567         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1568                 bh_unlock_sock(sk);
1569                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1570                 return true;
1571         }
1572         return false;
1573 }
1574 EXPORT_SYMBOL(tcp_add_backlog);
1575
1576 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1577 {
1578         struct tcphdr *th = (struct tcphdr *)skb->data;
1579         unsigned int eaten = skb->len;
1580         int err;
1581
1582         err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1583         if (!err) {
1584                 eaten -= skb->len;
1585                 TCP_SKB_CB(skb)->end_seq -= eaten;
1586         }
1587         return err;
1588 }
1589 EXPORT_SYMBOL(tcp_filter);
1590
1591 /*
1592  *      From tcp_input.c
1593  */
1594
1595 int tcp_v4_rcv(struct sk_buff *skb)
1596 {
1597         struct net *net = dev_net(skb->dev);
1598         const struct iphdr *iph;
1599         const struct tcphdr *th;
1600         bool refcounted;
1601         struct sock *sk;
1602         int ret;
1603
1604         if (skb->pkt_type != PACKET_HOST)
1605                 goto discard_it;
1606
1607         /* Count it even if it's bad */
1608         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1609
1610         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1611                 goto discard_it;
1612
1613         th = (const struct tcphdr *)skb->data;
1614
1615         if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1616                 goto bad_packet;
1617         if (!pskb_may_pull(skb, th->doff * 4))
1618                 goto discard_it;
1619
1620         /* An explanation is required here, I think.
1621          * Packet length and doff are validated by header prediction,
1622          * provided case of th->doff==0 is eliminated.
1623          * So, we defer the checks. */
1624
1625         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1626                 goto csum_error;
1627
1628         th = (const struct tcphdr *)skb->data;
1629         iph = ip_hdr(skb);
1630         /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1631          * barrier() makes sure compiler wont play fool^Waliasing games.
1632          */
1633         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1634                 sizeof(struct inet_skb_parm));
1635         barrier();
1636
1637         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1638         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1639                                     skb->len - th->doff * 4);
1640         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1641         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1642         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1643         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1644         TCP_SKB_CB(skb)->sacked  = 0;
1645
1646 lookup:
1647         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1648                                th->dest, &refcounted);
1649         if (!sk)
1650                 goto no_tcp_socket;
1651
1652 process:
1653         if (sk->sk_state == TCP_TIME_WAIT)
1654                 goto do_time_wait;
1655
1656         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1657                 struct request_sock *req = inet_reqsk(sk);
1658                 struct sock *nsk;
1659
1660                 sk = req->rsk_listener;
1661                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1662                         sk_drops_add(sk, skb);
1663                         reqsk_put(req);
1664                         goto discard_it;
1665                 }
1666                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1667                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1668                         goto lookup;
1669                 }
1670                 /* We own a reference on the listener, increase it again
1671                  * as we might lose it too soon.
1672                  */
1673                 sock_hold(sk);
1674                 refcounted = true;
1675                 nsk = tcp_check_req(sk, skb, req, false);
1676                 if (!nsk) {
1677                         reqsk_put(req);
1678                         goto discard_and_relse;
1679                 }
1680                 if (nsk == sk) {
1681                         reqsk_put(req);
1682                 } else if (tcp_child_process(sk, nsk, skb)) {
1683                         tcp_v4_send_reset(nsk, skb);
1684                         goto discard_and_relse;
1685                 } else {
1686                         sock_put(sk);
1687                         return 0;
1688                 }
1689         }
1690         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1691                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1692                 goto discard_and_relse;
1693         }
1694
1695         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1696                 goto discard_and_relse;
1697
1698         if (tcp_v4_inbound_md5_hash(sk, skb))
1699                 goto discard_and_relse;
1700
1701         nf_reset(skb);
1702
1703         if (tcp_filter(sk, skb))
1704                 goto discard_and_relse;
1705         th = (const struct tcphdr *)skb->data;
1706         iph = ip_hdr(skb);
1707
1708         skb->dev = NULL;
1709
1710         if (sk->sk_state == TCP_LISTEN) {
1711                 ret = tcp_v4_do_rcv(sk, skb);
1712                 goto put_and_return;
1713         }
1714
1715         sk_incoming_cpu_update(sk);
1716
1717         bh_lock_sock_nested(sk);
1718         tcp_segs_in(tcp_sk(sk), skb);
1719         ret = 0;
1720         if (!sock_owned_by_user(sk)) {
1721                 if (!tcp_prequeue(sk, skb))
1722                         ret = tcp_v4_do_rcv(sk, skb);
1723         } else if (tcp_add_backlog(sk, skb)) {
1724                 goto discard_and_relse;
1725         }
1726         bh_unlock_sock(sk);
1727
1728 put_and_return:
1729         if (refcounted)
1730                 sock_put(sk);
1731
1732         return ret;
1733
1734 no_tcp_socket:
1735         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1736                 goto discard_it;
1737
1738         if (tcp_checksum_complete(skb)) {
1739 csum_error:
1740                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1741 bad_packet:
1742                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1743         } else {
1744                 tcp_v4_send_reset(NULL, skb);
1745         }
1746
1747 discard_it:
1748         /* Discard frame. */
1749         kfree_skb(skb);
1750         return 0;
1751
1752 discard_and_relse:
1753         sk_drops_add(sk, skb);
1754         if (refcounted)
1755                 sock_put(sk);
1756         goto discard_it;
1757
1758 do_time_wait:
1759         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1760                 inet_twsk_put(inet_twsk(sk));
1761                 goto discard_it;
1762         }
1763
1764         if (tcp_checksum_complete(skb)) {
1765                 inet_twsk_put(inet_twsk(sk));
1766                 goto csum_error;
1767         }
1768         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1769         case TCP_TW_SYN: {
1770                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1771                                                         &tcp_hashinfo, skb,
1772                                                         __tcp_hdrlen(th),
1773                                                         iph->saddr, th->source,
1774                                                         iph->daddr, th->dest,
1775                                                         inet_iif(skb));
1776                 if (sk2) {
1777                         inet_twsk_deschedule_put(inet_twsk(sk));
1778                         sk = sk2;
1779                         refcounted = false;
1780                         goto process;
1781                 }
1782                 /* Fall through to ACK */
1783         }
1784         case TCP_TW_ACK:
1785                 tcp_v4_timewait_ack(sk, skb);
1786                 break;
1787         case TCP_TW_RST:
1788                 tcp_v4_send_reset(sk, skb);
1789                 inet_twsk_deschedule_put(inet_twsk(sk));
1790                 goto discard_it;
1791         case TCP_TW_SUCCESS:;
1792         }
1793         goto discard_it;
1794 }
1795
1796 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1797         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1798         .twsk_unique    = tcp_twsk_unique,
1799         .twsk_destructor= tcp_twsk_destructor,
1800 };
1801
1802 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1803 {
1804         struct dst_entry *dst = skb_dst(skb);
1805
1806         if (dst && dst_hold_safe(dst)) {
1807                 sk->sk_rx_dst = dst;
1808                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1809         }
1810 }
1811 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1812
1813 const struct inet_connection_sock_af_ops ipv4_specific = {
1814         .queue_xmit        = ip_queue_xmit,
1815         .send_check        = tcp_v4_send_check,
1816         .rebuild_header    = inet_sk_rebuild_header,
1817         .sk_rx_dst_set     = inet_sk_rx_dst_set,
1818         .conn_request      = tcp_v4_conn_request,
1819         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1820         .net_header_len    = sizeof(struct iphdr),
1821         .setsockopt        = ip_setsockopt,
1822         .getsockopt        = ip_getsockopt,
1823         .addr2sockaddr     = inet_csk_addr2sockaddr,
1824         .sockaddr_len      = sizeof(struct sockaddr_in),
1825 #ifdef CONFIG_COMPAT
1826         .compat_setsockopt = compat_ip_setsockopt,
1827         .compat_getsockopt = compat_ip_getsockopt,
1828 #endif
1829         .mtu_reduced       = tcp_v4_mtu_reduced,
1830 };
1831 EXPORT_SYMBOL(ipv4_specific);
1832
1833 #ifdef CONFIG_TCP_MD5SIG
1834 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1835         .md5_lookup             = tcp_v4_md5_lookup,
1836         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1837         .md5_parse              = tcp_v4_parse_md5_keys,
1838 };
1839 #endif
1840
1841 /* NOTE: A lot of things set to zero explicitly by call to
1842  *       sk_alloc() so need not be done here.
1843  */
1844 static int tcp_v4_init_sock(struct sock *sk)
1845 {
1846         struct inet_connection_sock *icsk = inet_csk(sk);
1847
1848         tcp_init_sock(sk);
1849
1850         icsk->icsk_af_ops = &ipv4_specific;
1851
1852 #ifdef CONFIG_TCP_MD5SIG
1853         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1854 #endif
1855
1856         return 0;
1857 }
1858
1859 void tcp_v4_destroy_sock(struct sock *sk)
1860 {
1861         struct tcp_sock *tp = tcp_sk(sk);
1862
1863         tcp_clear_xmit_timers(sk);
1864
1865         tcp_cleanup_congestion_control(sk);
1866
1867         /* Cleanup up the write buffer. */
1868         tcp_write_queue_purge(sk);
1869
1870         /* Cleans up our, hopefully empty, out_of_order_queue. */
1871         skb_rbtree_purge(&tp->out_of_order_queue);
1872
1873 #ifdef CONFIG_TCP_MD5SIG
1874         /* Clean up the MD5 key list, if any */
1875         if (tp->md5sig_info) {
1876                 tcp_clear_md5_list(sk);
1877                 kfree_rcu(tp->md5sig_info, rcu);
1878                 tp->md5sig_info = NULL;
1879         }
1880 #endif
1881
1882         /* Clean prequeue, it must be empty really */
1883         __skb_queue_purge(&tp->ucopy.prequeue);
1884
1885         /* Clean up a referenced TCP bind bucket. */
1886         if (inet_csk(sk)->icsk_bind_hash)
1887                 inet_put_port(sk);
1888
1889         BUG_ON(tp->fastopen_rsk);
1890
1891         /* If socket is aborted during connect operation */
1892         tcp_free_fastopen_req(tp);
1893         tcp_saved_syn_free(tp);
1894
1895         sk_sockets_allocated_dec(sk);
1896 }
1897 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1898
1899 #ifdef CONFIG_PROC_FS
1900 /* Proc filesystem TCP sock list dumping. */
1901
1902 /*
1903  * Get next listener socket follow cur.  If cur is NULL, get first socket
1904  * starting from bucket given in st->bucket; when st->bucket is zero the
1905  * very first socket in the hash table is returned.
1906  */
1907 static void *listening_get_next(struct seq_file *seq, void *cur)
1908 {
1909         struct tcp_iter_state *st = seq->private;
1910         struct net *net = seq_file_net(seq);
1911         struct inet_listen_hashbucket *ilb;
1912         struct sock *sk = cur;
1913
1914         if (!sk) {
1915 get_head:
1916                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1917                 spin_lock(&ilb->lock);
1918                 sk = sk_head(&ilb->head);
1919                 st->offset = 0;
1920                 goto get_sk;
1921         }
1922         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1923         ++st->num;
1924         ++st->offset;
1925
1926         sk = sk_next(sk);
1927 get_sk:
1928         sk_for_each_from(sk) {
1929                 if (!net_eq(sock_net(sk), net))
1930                         continue;
1931                 if (sk->sk_family == st->family)
1932                         return sk;
1933         }
1934         spin_unlock(&ilb->lock);
1935         st->offset = 0;
1936         if (++st->bucket < INET_LHTABLE_SIZE)
1937                 goto get_head;
1938         return NULL;
1939 }
1940
1941 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1942 {
1943         struct tcp_iter_state *st = seq->private;
1944         void *rc;
1945
1946         st->bucket = 0;
1947         st->offset = 0;
1948         rc = listening_get_next(seq, NULL);
1949
1950         while (rc && *pos) {
1951                 rc = listening_get_next(seq, rc);
1952                 --*pos;
1953         }
1954         return rc;
1955 }
1956
1957 static inline bool empty_bucket(const struct tcp_iter_state *st)
1958 {
1959         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1960 }
1961
1962 /*
1963  * Get first established socket starting from bucket given in st->bucket.
1964  * If st->bucket is zero, the very first socket in the hash is returned.
1965  */
1966 static void *established_get_first(struct seq_file *seq)
1967 {
1968         struct tcp_iter_state *st = seq->private;
1969         struct net *net = seq_file_net(seq);
1970         void *rc = NULL;
1971
1972         st->offset = 0;
1973         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1974                 struct sock *sk;
1975                 struct hlist_nulls_node *node;
1976                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1977
1978                 /* Lockless fast path for the common case of empty buckets */
1979                 if (empty_bucket(st))
1980                         continue;
1981
1982                 spin_lock_bh(lock);
1983                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1984                         if (sk->sk_family != st->family ||
1985                             !net_eq(sock_net(sk), net)) {
1986                                 continue;
1987                         }
1988                         rc = sk;
1989                         goto out;
1990                 }
1991                 spin_unlock_bh(lock);
1992         }
1993 out:
1994         return rc;
1995 }
1996
1997 static void *established_get_next(struct seq_file *seq, void *cur)
1998 {
1999         struct sock *sk = cur;
2000         struct hlist_nulls_node *node;
2001         struct tcp_iter_state *st = seq->private;
2002         struct net *net = seq_file_net(seq);
2003
2004         ++st->num;
2005         ++st->offset;
2006
2007         sk = sk_nulls_next(sk);
2008
2009         sk_nulls_for_each_from(sk, node) {
2010                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2011                         return sk;
2012         }
2013
2014         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2015         ++st->bucket;
2016         return established_get_first(seq);
2017 }
2018
2019 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2020 {
2021         struct tcp_iter_state *st = seq->private;
2022         void *rc;
2023
2024         st->bucket = 0;
2025         rc = established_get_first(seq);
2026
2027         while (rc && pos) {
2028                 rc = established_get_next(seq, rc);
2029                 --pos;
2030         }
2031         return rc;
2032 }
2033
2034 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2035 {
2036         void *rc;
2037         struct tcp_iter_state *st = seq->private;
2038
2039         st->state = TCP_SEQ_STATE_LISTENING;
2040         rc        = listening_get_idx(seq, &pos);
2041
2042         if (!rc) {
2043                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2044                 rc        = established_get_idx(seq, pos);
2045         }
2046
2047         return rc;
2048 }
2049
2050 static void *tcp_seek_last_pos(struct seq_file *seq)
2051 {
2052         struct tcp_iter_state *st = seq->private;
2053         int offset = st->offset;
2054         int orig_num = st->num;
2055         void *rc = NULL;
2056
2057         switch (st->state) {
2058         case TCP_SEQ_STATE_LISTENING:
2059                 if (st->bucket >= INET_LHTABLE_SIZE)
2060                         break;
2061                 st->state = TCP_SEQ_STATE_LISTENING;
2062                 rc = listening_get_next(seq, NULL);
2063                 while (offset-- && rc)
2064                         rc = listening_get_next(seq, rc);
2065                 if (rc)
2066                         break;
2067                 st->bucket = 0;
2068                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2069                 /* Fallthrough */
2070         case TCP_SEQ_STATE_ESTABLISHED:
2071                 if (st->bucket > tcp_hashinfo.ehash_mask)
2072                         break;
2073                 rc = established_get_first(seq);
2074                 while (offset-- && rc)
2075                         rc = established_get_next(seq, rc);
2076         }
2077
2078         st->num = orig_num;
2079
2080         return rc;
2081 }
2082
2083 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2084 {
2085         struct tcp_iter_state *st = seq->private;
2086         void *rc;
2087
2088         if (*pos && *pos == st->last_pos) {
2089                 rc = tcp_seek_last_pos(seq);
2090                 if (rc)
2091                         goto out;
2092         }
2093
2094         st->state = TCP_SEQ_STATE_LISTENING;
2095         st->num = 0;
2096         st->bucket = 0;
2097         st->offset = 0;
2098         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2099
2100 out:
2101         st->last_pos = *pos;
2102         return rc;
2103 }
2104
2105 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2106 {
2107         struct tcp_iter_state *st = seq->private;
2108         void *rc = NULL;
2109
2110         if (v == SEQ_START_TOKEN) {
2111                 rc = tcp_get_idx(seq, 0);
2112                 goto out;
2113         }
2114
2115         switch (st->state) {
2116         case TCP_SEQ_STATE_LISTENING:
2117                 rc = listening_get_next(seq, v);
2118                 if (!rc) {
2119                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2120                         st->bucket = 0;
2121                         st->offset = 0;
2122                         rc        = established_get_first(seq);
2123                 }
2124                 break;
2125         case TCP_SEQ_STATE_ESTABLISHED:
2126                 rc = established_get_next(seq, v);
2127                 break;
2128         }
2129 out:
2130         ++*pos;
2131         st->last_pos = *pos;
2132         return rc;
2133 }
2134
2135 static void tcp_seq_stop(struct seq_file *seq, void *v)
2136 {
2137         struct tcp_iter_state *st = seq->private;
2138
2139         switch (st->state) {
2140         case TCP_SEQ_STATE_LISTENING:
2141                 if (v != SEQ_START_TOKEN)
2142                         spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2143                 break;
2144         case TCP_SEQ_STATE_ESTABLISHED:
2145                 if (v)
2146                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2147                 break;
2148         }
2149 }
2150
2151 int tcp_seq_open(struct inode *inode, struct file *file)
2152 {
2153         struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2154         struct tcp_iter_state *s;
2155         int err;
2156
2157         err = seq_open_net(inode, file, &afinfo->seq_ops,
2158                           sizeof(struct tcp_iter_state));
2159         if (err < 0)
2160                 return err;
2161
2162         s = ((struct seq_file *)file->private_data)->private;
2163         s->family               = afinfo->family;
2164         s->last_pos             = 0;
2165         return 0;
2166 }
2167 EXPORT_SYMBOL(tcp_seq_open);
2168
2169 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2170 {
2171         int rc = 0;
2172         struct proc_dir_entry *p;
2173
2174         afinfo->seq_ops.start           = tcp_seq_start;
2175         afinfo->seq_ops.next            = tcp_seq_next;
2176         afinfo->seq_ops.stop            = tcp_seq_stop;
2177
2178         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2179                              afinfo->seq_fops, afinfo);
2180         if (!p)
2181                 rc = -ENOMEM;
2182         return rc;
2183 }
2184 EXPORT_SYMBOL(tcp_proc_register);
2185
2186 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2187 {
2188         remove_proc_entry(afinfo->name, net->proc_net);
2189 }
2190 EXPORT_SYMBOL(tcp_proc_unregister);
2191
2192 static void get_openreq4(const struct request_sock *req,
2193                          struct seq_file *f, int i)
2194 {
2195         const struct inet_request_sock *ireq = inet_rsk(req);
2196         long delta = req->rsk_timer.expires - jiffies;
2197
2198         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2199                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2200                 i,
2201                 ireq->ir_loc_addr,
2202                 ireq->ir_num,
2203                 ireq->ir_rmt_addr,
2204                 ntohs(ireq->ir_rmt_port),
2205                 TCP_SYN_RECV,
2206                 0, 0, /* could print option size, but that is af dependent. */
2207                 1,    /* timers active (only the expire timer) */
2208                 jiffies_delta_to_clock_t(delta),
2209                 req->num_timeout,
2210                 from_kuid_munged(seq_user_ns(f),
2211                                  sock_i_uid(req->rsk_listener)),
2212                 0,  /* non standard timer */
2213                 0, /* open_requests have no inode */
2214                 0,
2215                 req);
2216 }
2217
2218 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2219 {
2220         int timer_active;
2221         unsigned long timer_expires;
2222         const struct tcp_sock *tp = tcp_sk(sk);
2223         const struct inet_connection_sock *icsk = inet_csk(sk);
2224         const struct inet_sock *inet = inet_sk(sk);
2225         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2226         __be32 dest = inet->inet_daddr;
2227         __be32 src = inet->inet_rcv_saddr;
2228         __u16 destp = ntohs(inet->inet_dport);
2229         __u16 srcp = ntohs(inet->inet_sport);
2230         int rx_queue;
2231         int state;
2232
2233         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2234             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2235             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2236                 timer_active    = 1;
2237                 timer_expires   = icsk->icsk_timeout;
2238         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2239                 timer_active    = 4;
2240                 timer_expires   = icsk->icsk_timeout;
2241         } else if (timer_pending(&sk->sk_timer)) {
2242                 timer_active    = 2;
2243                 timer_expires   = sk->sk_timer.expires;
2244         } else {
2245                 timer_active    = 0;
2246                 timer_expires = jiffies;
2247         }
2248
2249         state = sk_state_load(sk);
2250         if (state == TCP_LISTEN)
2251                 rx_queue = sk->sk_ack_backlog;
2252         else
2253                 /* Because we don't lock the socket,
2254                  * we might find a transient negative value.
2255                  */
2256                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2257
2258         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2259                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2260                 i, src, srcp, dest, destp, state,
2261                 tp->write_seq - tp->snd_una,
2262                 rx_queue,
2263                 timer_active,
2264                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2265                 icsk->icsk_retransmits,
2266                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2267                 icsk->icsk_probes_out,
2268                 sock_i_ino(sk),
2269                 atomic_read(&sk->sk_refcnt), sk,
2270                 jiffies_to_clock_t(icsk->icsk_rto),
2271                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2272                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2273                 tp->snd_cwnd,
2274                 state == TCP_LISTEN ?
2275                     fastopenq->max_qlen :
2276                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2277 }
2278
2279 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2280                                struct seq_file *f, int i)
2281 {
2282         long delta = tw->tw_timer.expires - jiffies;
2283         __be32 dest, src;
2284         __u16 destp, srcp;
2285
2286         dest  = tw->tw_daddr;
2287         src   = tw->tw_rcv_saddr;
2288         destp = ntohs(tw->tw_dport);
2289         srcp  = ntohs(tw->tw_sport);
2290
2291         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2292                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2293                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2294                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2295                 atomic_read(&tw->tw_refcnt), tw);
2296 }
2297
2298 #define TMPSZ 150
2299
2300 static int tcp4_seq_show(struct seq_file *seq, void *v)
2301 {
2302         struct tcp_iter_state *st;
2303         struct sock *sk = v;
2304
2305         seq_setwidth(seq, TMPSZ - 1);
2306         if (v == SEQ_START_TOKEN) {
2307                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2308                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2309                            "inode");
2310                 goto out;
2311         }
2312         st = seq->private;
2313
2314         if (sk->sk_state == TCP_TIME_WAIT)
2315                 get_timewait4_sock(v, seq, st->num);
2316         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2317                 get_openreq4(v, seq, st->num);
2318         else
2319                 get_tcp4_sock(v, seq, st->num);
2320 out:
2321         seq_pad(seq, '\n');
2322         return 0;
2323 }
2324
2325 static const struct file_operations tcp_afinfo_seq_fops = {
2326         .owner   = THIS_MODULE,
2327         .open    = tcp_seq_open,
2328         .read    = seq_read,
2329         .llseek  = seq_lseek,
2330         .release = seq_release_net
2331 };
2332
2333 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2334         .name           = "tcp",
2335         .family         = AF_INET,
2336         .seq_fops       = &tcp_afinfo_seq_fops,
2337         .seq_ops        = {
2338                 .show           = tcp4_seq_show,
2339         },
2340 };
2341
2342 static int __net_init tcp4_proc_init_net(struct net *net)
2343 {
2344         return tcp_proc_register(net, &tcp4_seq_afinfo);
2345 }
2346
2347 static void __net_exit tcp4_proc_exit_net(struct net *net)
2348 {
2349         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2350 }
2351
2352 static struct pernet_operations tcp4_net_ops = {
2353         .init = tcp4_proc_init_net,
2354         .exit = tcp4_proc_exit_net,
2355 };
2356
2357 int __init tcp4_proc_init(void)
2358 {
2359         return register_pernet_subsys(&tcp4_net_ops);
2360 }
2361
2362 void tcp4_proc_exit(void)
2363 {
2364         unregister_pernet_subsys(&tcp4_net_ops);
2365 }
2366 #endif /* CONFIG_PROC_FS */
2367
2368 struct proto tcp_prot = {
2369         .name                   = "TCP",
2370         .owner                  = THIS_MODULE,
2371         .close                  = tcp_close,
2372         .connect                = tcp_v4_connect,
2373         .disconnect             = tcp_disconnect,
2374         .accept                 = inet_csk_accept,
2375         .ioctl                  = tcp_ioctl,
2376         .init                   = tcp_v4_init_sock,
2377         .destroy                = tcp_v4_destroy_sock,
2378         .shutdown               = tcp_shutdown,
2379         .setsockopt             = tcp_setsockopt,
2380         .getsockopt             = tcp_getsockopt,
2381         .keepalive              = tcp_set_keepalive,
2382         .recvmsg                = tcp_recvmsg,
2383         .sendmsg                = tcp_sendmsg,
2384         .sendpage               = tcp_sendpage,
2385         .backlog_rcv            = tcp_v4_do_rcv,
2386         .release_cb             = tcp_release_cb,
2387         .hash                   = inet_hash,
2388         .unhash                 = inet_unhash,
2389         .get_port               = inet_csk_get_port,
2390         .enter_memory_pressure  = tcp_enter_memory_pressure,
2391         .stream_memory_free     = tcp_stream_memory_free,
2392         .sockets_allocated      = &tcp_sockets_allocated,
2393         .orphan_count           = &tcp_orphan_count,
2394         .memory_allocated       = &tcp_memory_allocated,
2395         .memory_pressure        = &tcp_memory_pressure,
2396         .sysctl_mem             = sysctl_tcp_mem,
2397         .sysctl_wmem            = sysctl_tcp_wmem,
2398         .sysctl_rmem            = sysctl_tcp_rmem,
2399         .max_header             = MAX_TCP_HEADER,
2400         .obj_size               = sizeof(struct tcp_sock),
2401         .slab_flags             = SLAB_DESTROY_BY_RCU,
2402         .twsk_prot              = &tcp_timewait_sock_ops,
2403         .rsk_prot               = &tcp_request_sock_ops,
2404         .h.hashinfo             = &tcp_hashinfo,
2405         .no_autobind            = true,
2406 #ifdef CONFIG_COMPAT
2407         .compat_setsockopt      = compat_tcp_setsockopt,
2408         .compat_getsockopt      = compat_tcp_getsockopt,
2409 #endif
2410         .diag_destroy           = tcp_abort,
2411 };
2412 EXPORT_SYMBOL(tcp_prot);
2413
2414 static void __net_exit tcp_sk_exit(struct net *net)
2415 {
2416         int cpu;
2417
2418         for_each_possible_cpu(cpu)
2419                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2420         free_percpu(net->ipv4.tcp_sk);
2421 }
2422
2423 static int __net_init tcp_sk_init(struct net *net)
2424 {
2425         int res, cpu, cnt;
2426
2427         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2428         if (!net->ipv4.tcp_sk)
2429                 return -ENOMEM;
2430
2431         for_each_possible_cpu(cpu) {
2432                 struct sock *sk;
2433
2434                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2435                                            IPPROTO_TCP, net);
2436                 if (res)
2437                         goto fail;
2438                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2439                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2440         }
2441
2442         net->ipv4.sysctl_tcp_ecn = 2;
2443         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2444
2445         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2446         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2447         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2448
2449         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2450         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2451         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2452
2453         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2454         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2455         net->ipv4.sysctl_tcp_syncookies = 1;
2456         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2457         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2458         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2459         net->ipv4.sysctl_tcp_orphan_retries = 0;
2460         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2461         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2462         net->ipv4.sysctl_tcp_tw_reuse = 0;
2463
2464         cnt = tcp_hashinfo.ehash_mask + 1;
2465         net->ipv4.tcp_death_row.sysctl_tw_recycle = 0;
2466         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2467         net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2468
2469         net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2470
2471         return 0;
2472 fail:
2473         tcp_sk_exit(net);
2474
2475         return res;
2476 }
2477
2478 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2479 {
2480         inet_twsk_purge(&tcp_hashinfo, AF_INET);
2481 }
2482
2483 static struct pernet_operations __net_initdata tcp_sk_ops = {
2484        .init       = tcp_sk_init,
2485        .exit       = tcp_sk_exit,
2486        .exit_batch = tcp_sk_exit_batch,
2487 };
2488
2489 void __init tcp_v4_init(void)
2490 {
2491         if (register_pernet_subsys(&tcp_sk_ops))
2492                 panic("Failed to create the TCP control socket.\n");
2493 }