[INET]: Generalise the TCP sock ID lookup routines
net/ipv4/tcp_minisocks.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp_minisocks.c,v 1.15 2002/02/01 22:01:04 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
14  *              Florian La Roche, <flla@stud.uni-sb.de>
15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20  *              Jorge Cwik, <jorge@laser.satlink.net>
21  */
22
23 #include <linux/config.h>
24 #include <linux/mm.h>
25 #include <linux/module.h>
26 #include <linux/sysctl.h>
27 #include <linux/workqueue.h>
28 #include <net/tcp.h>
29 #include <net/inet_common.h>
30 #include <net/xfrm.h>
31
32 #ifdef CONFIG_SYSCTL
33 #define SYNC_INIT 0 /* let the user enable it */
34 #else
35 #define SYNC_INIT 1
36 #endif
37
38 int sysctl_tcp_tw_recycle;
39 int sysctl_tcp_max_tw_buckets = NR_FILE*2;
40
41 int sysctl_tcp_syncookies = SYNC_INIT; 
42 int sysctl_tcp_abort_on_overflow;
43
44 static void tcp_tw_schedule(struct inet_timewait_sock *tw, int timeo);
45
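/* Segment acceptability test, in the spirit of RFC 793's "SEGMENT ARRIVES"
 * checks: non-zero if [seq, end_seq) overlaps the receive window
 * [s_win, e_win), with the usual special cases for zero-length segments
 * and a zero-sized window.  For example, a bare ACK with
 * seq == end_seq == s_win is acceptable even when the window is closed.
 */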
46 static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
47 {
48         if (seq == s_win)
49                 return 1;
50         if (after(end_seq, s_win) && before(seq, e_win))
51                 return 1;
52         return (seq == e_win && seq == end_seq);
53 }
54
55 /* New-style handling of TIME_WAIT sockets. */
56
57 int tcp_tw_count;
58
59 /* 
60  * * The main purpose of the TIME-WAIT state is to close the connection gracefully,
61  *   when one of the ends sits in LAST-ACK or CLOSING retransmitting its FIN
62  *   (and, probably, a tail of data) and one or more of our ACKs are lost.
63  * * What is the TIME-WAIT timeout? It is associated with the maximal packet
64  *   lifetime in the internet, which leads to the wrong conclusion that
65  *   it is set to catch "old duplicate segments" wandering out of their path.
66  *   That is not quite correct. The timeout is calculated so that it exceeds
67  *   the maximal retransmission timeout by enough to cover the loss of one (or more)
68  *   segments sent by the peer and of our ACKs. This time may be calculated from the RTO.
69  * * When a TIME-WAIT socket receives an RST, it means that the other end has
70  *   finally closed and we are allowed to kill the TIME-WAIT state too.
71  * * The second purpose of TIME-WAIT is catching old duplicate segments.
72  *   Well, certainly it is pure paranoia, but if we load TIME-WAIT
73  *   with these semantics, we MUST NOT kill the TIME-WAIT state with RSTs.
74  * * If we invented some more clever way to catch duplicates
75  *   (e.g. based on PAWS), we could truncate TIME-WAIT to several RTOs.
76  *
77  * The algorithm below is based on a FORMAL INTERPRETATION of the RFCs.
78  * When you compare it to the RFCs, please read the section SEGMENT ARRIVES
79  * from the very beginning.
80  *
81  * NOTE. With recycling (and later with fin-wait-2) the TW bucket
82  * is _not_ stateless. It means that, strictly speaking, we must
83  * spinlock it. I do not want to! Well, the probability of misbehaviour
84  * is ridiculously low and, it seems, we could use some mb() tricks
85  * to avoid misreading sequence numbers, states etc.  --ANK
86  */
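/* Return values, roughly (the authoritative interpretation is the TIME-WAIT
 * switch in the callers, e.g. tcp_v4_rcv()):
 *   TCP_TW_SUCCESS - segment was consumed or ignored, caller just drops it;
 *   TCP_TW_RST     - caller should answer with a reset;
 *   TCP_TW_ACK     - caller should answer with an ACK;
 *   TCP_TW_SYN     - acceptable new SYN, caller may pass it on to a listening
 *                    socket to reopen the connection.
 */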
87 enum tcp_tw_status
88 tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
89                            const struct tcphdr *th)
90 {
91         struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
92         struct tcp_options_received tmp_opt;
93         int paws_reject = 0;
94
95         tmp_opt.saw_tstamp = 0;
96         if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
97                 tcp_parse_options(skb, &tmp_opt, 0);
98
99                 if (tmp_opt.saw_tstamp) {
100                         tmp_opt.ts_recent       = tcptw->tw_ts_recent;
101                         tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
102                         paws_reject = tcp_paws_check(&tmp_opt, th->rst);
103                 }
104         }
105
106         if (tw->tw_substate == TCP_FIN_WAIT2) {
107                 /* Just repeat all the checks of tcp_rcv_state_process() */
108
109                 /* Out of window, send ACK */
110                 if (paws_reject ||
111                     !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
112                                    tcptw->tw_rcv_nxt,
113                                    tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))
114                         return TCP_TW_ACK;
115
116                 if (th->rst)
117                         goto kill;
118
119                 if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt))
120                         goto kill_with_rst;
121
122                 /* Dup ACK? */
123                 if (!after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
124                     TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
125                         inet_twsk_put(tw);
126                         return TCP_TW_SUCCESS;
127                 }
128
129                 /* New data or FIN. If new data arrive after half-duplex close,
130                  * reset.
131                  */
132                 if (!th->fin ||
133                     TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
134 kill_with_rst:
135                         tcp_tw_deschedule(tw);
136                         inet_twsk_put(tw);
137                         return TCP_TW_RST;
138                 }
139
140                 /* FIN arrived, enter true time-wait state. */
141                 tw->tw_substate   = TCP_TIME_WAIT;
142                 tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
143                 if (tmp_opt.saw_tstamp) {
144                         tcptw->tw_ts_recent_stamp = xtime.tv_sec;
145                         tcptw->tw_ts_recent       = tmp_opt.rcv_tsval;
146                 }
147
148                 /* I am ashamed, but I failed to make it more elegant.
149                  * Yes, it is a direct reference to IP, which is impossible
150                  * to generalize to IPv6. Taking into account that IPv6
151                  * does not understand recycling in any case, it is not
152                  * a big problem in practice. --ANK */
153                 if (tw->tw_family == AF_INET &&
154                     sysctl_tcp_tw_recycle && tcptw->tw_ts_recent_stamp &&
155                     tcp_v4_tw_remember_stamp(tw))
156                         tcp_tw_schedule(tw, tw->tw_timeout);
157                 else
158                         tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
159                 return TCP_TW_ACK;
160         }
161
162         /*
163          *      Now real TIME-WAIT state.
164          *
165          *      RFC 1122:
166          *      "When a connection is [...] on TIME-WAIT state [...]
167          *      [a TCP] MAY accept a new SYN from the remote TCP to
168          *      reopen the connection directly, if it:
169          *      
170          *      (1)  assigns its initial sequence number for the new
171          *      connection to be larger than the largest sequence
172          *      number it used on the previous connection incarnation,
173          *      and
174          *
175          *      (2)  returns to TIME-WAIT state if the SYN turns out 
176          *      to be an old duplicate".
177          */
178
179         if (!paws_reject &&
180             (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
181              (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
182                 /* In window segment, it may be only reset or bare ack. */
183
184                 if (th->rst) {
185                         /* This is TIME_WAIT assassination, in two flavors.
186                          * Oh well... nobody has a sufficient solution to this
187                          * protocol bug yet.
188                          */
189                         if (sysctl_tcp_rfc1337 == 0) {
190 kill:
191                                 tcp_tw_deschedule(tw);
192                                 inet_twsk_put(tw);
193                                 return TCP_TW_SUCCESS;
194                         }
195                 }
196                 tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
197
198                 if (tmp_opt.saw_tstamp) {
199                         tcptw->tw_ts_recent       = tmp_opt.rcv_tsval;
200                         tcptw->tw_ts_recent_stamp = xtime.tv_sec;
201                 }
202
203                 inet_twsk_put(tw);
204                 return TCP_TW_SUCCESS;
205         }
206
207         /* Out of window segment.
208
209            All the segments are ACKed immediately.
210
211            The only exception is a new SYN. We accept it if it is
212            not an old duplicate and we are not in danger of being killed
213            by delayed old duplicates. The RFC check, that it carries a
214            newer sequence number, works at rates < 40 Mbit/sec.
215            However, if PAWS works, it is reliable and, even more,
216            we may relax the silly seq space cutoff.
217
218            RED-PEN: we violate the main RFC requirement: if this SYN turns out
219            to be an old duplicate (i.e. we receive an RST in reply to our SYN-ACK),
220            we must return the socket to time-wait state. It is not good,
221            but not fatal yet.
222          */
223
224         if (th->syn && !th->rst && !th->ack && !paws_reject &&
225             (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) ||
226              (tmp_opt.saw_tstamp &&
227               (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
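                /* Reopening from TIME-WAIT: pick an initial sequence number
                 * safely above anything the old incarnation could have used
                 * (its last snd_nxt plus a bit more than one maximal window).
                 * Zero appears to be treated by the callers as "no ISN
                 * override", so bump it to 1 in that case. */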
228                 u32 isn = tcptw->tw_snd_nxt + 65535 + 2;
229                 if (isn == 0)
230                         isn++;
231                 TCP_SKB_CB(skb)->when = isn;
232                 return TCP_TW_SYN;
233         }
234
235         if (paws_reject)
236                 NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
237
238         if (!th->rst) {
239                 /* In this case we must reset the TIMEWAIT timer.
240                  *
241                  * If it is ACKless SYN it may be both old duplicate
242                  * and new good SYN with random sequence number <rcv_nxt.
243                  * Do not reschedule in the last case.
244                  */
245                 if (paws_reject || th->ack)
246                         tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
247
248                 /* Send ACK. Note, we do not put the bucket,
249                  * it will be released by caller.
250                  */
251                 return TCP_TW_ACK;
252         }
253         inet_twsk_put(tw);
254         return TCP_TW_SUCCESS;
255 }
256
257 /* 
258  * Move a socket to time-wait or dead fin-wait-2 state.
259  */ 
260 void tcp_time_wait(struct sock *sk, int state, int timeo)
261 {
262         struct inet_timewait_sock *tw = NULL;
263         const struct tcp_sock *tp = tcp_sk(sk);
264         int recycle_ok = 0;
265
266         if (sysctl_tcp_tw_recycle && tp->rx_opt.ts_recent_stamp)
267                 recycle_ok = tp->af_specific->remember_stamp(sk);
268
269         if (tcp_tw_count < sysctl_tcp_max_tw_buckets)
270                 tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_slab, SLAB_ATOMIC);
271
272         if (tw != NULL) {
273                 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
274                 const struct inet_sock *inet = inet_sk(sk);
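                /* 4*RTO - RTO/2 = 3.5*RTO, the same "wait for two
                 * retransmitted FINs" rationale spelled out in
                 * tcp_tw_schedule() below. */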
275                 const int rto = (tp->rto << 2) - (tp->rto >> 1);
276
277                 /* Remember our protocol */
278                 tw->tw_prot             = sk->sk_prot_creator;
279
280                 /* Give us an identity. */
281                 tw->tw_daddr            = inet->daddr;
282                 tw->tw_rcv_saddr        = inet->rcv_saddr;
283                 tw->tw_bound_dev_if     = sk->sk_bound_dev_if;
284                 tw->tw_num              = inet->num;
285                 tw->tw_state            = TCP_TIME_WAIT;
286                 tw->tw_substate         = state;
287                 tw->tw_sport            = inet->sport;
288                 tw->tw_dport            = inet->dport;
289                 tw->tw_family           = sk->sk_family;
290                 tw->tw_reuse            = sk->sk_reuse;
291                 tw->tw_rcv_wscale       = tp->rx_opt.rcv_wscale;
292                 atomic_set(&tw->tw_refcnt, 1);
293
294                 tw->tw_hashent          = sk->sk_hashent;
295                 tcptw->tw_rcv_nxt       = tp->rcv_nxt;
296                 tcptw->tw_snd_nxt       = tp->snd_nxt;
297                 tcptw->tw_rcv_wnd       = tcp_receive_window(tp);
298                 tcptw->tw_ts_recent     = tp->rx_opt.ts_recent;
299                 tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
300                 inet_twsk_dead_node_init(tw);
301
302 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
303                 if (tw->tw_family == PF_INET6) {
304                         struct ipv6_pinfo *np = inet6_sk(sk);
305                         struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw);
306
307                         ipv6_addr_copy(&tcp6tw->tw_v6_daddr, &np->daddr);
308                         ipv6_addr_copy(&tcp6tw->tw_v6_rcv_saddr, &np->rcv_saddr);
309                         tw->tw_ipv6only = np->ipv6only;
310                 } else
311                         tw->tw_ipv6only = 0;
312 #endif
313                 /* Linkage updates. */
314                 __inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
315
316                 /* Get the TIME_WAIT timeout firing. */
317                 if (timeo < rto)
318                         timeo = rto;
319
320                 if (recycle_ok) {
321                         tw->tw_timeout = rto;
322                 } else {
323                         tw->tw_timeout = TCP_TIMEWAIT_LEN;
324                         if (state == TCP_TIME_WAIT)
325                                 timeo = TCP_TIMEWAIT_LEN;
326                 }
327
328                 tcp_tw_schedule(tw, timeo);
329                 inet_twsk_put(tw);
330         } else {
331                 /* Sorry, if we're out of memory, just CLOSE this
332                  * socket up.  We've got bigger problems than
333                  * non-graceful socket closings.
334                  */
335                 if (net_ratelimit())
336                         printk(KERN_INFO "TCP: time wait bucket table overflow\n");
337         }
338
339         tcp_update_metrics(sk);
340         tcp_done(sk);
341 }
342
343 /* Kill off TIME_WAIT sockets once their lifetime has expired. */
344 static int tcp_tw_death_row_slot;
345
346 static void tcp_twkill(unsigned long);
347
348 /* TIME_WAIT reaping mechanism. */
349 #define TCP_TWKILL_SLOTS        8       /* Please keep this a power of 2. */
350 #define TCP_TWKILL_PERIOD       (TCP_TIMEWAIT_LEN/TCP_TWKILL_SLOTS)
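/* With the usual TCP_TIMEWAIT_LEN of 60*HZ and 8 slots, each death-row slot
 * covers roughly 7.5 seconds of TIME-WAIT lifetime. */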
351
352 #define TCP_TWKILL_QUOTA        100
353
354 static struct hlist_head tcp_tw_death_row[TCP_TWKILL_SLOTS];
355 static DEFINE_SPINLOCK(tw_death_lock);
356 static struct timer_list tcp_tw_timer = TIMER_INITIALIZER(tcp_twkill, 0, 0);
357 static void twkill_work(void *);
358 static DECLARE_WORK(tcp_twkill_work, twkill_work, NULL);
359 static u32 twkill_thread_slots;
360
361 /* Returns non-zero if quota exceeded.  */
362 static int tcp_do_twkill_work(int slot, unsigned int quota)
363 {
364         struct inet_timewait_sock *tw;
365         struct hlist_node *node;
366         unsigned int killed;
367         int ret;
368
369         /* NOTE: compare this to the previous version, where the lock
370          * was released after detaching the chain. It was racy,
371          * because tw buckets are scheduled in a non-serialized context
372          * in 2.3 (with netfilter), and with softnet this is common, because
373          * soft irqs are not sequenced.
374          */
375         killed = 0;
376         ret = 0;
377 rescan:
378         inet_twsk_for_each_inmate(tw, node, &tcp_tw_death_row[slot]) {
379                 __inet_twsk_del_dead_node(tw);
380                 spin_unlock(&tw_death_lock);
381                 __inet_twsk_kill(tw, &tcp_hashinfo);
382                 inet_twsk_put(tw);
383                 killed++;
384                 spin_lock(&tw_death_lock);
385                 if (killed > quota) {
386                         ret = 1;
387                         break;
388                 }
389
390                 /* While we dropped tw_death_lock, another cpu may have
391                  * killed off the next TW bucket in the list, therefore
392                  * do a fresh re-read of the hlist head node with the
393                  * lock reacquired.  We still use the hlist traversal
394                  * macro in order to get the prefetches.
395                  */
396                 goto rescan;
397         }
398
399         tcp_tw_count -= killed;
400         NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed);
401
402         return ret;
403 }
404
405 static void tcp_twkill(unsigned long dummy)
406 {
407         int need_timer, ret;
408
409         spin_lock(&tw_death_lock);
410
411         if (tcp_tw_count == 0)
412                 goto out;
413
414         need_timer = 0;
415         ret = tcp_do_twkill_work(tcp_tw_death_row_slot, TCP_TWKILL_QUOTA);
416         if (ret) {
417                 twkill_thread_slots |= (1 << tcp_tw_death_row_slot);
418                 mb();
419                 schedule_work(&tcp_twkill_work);
420                 need_timer = 1;
421         } else {
422                 /* We purged the entire slot, anything left?  */
423                 if (tcp_tw_count)
424                         need_timer = 1;
425         }
426         tcp_tw_death_row_slot =
427                 ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));
428         if (need_timer)
429                 mod_timer(&tcp_tw_timer, jiffies + TCP_TWKILL_PERIOD);
430 out:
431         spin_unlock(&tw_death_lock);
432 }
433
434 extern void twkill_slots_invalid(void);
435
436 static void twkill_work(void *dummy)
437 {
438         int i;
439
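        /* Link-time sanity check: twkill_slots_invalid() is declared but never
         * defined, so if the compiler cannot prove this branch dead (i.e. the
         * slot bitmap no longer fits in twkill_thread_slots) the kernel fails
         * to link instead of silently misbehaving. */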
440         if ((TCP_TWKILL_SLOTS - 1) > (sizeof(twkill_thread_slots) * 8))
441                 twkill_slots_invalid();
442
443         while (twkill_thread_slots) {
444                 spin_lock_bh(&tw_death_lock);
445                 for (i = 0; i < TCP_TWKILL_SLOTS; i++) {
446                         if (!(twkill_thread_slots & (1 << i)))
447                                 continue;
448
449                         while (tcp_do_twkill_work(i, TCP_TWKILL_QUOTA) != 0) {
450                                 if (need_resched()) {
451                                         spin_unlock_bh(&tw_death_lock);
452                                         schedule();
453                                         spin_lock_bh(&tw_death_lock);
454                                 }
455                         }
456
457                         twkill_thread_slots &= ~(1 << i);
458                 }
459                 spin_unlock_bh(&tw_death_lock);
460         }
461 }
462
463 /* These are always called from BH context.  See callers in
464  * tcp_input.c to verify this.
465  */
466
467 /* This is for handling early-kills of TIME_WAIT sockets. */
468 void tcp_tw_deschedule(struct inet_timewait_sock *tw)
469 {
470         spin_lock(&tw_death_lock);
471         if (inet_twsk_del_dead_node(tw)) {
472                 inet_twsk_put(tw);
473                 if (--tcp_tw_count == 0)
474                         del_timer(&tcp_tw_timer);
475         }
476         spin_unlock(&tw_death_lock);
477         __inet_twsk_kill(tw, &tcp_hashinfo);
478 }
479
480 /* Short-time timewait calendar */
481
482 static int tcp_twcal_hand = -1;
483 static int tcp_twcal_jiffie;
484 static void tcp_twcal_tick(unsigned long);
485 static struct timer_list tcp_twcal_timer =
486                 TIMER_INITIALIZER(tcp_twcal_tick, 0, 0);
487 static struct hlist_head tcp_twcal_row[TCP_TW_RECYCLE_SLOTS];
488
489 static void tcp_tw_schedule(struct inet_timewait_sock *tw, const int timeo)
490 {
491         struct hlist_head *list;
492         int slot;
493
494         /* timeout := RTO * 3.5
495          *
496          * 3.5 = 1+2+0.5 to wait for two retransmits.
497          *
498          * RATIONALE: if a FIN arrived and we entered TIME-WAIT state,
499          * our ACK acking that FIN can be lost. If N subsequent retransmitted
500          * FINs (or previous segments) are lost, the probability of such an event
501          * is p^(N+1), where p is the probability of losing a single packet, and
502          * the time to detect the loss is about RTO*(2^N - 1) with exponential
503          * backoff. The normal timewait length is calculated so that we wait
504          * at least for one retransmitted FIN (the maximal RTO is 120 sec).
505          * [ BTW Linux, following BSD, violates this requirement, waiting
506          *   only for 60 sec; we should wait at least 240 secs.
507          *   Well, 240 consumes too many resources 8)
508          * ]
509          * This interval is not reduced, to catch old duplicates and
510          * responses to our wandering segments living for two MSLs.
511          * However, if we use PAWS to detect
512          * old duplicates, we can reduce the interval to bounds required
513          * by the RTO, rather than the MSL. So, if the peer understands PAWS, we
514          * kill the tw bucket after 3.5*RTO (it is important that this number
515          * is greater than the TS tick!) and detect old duplicates with the help
516          * of PAWS.
517          */
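        /* Rough example of the rounding below (figures are illustrative only;
         * the tick width depends on HZ): with a recycle tick of ~128 ms and
         * timeo = 3.5*RTO = 700 ms, slot = ceil(700/128) = 6, so the bucket
         * lands on the short recycle calendar.  A full 60 s TIME-WAIT rounds
         * past TCP_TW_RECYCLE_SLOTS and falls through to the slow death row
         * below. */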
518         slot = (timeo + (1<<TCP_TW_RECYCLE_TICK) - 1) >> TCP_TW_RECYCLE_TICK;
519
520         spin_lock(&tw_death_lock);
521
522         /* Unlink it, if it was scheduled */
523         if (inet_twsk_del_dead_node(tw))
524                 tcp_tw_count--;
525         else
526                 atomic_inc(&tw->tw_refcnt);
527
528         if (slot >= TCP_TW_RECYCLE_SLOTS) {
529                 /* Schedule to slow timer */
530                 if (timeo >= TCP_TIMEWAIT_LEN) {
531                         slot = TCP_TWKILL_SLOTS-1;
532                 } else {
533                         slot = (timeo + TCP_TWKILL_PERIOD-1) / TCP_TWKILL_PERIOD;
534                         if (slot >= TCP_TWKILL_SLOTS)
535                                 slot = TCP_TWKILL_SLOTS-1;
536                 }
537                 tw->tw_ttd = jiffies + timeo;
538                 slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1);
539                 list = &tcp_tw_death_row[slot];
540         } else {
541                 tw->tw_ttd = jiffies + (slot << TCP_TW_RECYCLE_TICK);
542
543                 if (tcp_twcal_hand < 0) {
544                         tcp_twcal_hand = 0;
545                         tcp_twcal_jiffie = jiffies;
546                         tcp_twcal_timer.expires = tcp_twcal_jiffie + (slot<<TCP_TW_RECYCLE_TICK);
547                         add_timer(&tcp_twcal_timer);
548                 } else {
549                         if (time_after(tcp_twcal_timer.expires, jiffies + (slot<<TCP_TW_RECYCLE_TICK)))
550                                 mod_timer(&tcp_twcal_timer, jiffies + (slot<<TCP_TW_RECYCLE_TICK));
551                         slot = (tcp_twcal_hand + slot)&(TCP_TW_RECYCLE_SLOTS-1);
552                 }
553                 list = &tcp_twcal_row[slot];
554         }
555
556         hlist_add_head(&tw->tw_death_node, list);
557
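        /* The first bucket added to an otherwise empty death row arms the
         * slow per-slot timer; tcp_twkill() keeps re-arming it for as long
         * as buckets remain. */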
558         if (tcp_tw_count++ == 0)
559                 mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD);
560         spin_unlock(&tw_death_lock);
561 }
562
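/* Timer handler for the short recycle calendar: walk the wheel starting at
 * the current hand, reap every slot whose time has come, and re-arm the
 * timer for the first future slot that still has entries. */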
563 void tcp_twcal_tick(unsigned long dummy)
564 {
565         int n, slot;
566         unsigned long j;
567         unsigned long now = jiffies;
568         int killed = 0;
569         int adv = 0;
570
571         spin_lock(&tw_death_lock);
572         if (tcp_twcal_hand < 0)
573                 goto out;
574
575         slot = tcp_twcal_hand;
576         j = tcp_twcal_jiffie;
577
578         for (n=0; n<TCP_TW_RECYCLE_SLOTS; n++) {
579                 if (time_before_eq(j, now)) {
580                         struct hlist_node *node, *safe;
581                         struct inet_timewait_sock *tw;
582
583                         inet_twsk_for_each_inmate_safe(tw, node, safe,
584                                                        &tcp_twcal_row[slot]) {
585                                 __inet_twsk_del_dead_node(tw);
586                                 __inet_twsk_kill(tw, &tcp_hashinfo);
587                                 inet_twsk_put(tw);
588                                 killed++;
589                         }
590                 } else {
591                         if (!adv) {
592                                 adv = 1;
593                                 tcp_twcal_jiffie = j;
594                                 tcp_twcal_hand = slot;
595                         }
596
597                         if (!hlist_empty(&tcp_twcal_row[slot])) {
598                                 mod_timer(&tcp_twcal_timer, j);
599                                 goto out;
600                         }
601                 }
602                 j += (1<<TCP_TW_RECYCLE_TICK);
603                 slot = (slot+1)&(TCP_TW_RECYCLE_SLOTS-1);
604         }
605         tcp_twcal_hand = -1;
606
607 out:
608         if ((tcp_tw_count -= killed) == 0)
609                 del_timer(&tcp_tw_timer);
610         NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed);
611         spin_unlock(&tw_death_lock);
612 }
613
614 /* This is not only more efficient than what we used to do, it eliminates
615  * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
616  *
617  * Actually, we could avoid a lot of memory writes here. The tp of the
618  * listening socket contains all the necessary default parameters.
619  */
620 struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
621 {
622         /* allocate the newsk from the same slab as the master sock;
623          * if not, at sk_free time we'll try to free it from the wrong
624          * slabcache (i.e. is it TCPv4 or v6?). This is handled through sk->sk_prot. -acme */
625         struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, sk->sk_prot, 0);
626
627         if (newsk != NULL) {
628                 struct inet_request_sock *ireq = inet_rsk(req);
629                 struct tcp_request_sock *treq = tcp_rsk(req);
630                 struct inet_sock *newinet = inet_sk(newsk);
631                 struct tcp_sock *newtp;
632                 struct sk_filter *filter;
633
634                 memcpy(newsk, sk, sizeof(struct tcp_sock));
635                 newsk->sk_state = TCP_SYN_RECV;
636
637                 /* SANITY */
638                 sk_node_init(&newsk->sk_node);
639                 newinet->bind_hash = NULL;
640
641                 /* Clone the TCP header template */
642                 newinet->dport = ireq->rmt_port;
643
644                 sock_lock_init(newsk);
645                 bh_lock_sock(newsk);
646
647                 rwlock_init(&newsk->sk_dst_lock);
648                 newsk->sk_dst_cache = NULL;
649                 atomic_set(&newsk->sk_rmem_alloc, 0);
650                 skb_queue_head_init(&newsk->sk_receive_queue);
651                 atomic_set(&newsk->sk_wmem_alloc, 0);
652                 skb_queue_head_init(&newsk->sk_write_queue);
653                 atomic_set(&newsk->sk_omem_alloc, 0);
654                 newsk->sk_wmem_queued = 0;
655                 newsk->sk_forward_alloc = 0;
656
657                 sock_reset_flag(newsk, SOCK_DONE);
658                 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
659                 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
660                 newsk->sk_send_head = NULL;
661                 rwlock_init(&newsk->sk_callback_lock);
662                 skb_queue_head_init(&newsk->sk_error_queue);
663                 newsk->sk_write_space = sk_stream_write_space;
664
665                 if ((filter = newsk->sk_filter) != NULL)
666                         sk_filter_charge(newsk, filter);
667
668                 if (unlikely(xfrm_sk_clone_policy(newsk))) {
669                         /* It is still a raw copy of the parent, so invalidate
670                          * the destructor and do a plain sk_free(). */
671                         newsk->sk_destruct = NULL;
672                         sk_free(newsk);
673                         return NULL;
674                 }
675
676                 /* Now setup tcp_sock */
677                 newtp = tcp_sk(newsk);
678                 newtp->pred_flags = 0;
679                 newtp->rcv_nxt = treq->rcv_isn + 1;
680                 newtp->snd_nxt = treq->snt_isn + 1;
681                 newtp->snd_una = treq->snt_isn + 1;
682                 newtp->snd_sml = treq->snt_isn + 1;
683
684                 tcp_prequeue_init(newtp);
685
686                 tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn);
687
688                 newtp->retransmits = 0;
689                 newtp->backoff = 0;
690                 newtp->srtt = 0;
691                 newtp->mdev = TCP_TIMEOUT_INIT;
692                 newtp->rto = TCP_TIMEOUT_INIT;
693
694                 newtp->packets_out = 0;
695                 newtp->left_out = 0;
696                 newtp->retrans_out = 0;
697                 newtp->sacked_out = 0;
698                 newtp->fackets_out = 0;
699                 newtp->snd_ssthresh = 0x7fffffff;
700
701                 /* So many TCP implementations out there (incorrectly) count the
702                  * initial SYN frame in their delayed-ACK and congestion control
703                  * algorithms that we must have the following bandaid to talk
704                  * efficiently to them.  -DaveM
705                  */
706                 newtp->snd_cwnd = 2;
707                 newtp->snd_cwnd_cnt = 0;
708
709                 newtp->frto_counter = 0;
710                 newtp->frto_highmark = 0;
711
712                 newtp->ca_ops = &tcp_reno;
713
714                 tcp_set_ca_state(newtp, TCP_CA_Open);
715                 tcp_init_xmit_timers(newsk);
716                 skb_queue_head_init(&newtp->out_of_order_queue);
717                 newtp->rcv_wup = treq->rcv_isn + 1;
718                 newtp->write_seq = treq->snt_isn + 1;
719                 newtp->pushed_seq = newtp->write_seq;
720                 newtp->copied_seq = treq->rcv_isn + 1;
721
722                 newtp->rx_opt.saw_tstamp = 0;
723
724                 newtp->rx_opt.dsack = 0;
725                 newtp->rx_opt.eff_sacks = 0;
726
727                 newtp->probes_out = 0;
728                 newtp->rx_opt.num_sacks = 0;
729                 newtp->urg_data = 0;
730                 /* Deinitialize accept_queue to trap illegal accesses. */
731                 memset(&newtp->accept_queue, 0, sizeof(newtp->accept_queue));
732
733                 /* Back to base struct sock members. */
734                 newsk->sk_err = 0;
735                 newsk->sk_priority = 0;
736                 atomic_set(&newsk->sk_refcnt, 2);
737
738                 /*
739                  * Increment the counter in the same struct proto as the master
740                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
741                  * is the same as sk->sk_prot->socks, as this field was copied
742                  * with memcpy), same rationale as the first comment in this
743                  * function.
744                  *
745                  * This _changes_ the previous behaviour, where
746                  * tcp_create_openreq_child always was incrementing the
747                  * equivalent to tcp_prot->socks (inet_sock_nr), so this has
748                  * to be taken into account in all callers. -acme
749                  */
750                 sk_refcnt_debug_inc(newsk);
751
752                 atomic_inc(&tcp_sockets_allocated);
753
754                 if (sock_flag(newsk, SOCK_KEEPOPEN))
755                         tcp_reset_keepalive_timer(newsk,
756                                                   keepalive_time_when(newtp));
757                 newsk->sk_socket = NULL;
758                 newsk->sk_sleep = NULL;
759
760                 newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
761                 if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) {
762                         if (sysctl_tcp_fack)
763                                 newtp->rx_opt.sack_ok |= 2;
764                 }
765                 newtp->window_clamp = req->window_clamp;
766                 newtp->rcv_ssthresh = req->rcv_wnd;
767                 newtp->rcv_wnd = req->rcv_wnd;
768                 newtp->rx_opt.wscale_ok = ireq->wscale_ok;
769                 if (newtp->rx_opt.wscale_ok) {
770                         newtp->rx_opt.snd_wscale = ireq->snd_wscale;
771                         newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
772                 } else {
773                         newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
774                         newtp->window_clamp = min(newtp->window_clamp, 65535U);
775                 }
776                 newtp->snd_wnd = ntohs(skb->h.th->window) << newtp->rx_opt.snd_wscale;
777                 newtp->max_window = newtp->snd_wnd;
778
779                 if (newtp->rx_opt.tstamp_ok) {
780                         newtp->rx_opt.ts_recent = req->ts_recent;
781                         newtp->rx_opt.ts_recent_stamp = xtime.tv_sec;
782                         newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
783                 } else {
784                         newtp->rx_opt.ts_recent_stamp = 0;
785                         newtp->tcp_header_len = sizeof(struct tcphdr);
786                 }
787                 if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len)
788                         newtp->ack.last_seg_size = skb->len-newtp->tcp_header_len;
789                 newtp->rx_opt.mss_clamp = req->mss;
790                 TCP_ECN_openreq_child(newtp, req);
791                 if (newtp->ecn_flags&TCP_ECN_OK)
792                         sock_set_flag(newsk, SOCK_NO_LARGESEND);
793
794                 TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS);
795         }
796         return newsk;
797 }
798
799 /* 
800  *      Process an incoming packet for SYN_RECV sockets represented
801  *      as a request_sock.
802  */
803
804 struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
805                            struct request_sock *req,
806                            struct request_sock **prev)
807 {
808         struct tcphdr *th = skb->h.th;
809         struct tcp_sock *tp = tcp_sk(sk);
810         u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
811         int paws_reject = 0;
812         struct tcp_options_received tmp_opt;
813         struct sock *child;
814
815         tmp_opt.saw_tstamp = 0;
816         if (th->doff > (sizeof(struct tcphdr)>>2)) {
817                 tcp_parse_options(skb, &tmp_opt, 0);
818
819                 if (tmp_opt.saw_tstamp) {
820                         tmp_opt.ts_recent = req->ts_recent;
821                         /* We do not store the true stamp, but it is not required;
822                          * it can be estimated (approximately)
823                          * from other data.
824                          */
825                         tmp_opt.ts_recent_stamp = xtime.tv_sec - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);
826                         paws_reject = tcp_paws_check(&tmp_opt, th->rst);
827                 }
828         }
829
830         /* Check for pure retransmitted SYN. */
831         if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
832             flg == TCP_FLAG_SYN &&
833             !paws_reject) {
834                 /*
835                  * RFC793 draws (Incorrectly! It was fixed in RFC1122)
836                  * this case on figure 6 and figure 8, but formal
837                  * protocol description says NOTHING.
838                  * To be more exact, it says that we should send ACK,
839                  * because this segment (at least, if it has no data)
840                  * is out of window.
841                  *
842                  *  CONCLUSION: RFC793 (even with RFC1122) DOES NOT
843                  *  describe SYN-RECV state. All the description
844                  *  is wrong, we cannot believe it and should
845                  *  rely only on common sense and implementation
846                  *  experience.
847                  *
848                  * Enforce "SYN-ACK" according to figure 8, figure 6
849                  * of RFC793, fixed by RFC1122.
850                  */
851                 req->rsk_ops->rtx_syn_ack(sk, req, NULL);
852                 return NULL;
853         }
854
855         /* The code below further reproduces the section "SEGMENT ARRIVES"
856            for the SYN-RECEIVED state of RFC793.
857            It is broken, however: the only case in which it does not work
858            is when SYNs are crossed.
859
860            You would think that SYN crossing is impossible here, since
861            we should have a SYN_SENT socket (from connect()) on our end,
862            but this is not true if the crossed SYNs were sent to both
863            ends by a malicious third party.  We must defend against this,
864            and to do that we first verify the ACK (as per RFC793, page
865            36) and reset if it is invalid.  Is this a true full defense?
866            To convince ourselves, let us consider a way in which the ACK
867            test can still pass in this 'malicious crossed SYNs' case.
868            Malicious sender sends identical SYNs (and thus identical sequence
869            numbers) to both A and B:
870
871                 A: gets SYN, seq=7
872                 B: gets SYN, seq=7
873
874            By our good fortune, both A and B select the same initial
875            send sequence number of seven :-)
876
877                 A: sends SYN|ACK, seq=7, ack_seq=8
878                 B: sends SYN|ACK, seq=7, ack_seq=8
879
880            So we are now A eating this SYN|ACK, ACK test passes.  So
881            does sequence test, SYN is truncated, and thus we consider
882            it a bare ACK.
883
884            If tp->defer_accept, we silently drop this bare ACK.  Otherwise,
885            we create an established connection.  Both ends (listening sockets)
886            accept the new incoming connection and try to talk to each other. 8-)
887
888            Note: This case is both harmless and rare.  The possibility is about the
889            same as us discovering intelligent life on another planet tomorrow.
890
891            But generally, we should (the RFC lies!) accept the ACK
892            of the SYNACK both here and in tcp_rcv_state_process().
893            tcp_rcv_state_process() does not, hence, we do not either.
894
895            Note that the case is absolutely generic:
896            we cannot optimize anything here without
897            violating protocol. All the checks must be made
898            before attempt to create socket.
899          */
900
901         /* RFC793 page 36: "If the connection is in any non-synchronized state ...
902          *                  and the incoming segment acknowledges something not yet
903          *                  sent (the segment carries an unacceptable ACK) ...
904          *                  a reset is sent."
905          *
906          * Invalid ACK: reset will be sent by listening socket
907          */
908         if ((flg & TCP_FLAG_ACK) &&
909             (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1))
910                 return sk;
911
912         /* Also, it would not be a bad idea to check rcv_tsecr, which
913          * is essentially an ACK extension; too-early or too-late values
914          * should cause a reset in unsynchronized states.
915          */
916
917         /* RFC793: "first check sequence number". */
918
919         if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
920                                           tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) {
921                 /* Out of window: send ACK and drop. */
922                 if (!(flg & TCP_FLAG_RST))
923                         req->rsk_ops->send_ack(skb, req);
924                 if (paws_reject)
925                         NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
926                 return NULL;
927         }
928
929         /* In sequence, PAWS is OK. */
930
931         if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1))
932                 req->ts_recent = tmp_opt.rcv_tsval;
933
934         if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
935                 /* Truncate SYN, it is out of window starting
936                    at tcp_rsk(req)->rcv_isn + 1. */
937                 flg &= ~TCP_FLAG_SYN;
938         }
939
940         /* RFC793: "second check the RST bit" and
941          *         "fourth, check the SYN bit"
942          */
943         if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN))
944                 goto embryonic_reset;
945
946         /* ACK sequence verified above, just make sure ACK is
947          * set.  If ACK not set, just silently drop the packet.
948          */
949         if (!(flg & TCP_FLAG_ACK))
950                 return NULL;
951
952         /* If TCP_DEFER_ACCEPT is set, drop the bare ACK. */
953         if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
954                 inet_rsk(req)->acked = 1;
955                 return NULL;
956         }
957
958         /* OK, the ACK is valid, create the big socket and
959          * feed this segment to it. It will repeat all
960          * the tests. THIS SEGMENT MUST MOVE THE SOCKET TO
961          * ESTABLISHED STATE. If it is dropped after the
962          * socket is created, expect trouble.
963          */
964         child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
965         if (child == NULL)
966                 goto listen_overflow;
967
968         tcp_synq_unlink(tp, req, prev);
969         tcp_synq_removed(sk, req);
970
971         tcp_acceptq_queue(sk, req, child);
972         return child;
973
974 listen_overflow:
975         if (!sysctl_tcp_abort_on_overflow) {
976                 inet_rsk(req)->acked = 1;
977                 return NULL;
978         }
979
980 embryonic_reset:
981         NET_INC_STATS_BH(LINUX_MIB_EMBRYONICRSTS);
982         if (!(flg & TCP_FLAG_RST))
983                 req->rsk_ops->send_reset(skb);
984
985         tcp_synq_drop(sk, req, prev);
986         return NULL;
987 }
988
989 /*
990  * Queue segment on the new socket if the new socket is active,
991  * otherwise we just shortcircuit this and continue with
992  * the new socket.
993  */
994
995 int tcp_child_process(struct sock *parent, struct sock *child,
996                       struct sk_buff *skb)
997 {
998         int ret = 0;
999         int state = child->sk_state;
1000
1001         if (!sock_owned_by_user(child)) {
1002                 ret = tcp_rcv_state_process(child, skb, skb->h.th, skb->len);
1003
1004                 /* Wakeup parent, send SIGIO */
1005                 if (state == TCP_SYN_RECV && child->sk_state != state)
1006                         parent->sk_data_ready(parent, 0);
1007         } else {
1008                 /* Alas, it is possible again, because we do the lookup
1009                  * in the main socket hash table and the lock on the listening
1010                  * socket no longer protects us.
1011                  */
1012                 sk_add_backlog(child, skb);
1013         }
1014
1015         bh_unlock_sock(child);
1016         sock_put(child);
1017         return ret;
1018 }
1019
1020 EXPORT_SYMBOL(tcp_check_req);
1021 EXPORT_SYMBOL(tcp_child_process);
1022 EXPORT_SYMBOL(tcp_create_openreq_child);
1023 EXPORT_SYMBOL(tcp_timewait_state_process);
1024 EXPORT_SYMBOL(tcp_tw_deschedule);