[INET]: Generalise the TCP sock ID lookup routines
net/ipv4/tcp_minisocks.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp_minisocks.c,v 1.15 2002/02/01 22:01:04 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
14  *              Florian La Roche, <flla@stud.uni-sb.de>
15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20  *              Jorge Cwik, <jorge@laser.satlink.net>
21  */
22
23 #include <linux/config.h>
24 #include <linux/mm.h>
25 #include <linux/module.h>
26 #include <linux/sysctl.h>
27 #include <linux/workqueue.h>
28 #include <net/tcp.h>
29 #include <net/inet_common.h>
30 #include <net/xfrm.h>
31
32 #ifdef CONFIG_SYSCTL
33 #define SYNC_INIT 0 /* let the user enable it */
34 #else
35 #define SYNC_INIT 1
36 #endif
37
38 int sysctl_tcp_tw_recycle;
39 int sysctl_tcp_max_tw_buckets = NR_FILE*2;
40
41 int sysctl_tcp_syncookies = SYNC_INIT; 
42 int sysctl_tcp_abort_on_overflow;
43
44 static void tcp_tw_schedule(struct inet_timewait_sock *tw, int timeo);
45
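/* Segment acceptability test, in the spirit of RFC 793's "SEGMENT ARRIVES"
 * checks: non-zero if [seq, end_seq) overlaps the receive window
 * [s_win, e_win), with the usual special cases for zero-length segments
 * and a zero-sized window.  For example, a bare ACK with
 * seq == end_seq == s_win is acceptable even when the window is closed.
 */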
46 static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
47 {
48         if (seq == s_win)
49                 return 1;
50         if (after(end_seq, s_win) && before(seq, e_win))
51                 return 1;
52         return (seq == e_win && seq == end_seq);
53 }
54
55 /* New-style handling of TIME_WAIT sockets. */
56
57 int tcp_tw_count;
58
59 /* 
60  * * The main purpose of the TIME-WAIT state is to close the connection gracefully,
61  *   when one of the ends sits in LAST-ACK or CLOSING retransmitting its FIN
62  *   (and, probably, a tail of data) and one or more of our ACKs are lost.
63  * * What is the TIME-WAIT timeout? It is associated with the maximal packet
64  *   lifetime in the internet, which leads to the wrong conclusion that
65  *   it is set to catch "old duplicate segments" wandering out of their path.
66  *   That is not quite correct. The timeout is calculated so that it exceeds
67  *   the maximal retransmission timeout by enough to cover the loss of one (or more)
68  *   segments sent by the peer and of our ACKs. This time may be calculated from the RTO.
69  * * When a TIME-WAIT socket receives an RST, it means that the other end has
70  *   finally closed and we are allowed to kill the TIME-WAIT state too.
71  * * The second purpose of TIME-WAIT is catching old duplicate segments.
72  *   Well, certainly it is pure paranoia, but if we load TIME-WAIT
73  *   with these semantics, we MUST NOT kill the TIME-WAIT state with RSTs.
74  * * If we invented some more clever way to catch duplicates
75  *   (e.g. based on PAWS), we could truncate TIME-WAIT to several RTOs.
76  *
77  * The algorithm below is based on a FORMAL INTERPRETATION of the RFCs.
78  * When you compare it to the RFCs, please read the section SEGMENT ARRIVES
79  * from the very beginning.
80  *
81  * NOTE. With recycling (and later with fin-wait-2) the TW bucket
82  * is _not_ stateless. It means that, strictly speaking, we must
83  * spinlock it. I do not want to! Well, the probability of misbehaviour
84  * is ridiculously low and, it seems, we could use some mb() tricks
85  * to avoid misreading sequence numbers, states etc.  --ANK
86  */
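/* Return values, roughly (the authoritative interpretation is the TIME-WAIT
 * switch in the callers, e.g. tcp_v4_rcv()):
 *   TCP_TW_SUCCESS - segment was consumed or ignored, caller just drops it;
 *   TCP_TW_RST     - caller should answer with a reset;
 *   TCP_TW_ACK     - caller should answer with an ACK;
 *   TCP_TW_SYN     - acceptable new SYN, caller may pass it on to a listening
 *                    socket to reopen the connection.
 */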
87 enum tcp_tw_status
88 tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
89                            const struct tcphdr *th)
90 {
91         struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
92         struct tcp_options_received tmp_opt;
93         int paws_reject = 0;
94
95         tmp_opt.saw_tstamp = 0;
96         if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
97                 tcp_parse_options(skb, &tmp_opt, 0);
98
99                 if (tmp_opt.saw_tstamp) {
100                         tmp_opt.ts_recent       = tcptw->tw_ts_recent;
101                         tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
102                         paws_reject = tcp_paws_check(&tmp_opt, th->rst);
103                 }
104         }
105
106         if (tw->tw_substate == TCP_FIN_WAIT2) {
107                 /* Just repeat all the checks of tcp_rcv_state_process() */
108
109                 /* Out of window, send ACK */
110                 if (paws_reject ||
111                     !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
112                                    tcptw->tw_rcv_nxt,
113                                    tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))
114                         return TCP_TW_ACK;
115
116                 if (th->rst)
117                         goto kill;
118
119                 if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt))
120                         goto kill_with_rst;
121
122                 /* Dup ACK? */
123                 if (!after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
124                     TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
125                         inet_twsk_put(tw);
126                         return TCP_TW_SUCCESS;
127                 }
128
129                 /* New data or FIN. If new data arrive after half-duplex close,
130                  * reset.
131                  */
132                 if (!th->fin ||
133                     TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
134 kill_with_rst:
135                         tcp_tw_deschedule(tw);
136                         inet_twsk_put(tw);
137                         return TCP_TW_RST;
138                 }
139
140                 /* FIN arrived, enter true time-wait state. */
141                 tw->tw_substate   = TCP_TIME_WAIT;
142                 tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
143                 if (tmp_opt.saw_tstamp) {
144                         tcptw->tw_ts_recent_stamp = xtime.tv_sec;
145                         tcptw->tw_ts_recent       = tmp_opt.rcv_tsval;
146                 }
147
148                 /* I am ashamed, but I failed to make it more elegant.
149                  * Yes, it is a direct reference to IP, which is impossible
150                  * to generalize to IPv6. Taking into account that IPv6
151                  * does not understand recycling in any case, it is not
152                  * a big problem in practice. --ANK */
153                 if (tw->tw_family == AF_INET &&
154                     sysctl_tcp_tw_recycle && tcptw->tw_ts_recent_stamp &&
155                     tcp_v4_tw_remember_stamp(tw))
156                         tcp_tw_schedule(tw, tw->tw_timeout);
157                 else
158                         tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
159                 return TCP_TW_ACK;
160         }
161
162         /*
163          *      Now real TIME-WAIT state.
164          *
165          *      RFC 1122:
166          *      "When a connection is [...] on TIME-WAIT state [...]
167          *      [a TCP] MAY accept a new SYN from the remote TCP to
168          *      reopen the connection directly, if it:
169          *      
170          *      (1)  assigns its initial sequence number for the new
171          *      connection to be larger than the largest sequence
172          *      number it used on the previous connection incarnation,
173          *      and
174          *
175          *      (2)  returns to TIME-WAIT state if the SYN turns out 
176          *      to be an old duplicate".
177          */
178
179         if (!paws_reject &&
180             (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
181              (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
182                 /* In window segment, it may be only reset or bare ack. */
183
184                 if (th->rst) {
185                         /* This is TIME_WAIT assassination, in two flavors.
186                          * Oh well... nobody has a sufficient solution to this
187                          * protocol bug yet.
188                          */
189                         if (sysctl_tcp_rfc1337 == 0) {
190 kill:
191                                 tcp_tw_deschedule(tw);
192                                 inet_twsk_put(tw);
193                                 return TCP_TW_SUCCESS;
194                         }
195                 }
196                 tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
197
198                 if (tmp_opt.saw_tstamp) {
199                         tcptw->tw_ts_recent       = tmp_opt.rcv_tsval;
200                         tcptw->tw_ts_recent_stamp = xtime.tv_sec;
201                 }
202
203                 inet_twsk_put(tw);
204                 return TCP_TW_SUCCESS;
205         }
206
207         /* Out of window segment.
208
209            All the segments are ACKed immediately.
210
211            The only exception is a new SYN. We accept it if it is
212            not an old duplicate and we are not in danger of being killed
213            by delayed old duplicates. The RFC check, that it carries a
214            newer sequence number, works at rates < 40 Mbit/sec.
215            However, if PAWS works, it is reliable and, even more,
216            we may relax the silly seq space cutoff.
217
218            RED-PEN: we violate the main RFC requirement: if this SYN turns out
219            to be an old duplicate (i.e. we receive an RST in reply to our SYN-ACK),
220            we must return the socket to time-wait state. It is not good,
221            but not fatal yet.
222          */
223
224         if (th->syn && !th->rst && !th->ack && !paws_reject &&
225             (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) ||
226              (tmp_opt.saw_tstamp &&
227               (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
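                /* Reopening from TIME-WAIT: pick an initial sequence number
                 * safely above anything the old incarnation could have used
                 * (its last snd_nxt plus a bit more than one maximal window).
                 * Zero appears to be treated by the callers as "no ISN
                 * override", so bump it to 1 in that case. */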
228                 u32 isn = tcptw->tw_snd_nxt + 65535 + 2;
229                 if (isn == 0)
230                         isn++;
231                 TCP_SKB_CB(skb)->when = isn;
232                 return TCP_TW_SYN;
233         }
234
235         if (paws_reject)
236                 NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
237
238         if (!th->rst) {
239                 /* In this case we must reset the TIMEWAIT timer.
240                  *
241                  * If it is ACKless SYN it may be both old duplicate
242                  * and new good SYN with random sequence number <rcv_nxt.
243                  * Do not reschedule in the last case.
244                  */
245                 if (paws_reject || th->ack)
246                         tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
247
248                 /* Send ACK. Note, we do not put the bucket,
249                  * it will be released by caller.
250                  */
251                 return TCP_TW_ACK;
252         }
253         inet_twsk_put(tw);
254         return TCP_TW_SUCCESS;
255 }
256
257 /* 
258  * Move a socket to time-wait or dead fin-wait-2 state.
259  */ 
260 void tcp_time_wait(struct sock *sk, int state, int timeo)
261 {
262         struct inet_timewait_sock *tw = NULL;
263         const struct tcp_sock *tp = tcp_sk(sk);
264         int recycle_ok = 0;
265
266         if (sysctl_tcp_tw_recycle && tp->rx_opt.ts_recent_stamp)
267                 recycle_ok = tp->af_specific->remember_stamp(sk);
268
269         if (tcp_tw_count < sysctl_tcp_max_tw_buckets)
270                 tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_slab, SLAB_ATOMIC);
271
272         if (tw != NULL) {
273                 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
274                 const struct inet_sock *inet = inet_sk(sk);
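                /* 4*RTO - RTO/2 = 3.5*RTO, the same "wait for two
                 * retransmitted FINs" rationale spelled out in
                 * tcp_tw_schedule() below. */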
275                 const int rto = (tp->rto << 2) - (tp->rto >> 1);
276
277                 /* Remember our protocol */
278                 tw->tw_prot             = sk->sk_prot_creator;
279
280                 /* Give us an identity. */
281                 tw->tw_daddr            = inet->daddr;
282                 tw->tw_rcv_saddr        = inet->rcv_saddr;
283                 tw->tw_bound_dev_if     = sk->sk_bound_dev_if;
284                 tw->tw_num              = inet->num;
285                 tw->tw_state            = TCP_TIME_WAIT;
286                 tw->tw_substate         = state;
287                 tw->tw_sport            = inet->sport;
288                 tw->tw_dport            = inet->dport;
289                 tw->tw_family           = sk->sk_family;
290                 tw->tw_reuse            = sk->sk_reuse;
291                 tw->tw_rcv_wscale       = tp->rx_opt.rcv_wscale;
292                 atomic_set(&tw->tw_refcnt, 1);
293
294                 tw->tw_hashent          = sk->sk_hashent;
295                 tcptw->tw_rcv_nxt       = tp->rcv_nxt;
296                 tcptw->tw_snd_nxt       = tp->snd_nxt;
297                 tcptw->tw_rcv_wnd       = tcp_receive_window(tp);
298                 tcptw->tw_ts_recent     = tp->rx_opt.ts_recent;
299                 tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
300                 inet_twsk_dead_node_init(tw);
301
302 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
303                 if (tw->tw_family == PF_INET6) {
304                         struct ipv6_pinfo *np = inet6_sk(sk);
305                         struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw);
306
307                         ipv6_addr_copy(&tcp6tw->tw_v6_daddr, &np->daddr);
308                         ipv6_addr_copy(&tcp6tw->tw_v6_rcv_saddr, &np->rcv_saddr);
309                         tw->tw_ipv6only = np->ipv6only;
310                 } else
311                         tw->tw_ipv6only = 0;
312 #endif
313                 /* Linkage updates. */
314                 __inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
315
316                 /* Get the TIME_WAIT timeout firing. */
317                 if (timeo < rto)
318                         timeo = rto;
319
320                 if (recycle_ok) {
321                         tw->tw_timeout = rto;
322                 } else {
323                         tw->tw_timeout = TCP_TIMEWAIT_LEN;
324                         if (state == TCP_TIME_WAIT)
325                                 timeo = TCP_TIMEWAIT_LEN;
326                 }
327
328                 tcp_tw_schedule(tw, timeo);
329                 inet_twsk_put(tw);
330         } else {
331                 /* Sorry, if we're out of memory, just CLOSE this
332                  * socket up.  We've got bigger problems than
333                  * non-graceful socket closings.
334                  */
335                 if (net_ratelimit())
336                         printk(KERN_INFO "TCP: time wait bucket table overflow\n");
337         }
338
339         tcp_update_metrics(sk);
340         tcp_done(sk);
341 }
342
343 /* Kill off TIME_WAIT sockets once their lifetime has expired. */
344 static int tcp_tw_death_row_slot;
345
346 static void tcp_twkill(unsigned long);
347
348 /* TIME_WAIT reaping mechanism. */
349 #define TCP_TWKILL_SLOTS        8       /* Please keep this a power of 2. */
350 #define TCP_TWKILL_PERIOD       (TCP_TIMEWAIT_LEN/TCP_TWKILL_SLOTS)
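/* With the usual TCP_TIMEWAIT_LEN of 60*HZ and 8 slots, each death-row slot
 * covers roughly 7.5 seconds of TIME-WAIT lifetime. */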
351
352 #define TCP_TWKILL_QUOTA        100
353
354 static struct hlist_head tcp_tw_death_row[TCP_TWKILL_SLOTS];
355 static DEFINE_SPINLOCK(tw_death_lock);
356 static struct timer_list tcp_tw_timer = TIMER_INITIALIZER(tcp_twkill, 0, 0);
357 static void twkill_work(void *);
358 static DECLARE_WORK(tcp_twkill_work, twkill_work, NULL);
359 static u32 twkill_thread_slots;
360
361 /* Returns non-zero if quota exceeded.  */
362 static int tcp_do_twkill_work(int slot, unsigned int quota)
363 {
364         struct inet_timewait_sock *tw;
365         struct hlist_node *node;
366         unsigned int killed;
367         int ret;
368
369         /* NOTE: compare this to the previous version, where the lock
370          * was released after detaching the chain. It was racy,
371          * because tw buckets are scheduled in a non-serialized context
372          * in 2.3 (with netfilter), and with softnet this is common, because
373          * soft irqs are not sequenced.
374          */
375         killed = 0;
376         ret = 0;
377 rescan:
378         inet_twsk_for_each_inmate(tw, node, &tcp_tw_death_row[slot]) {
379                 __inet_twsk_del_dead_node(tw);
380                 spin_unlock(&tw_death_lock);
381                 __inet_twsk_kill(tw, &tcp_hashinfo);
382                 inet_twsk_put(tw);
383                 killed++;
384                 spin_lock(&tw_death_lock);
385                 if (killed > quota) {
386                         ret = 1;
387                         break;
388                 }
389
390                 /* While we dropped tw_death_lock, another cpu may have
391                  * killed off the next TW bucket in the list, therefore
392                  * do a fresh re-read of the hlist head node with the
393                  * lock reacquired.  We still use the hlist traversal
394                  * macro in order to get the prefetches.
395                  */
396                 goto rescan;
397         }
398
399         tcp_tw_count -= killed;
400         NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed);
401
402         return ret;
403 }
404
405 static void tcp_twkill(unsigned long dummy)
406 {
407         int need_timer, ret;
408
409         spin_lock(&tw_death_lock);
410
411         if (tcp_tw_count == 0)
412                 goto out;
413
414         need_timer = 0;
415         ret = tcp_do_twkill_work(tcp_tw_death_row_slot, TCP_TWKILL_QUOTA);
416         if (ret) {
417                 twkill_thread_slots |= (1 << tcp_tw_death_row_slot);
418                 mb();
419                 schedule_work(&tcp_twkill_work);
420                 need_timer = 1;
421         } else {
422                 /* We purged the entire slot, anything left?  */
423                 if (tcp_tw_count)
424                         need_timer = 1;
425         }
426         tcp_tw_death_row_slot =
427                 ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));
428         if (need_timer)
429                 mod_timer(&tcp_tw_timer, jiffies + TCP_TWKILL_PERIOD);
430 out:
431         spin_unlock(&tw_death_lock);
432 }
433
434 extern void twkill_slots_invalid(void);
435
436 static void twkill_work(void *dummy)
437 {
438         int i;
439
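        /* Link-time sanity check: twkill_slots_invalid() is declared but never
         * defined, so if the compiler cannot prove this branch dead (i.e. the
         * slot bitmap no longer fits in twkill_thread_slots) the kernel fails
         * to link instead of silently misbehaving. */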
440         if ((TCP_TWKILL_SLOTS - 1) > (sizeof(twkill_thread_slots) * 8))
441                 twkill_slots_invalid();
442
443         while (twkill_thread_slots) {
444                 spin_lock_bh(&tw_death_lock);
445                 for (i = 0; i < TCP_TWKILL_SLOTS; i++) {
446                         if (!(twkill_thread_slots & (1 << i)))
447                                 continue;
448
449                         while (tcp_do_twkill_work(i, TCP_TWKILL_QUOTA) != 0) {
450                                 if (need_resched()) {
451                                         spin_unlock_bh(&tw_death_lock);
452                                         schedule();
453                                         spin_lock_bh(&tw_death_lock);
454                                 }
455                         }
456
457                         twkill_thread_slots &= ~(1 << i);
458                 }
459                 spin_unlock_bh(&tw_death_lock);
460         }
461 }
462
463 /* These are always called from BH context.  See callers in
464  * tcp_input.c to verify this.
465  */
466
467 /* This is for handling early-kills of TIME_WAIT sockets. */
468 void tcp_tw_deschedule(struct inet_timewait_sock *tw)
469 {
470         spin_lock(&tw_death_lock);
471         if (inet_twsk_del_dead_node(tw)) {
472                 inet_twsk_put(tw);
473                 if (--tcp_tw_count == 0)
474                         del_timer(&tcp_tw_timer);
475         }
476         spin_unlock(&tw_death_lock);
477         __inet_twsk_kill(tw, &tcp_hashinfo);
478 }
479
480 /* Short-time timewait calendar */
481
482 static int tcp_twcal_hand = -1;
483 static int tcp_twcal_jiffie;
484 static void tcp_twcal_tick(unsigned long);
485 static struct timer_list tcp_twcal_timer =
486                 TIMER_INITIALIZER(tcp_twcal_tick, 0, 0);
487 static struct hlist_head tcp_twcal_row[TCP_TW_RECYCLE_SLOTS];
488
489 static void tcp_tw_schedule(struct inet_timewait_sock *tw, const int timeo)
490 {
491         struct hlist_head *list;
492         int slot;
493
494         /* timeout := RTO * 3.5
495          *
496          * 3.5 = 1+2+0.5 to wait for two retransmits.
497          *
498          * RATIONALE: if a FIN arrived and we entered TIME-WAIT state,
499          * our ACK acking that FIN can be lost. If N subsequent retransmitted
500          * FINs (or previous segments) are lost, the probability of such an event
501          * is p^(N+1), where p is the probability of losing a single packet, and
502          * the time to detect the loss is about RTO*(2^N - 1) with exponential
503          * backoff. The normal timewait length is calculated so that we wait
504          * at least for one retransmitted FIN (the maximal RTO is 120 sec).
505          * [ BTW Linux, following BSD, violates this requirement, waiting
506          *   only for 60 sec; we should wait at least 240 secs.
507          *   Well, 240 consumes too many resources 8)
508          * ]
509          * This interval is not reduced, to catch old duplicates and
510          * responses to our wandering segments living for two MSLs.
511          * However, if we use PAWS to detect
512          * old duplicates, we can reduce the interval to bounds required
513          * by the RTO, rather than the MSL. So, if the peer understands PAWS, we
514          * kill the tw bucket after 3.5*RTO (it is important that this number
515          * is greater than the TS tick!) and detect old duplicates with the help
516          * of PAWS.
517          */
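        /* Rough example of the rounding below (figures are illustrative only;
         * the tick width depends on HZ): with a recycle tick of ~128 ms and
         * timeo = 3.5*RTO = 700 ms, slot = ceil(700/128) = 6, so the bucket
         * lands on the short recycle calendar.  A full 60 s TIME-WAIT rounds
         * past TCP_TW_RECYCLE_SLOTS and falls through to the slow death row
         * below. */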
518         slot = (timeo + (1<<TCP_TW_RECYCLE_TICK) - 1) >> TCP_TW_RECYCLE_TICK;
519
520         spin_lock(&tw_death_lock);
521
522         /* Unlink it, if it was scheduled */
523         if (inet_twsk_del_dead_node(tw))
524                 tcp_tw_count--;
525         else
526                 atomic_inc(&tw->tw_refcnt);
527
528         if (slot >= TCP_TW_RECYCLE_SLOTS) {
529                 /* Schedule to slow timer */
530                 if (timeo >= TCP_TIMEWAIT_LEN) {
531                         slot = TCP_TWKILL_SLOTS-1;
532                 } else {
533                         slot = (timeo + TCP_TWKILL_PERIOD-1) / TCP_TWKILL_PERIOD;
534                         if (slot >= TCP_TWKILL_SLOTS)
535                                 slot = TCP_TWKILL_SLOTS-1;
536                 }
537                 tw->tw_ttd = jiffies + timeo;
538                 slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1);
539                 list = &tcp_tw_death_row[slot];
540         } else {
541                 tw->tw_ttd = jiffies + (slot << TCP_TW_RECYCLE_TICK);
542
543                 if (tcp_twcal_hand < 0) {
544                         tcp_twcal_hand = 0;
545                         tcp_twcal_jiffie = jiffies;
546                         tcp_twcal_timer.expires = tcp_twcal_jiffie + (slot<<TCP_TW_RECYCLE_TICK);
547                         add_timer(&tcp_twcal_timer);
548                 } else {
549                         if (time_after(tcp_twcal_timer.expires, jiffies + (slot<<TCP_TW_RECYCLE_TICK)))
550                                 mod_timer(&tcp_twcal_timer, jiffies + (slot<<TCP_TW_RECYCLE_TICK));
551                         slot = (tcp_twcal_hand + slot)&(TCP_TW_RECYCLE_SLOTS-1);
552                 }
553                 list = &tcp_twcal_row[slot];
554         }
555
556         hlist_add_head(&tw->tw_death_node, list);
557
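        /* The first bucket added to an otherwise empty death row arms the
         * slow per-slot timer; tcp_twkill() keeps re-arming it for as long
         * as buckets remain. */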
558         if (tcp_tw_count++ == 0)
559                 mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD);
560         spin_unlock(&tw_death_lock);
561 }
562
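/* Timer handler for the short recycle calendar: walk the wheel starting at
 * the current hand, reap every slot whose time has come, and re-arm the
 * timer for the first future slot that still has entries. */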
563 void tcp_twcal_tick(unsigned long dummy)
564 {
565         int n, slot;
566         unsigned long j;
567         unsigned long now = jiffies;
568         int killed = 0;
569         int adv = 0;
570
571         spin_lock(&tw_death_lock);
572         if (tcp_twcal_hand < 0)
573                 goto out;
574
575         slot = tcp_twcal_hand;
576         j = tcp_twcal_jiffie;
577
578         for (n=0; n<TCP_TW_RECYCLE_SLOTS; n++) {
579                 if (time_before_eq(j, now)) {
580                         struct hlist_node *node, *safe;
581                         struct inet_timewait_sock *tw;
582
583                         inet_twsk_for_each_inmate_safe(tw, node, safe,
584                                                        &tcp_twcal_row[slot]) {
585                                 __inet_twsk_del_dead_node(tw);
586                                 __inet_twsk_kill(tw, &tcp_hashinfo);
587                                 inet_twsk_put(tw);
588                                 killed++;
589                         }
590                 } else {
591                         if (!adv) {
592                                 adv = 1;
593                                 tcp_twcal_jiffie = j;
594                                 tcp_twcal_hand = slot;
595                         }
596
597                         if (!hlist_empty(&tcp_twcal_row[slot])) {
598                                 mod_timer(&tcp_twcal_timer, j);
599                                 goto out;
600                         }
601                 }
602                 j += (1<<TCP_TW_RECYCLE_TICK);
603                 slot = (slot+1)&(TCP_TW_RECYCLE_SLOTS-1);
604         }
605         tcp_twcal_hand = -1;
606
607 out:
608         if ((tcp_tw_count -= killed) == 0)
609                 del_timer(&tcp_tw_timer);
610         NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed);
611         spin_unlock(&tw_death_lock);
612 }
613
614 /* This is not only more efficient than what we used to do, it eliminates
615  * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
616  *
617  * Actually, we could avoid a lot of memory writes here. The tp of the
618  * listening socket contains all the necessary default parameters.
619  */
620 struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
621 {
622         /* allocate the newsk from the same slab as the master sock;
623          * if not, at sk_free time we'll try to free it from the wrong
624          * slabcache (i.e. is it TCPv4 or v6?). This is handled through sk->sk_prot. -acme */
625         struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, sk->sk_prot, 0);
626
627         if (newsk != NULL) {
628                 struct inet_request_sock *ireq = inet_rsk(req);
629                 struct tcp_request_sock *treq = tcp_rsk(req);
630                 struct inet_sock *newinet = inet_sk(newsk);
631                 struct tcp_sock *newtp;
632                 struct sk_filter *filter;
633
634                 memcpy(newsk, sk, sizeof(struct tcp_sock));
635                 newsk->sk_state = TCP_SYN_RECV;
636
637                 /* SANITY */
638                 sk_node_init(&newsk->sk_node);
639                 newinet->bind_hash = NULL;
640
641                 /* Clone the TCP header template */
642                 newinet->dport = ireq->rmt_port;
643
644                 sock_lock_init(newsk);
645                 bh_lock_sock(newsk);
646
647                 rwlock_init(&newsk->sk_dst_lock);
648                 newsk->sk_dst_cache = NULL;
649                 atomic_set(&newsk->sk_rmem_alloc, 0);
650                 skb_queue_head_init(&newsk->sk_receive_queue);
651                 atomic_set(&newsk->sk_wmem_alloc, 0);
652                 skb_queue_head_init(&newsk->sk_write_queue);
653                 atomic_set(&newsk->sk_omem_alloc, 0);
654                 newsk->sk_wmem_queued = 0;
655                 newsk->sk_forward_alloc = 0;
656
657                 sock_reset_flag(newsk, SOCK_DONE);
658                 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
659                 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
660                 newsk->sk_send_head = NULL;
661                 rwlock_init(&newsk->sk_callback_lock);
662                 skb_queue_head_init(&newsk->sk_error_queue);
663                 newsk->sk_write_space = sk_stream_write_space;
664
665                 if ((filter = newsk->sk_filter) != NULL)
666                         sk_filter_charge(newsk, filter);
667
668                 if (unlikely(xfrm_sk_clone_policy(newsk))) {
669                         /* It is still a raw copy of the parent, so invalidate
670                          * the destructor and do a plain sk_free(). */
671                         newsk->sk_destruct = NULL;
672                         sk_free(newsk);
673                         return NULL;
674                 }
675
676                 /* Now setup tcp_sock */
677                 newtp = tcp_sk(newsk);
678                 newtp->pred_flags = 0;
679                 newtp->rcv_nxt = treq->rcv_isn + 1;
680                 newtp->snd_nxt = treq->snt_isn + 1;
681                 newtp->snd_una = treq->snt_isn + 1;
682                 newtp->snd_sml = treq->snt_isn + 1;
683
684                 tcp_prequeue_init(newtp);
685
686                 tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn);
687
688                 newtp->retransmits = 0;
689                 newtp->backoff = 0;
690                 newtp->srtt = 0;
691                 newtp->mdev = TCP_TIMEOUT_INIT;
692                 newtp->rto = TCP_TIMEOUT_INIT;
693
694                 newtp->packets_out = 0;
695                 newtp->left_out = 0;
696                 newtp->retrans_out = 0;
697                 newtp->sacked_out = 0;
698                 newtp->fackets_out = 0;
699                 newtp->snd_ssthresh = 0x7fffffff;
700
701                 /* So many TCP implementations out there (incorrectly) count the
702                  * initial SYN frame in their delayed-ACK and congestion control
703                  * algorithms that we must have the following bandaid to talk
704                  * efficiently to them.  -DaveM
705                  */
706                 newtp->snd_cwnd = 2;
707                 newtp->snd_cwnd_cnt = 0;
708
709                 newtp->frto_counter = 0;
710                 newtp->frto_highmark = 0;
711
712                 newtp->ca_ops = &tcp_reno;
713
714                 tcp_set_ca_state(newtp, TCP_CA_Open);
715                 tcp_init_xmit_timers(newsk);
716                 skb_queue_head_init(&newtp->out_of_order_queue);
717                 newtp->rcv_wup = treq->rcv_isn + 1;
718                 newtp->write_seq = treq->snt_isn + 1;
719                 newtp->pushed_seq = newtp->write_seq;
720                 newtp->copied_seq = treq->rcv_isn + 1;
721
722                 newtp->rx_opt.saw_tstamp = 0;
723
724                 newtp->rx_opt.dsack = 0;
725                 newtp->rx_opt.eff_sacks = 0;
726
727                 newtp->probes_out = 0;
728                 newtp->rx_opt.num_sacks = 0;
729                 newtp->urg_data = 0;
730                 /* Deinitialize accept_queue to trap illegal accesses. */
731                 memset(&newtp->accept_queue, 0, sizeof(newtp->accept_queue));
732
733                 /* Back to base struct sock members. */
734                 newsk->sk_err = 0;
735                 newsk->sk_priority = 0;
736                 atomic_set(&newsk->sk_refcnt, 2);
737
738                 /*
739                  * Increment the counter in the same struct proto as the master
740                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
741                  * is the same as sk->sk_prot->socks, as this field was copied
742                  * with memcpy), same rationale as the first comment in this
743                  * function.
744                  *
745                  * This _changes_ the previous behaviour, where
746                  * tcp_create_openreq_child always was incrementing the
747                  * equivalent to tcp_prot->socks (inet_sock_nr), so this has
748                  * to be taken into account in all callers. -acme
749                  */
750                 sk_refcnt_debug_inc(newsk);
751
752                 atomic_inc(&tcp_sockets_allocated);
753
754                 if (sock_flag(newsk, SOCK_KEEPOPEN))
755                         tcp_reset_keepalive_timer(newsk,
756                                                   keepalive_time_when(newtp));
757                 newsk->sk_socket = NULL;
758                 newsk->sk_sleep = NULL;
759
760                 newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
761                 if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) {
762                         if (sysctl_tcp_fack)
763                                 newtp->rx_opt.sack_ok |= 2;
764                 }
765                 newtp->window_clamp = req->window_clamp;
766                 newtp->rcv_ssthresh = req->rcv_wnd;
767                 newtp->rcv_wnd = req->rcv_wnd;
768                 newtp->rx_opt.wscale_ok = ireq->wscale_ok;
769                 if (newtp->rx_opt.wscale_ok) {
770                         newtp->rx_opt.snd_wscale = ireq->snd_wscale;
771                         newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
772                 } else {
773                         newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
774                         newtp->window_clamp = min(newtp->window_clamp, 65535U);
775                 }
776                 newtp->snd_wnd = ntohs(skb->h.th->window) << newtp->rx_opt.snd_wscale;
777                 newtp->max_window = newtp->snd_wnd;
778
779                 if (newtp->rx_opt.tstamp_ok) {
780                         newtp->rx_opt.ts_recent = req->ts_recent;
781                         newtp->rx_opt.ts_recent_stamp = xtime.tv_sec;
782                         newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
783                 } else {
784                         newtp->rx_opt.ts_recent_stamp = 0;
785                         newtp->tcp_header_len = sizeof(struct tcphdr);
786                 }
787                 if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len)
788                         newtp->ack.last_seg_size = skb->len-newtp->tcp_header_len;
789                 newtp->rx_opt.mss_clamp = req->mss;
790                 TCP_ECN_openreq_child(newtp, req);
791                 if (newtp->ecn_flags&TCP_ECN_OK)
792                         sock_set_flag(newsk, SOCK_NO_LARGESEND);
793
794                 TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS);
795         }
796         return newsk;
797 }
798
799 /* 
800  *      Process an incoming packet for SYN_RECV sockets represented
801  *      as a request_sock.
802  */
803
804 struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
805                            struct request_sock *req,
806                            struct request_sock **prev)
807 {
808         struct tcphdr *th = skb->h.th;
809         struct tcp_sock *tp = tcp_sk(sk);
810         u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
811         int paws_reject = 0;
812         struct tcp_options_received tmp_opt;
813         struct sock *child;
814
815         tmp_opt.saw_tstamp = 0;
816         if (th->doff > (sizeof(struct tcphdr)>>2)) {
817                 tcp_parse_options(skb, &tmp_opt, 0);
818
819                 if (tmp_opt.saw_tstamp) {
820                         tmp_opt.ts_recent = req->ts_recent;
821                         /* We do not store the true stamp, but it is not required;
822                          * it can be estimated (approximately)
823                          * from other data.
824                          */
825                         tmp_opt.ts_recent_stamp = xtime.tv_sec - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);
826                         paws_reject = tcp_paws_check(&tmp_opt, th->rst);
827                 }
828         }
829
830         /* Check for pure retransmitted SYN. */
831         if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
832             flg == TCP_FLAG_SYN &&
833             !paws_reject) {
834                 /*
835                  * RFC793 draws (Incorrectly! It was fixed in RFC1122)
836                  * this case on figure 6 and figure 8, but formal
837                  * protocol description says NOTHING.
838                  * To be more exact, it says that we should send ACK,
839                  * because this segment (at least, if it has no data)
840                  * is out of window.
841                  *
842                  *  CONCLUSION: RFC793 (even with RFC1122) DOES NOT
843                  *  describe SYN-RECV state. All the description
844                  *  is wrong, we cannot believe it and should
845                  *  rely only on common sense and implementation
846                  *  experience.
847                  *
848                  * Enforce "SYN-ACK" according to figure 8, figure 6
849                  * of RFC793, fixed by RFC1122.
850                  */
851                 req->rsk_ops->rtx_syn_ack(sk, req, NULL);
852                 return NULL;
853         }
854
855         /* The code below further reproduces the section "SEGMENT ARRIVES"
856            for the SYN-RECEIVED state of RFC793.
857            It is broken, however: the only case in which it does not work
858            is when SYNs are crossed.
859
860            You would think that SYN crossing is impossible here, since
861            we should have a SYN_SENT socket (from connect()) on our end,
862            but this is not true if the crossed SYNs were sent to both
863            ends by a malicious third party.  We must defend against this,
864            and to do that we first verify the ACK (as per RFC793, page
865            36) and reset if it is invalid.  Is this a true full defense?
866            To convince ourselves, let us consider a way in which the ACK
867            test can still pass in this 'malicious crossed SYNs' case.
868            Malicious sender sends identical SYNs (and thus identical sequence
869            numbers) to both A and B:
870
871                 A: gets SYN, seq=7
872                 B: gets SYN, seq=7
873
874            By our good fortune, both A and B select the same initial
875            send sequence number of seven :-)
876
877                 A: sends SYN|ACK, seq=7, ack_seq=8
878                 B: sends SYN|ACK, seq=7, ack_seq=8
879
880            So we are now A eating this SYN|ACK, ACK test passes.  So
881            does sequence test, SYN is truncated, and thus we consider
882            it a bare ACK.
883
884            If tp->defer_accept, we silently drop this bare ACK.  Otherwise,
885            we create an established connection.  Both ends (listening sockets)
886            accept the new incoming connection and try to talk to each other. 8-)
887
888            Note: This case is both harmless and rare.  The possibility is about the
889            same as us discovering intelligent life on another planet tomorrow.
890
891            But generally, we should (the RFC lies!) accept the ACK
892            of the SYNACK both here and in tcp_rcv_state_process().
893            tcp_rcv_state_process() does not, hence, we do not either.
894
895            Note that the case is absolutely generic:
896            we cannot optimize anything here without
897            violating protocol. All the checks must be made
898            before attempt to create socket.
899          */
900
901         /* RFC793 page 36: "If the connection is in any non-synchronized state ...
902          *                  and the incoming segment acknowledges something not yet
903          *                  sent (the segment carries an unacceptable ACK) ...
904          *                  a reset is sent."
905          *
906          * Invalid ACK: reset will be sent by listening socket
907          */
908         if ((flg & TCP_FLAG_ACK) &&
909             (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1))
910                 return sk;
911
912         /* Also, it would not be a bad idea to check rcv_tsecr, which
913          * is essentially an ACK extension; too-early or too-late values
914          * should cause a reset in unsynchronized states.
915          */
916
917         /* RFC793: "first check sequence number". */
918
919         if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
920                                           tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) {
921                 /* Out of window: send ACK and drop. */
922                 if (!(flg & TCP_FLAG_RST))
923                         req->rsk_ops->send_ack(skb, req);
924                 if (paws_reject)
925                         NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
926                 return NULL;
927         }
928
929         /* In sequence, PAWS is OK. */
930
931         if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1))
932                 req->ts_recent = tmp_opt.rcv_tsval;
933
934         if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
935                 /* Truncate SYN, it is out of window starting
936                    at tcp_rsk(req)->rcv_isn + 1. */
937                 flg &= ~TCP_FLAG_SYN;
938         }
939
940         /* RFC793: "second check the RST bit" and
941          *         "fourth, check the SYN bit"
942          */
943         if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN))
944                 goto embryonic_reset;
945
946         /* ACK sequence verified above, just make sure ACK is
947          * set.  If ACK not set, just silently drop the packet.
948          */
949         if (!(flg & TCP_FLAG_ACK))
950                 return NULL;
951
952         /* If TCP_DEFER_ACCEPT is set, drop the bare ACK. */
953         if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
954                 inet_rsk(req)->acked = 1;
955                 return NULL;
956         }
957
958         /* OK, the ACK is valid, create the big socket and
959          * feed this segment to it. It will repeat all
960          * the tests. THIS SEGMENT MUST MOVE THE SOCKET TO
961          * ESTABLISHED STATE. If it is dropped after the
962          * socket is created, expect trouble.
963          */
964         child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
965         if (child == NULL)
966                 goto listen_overflow;
967
968         tcp_synq_unlink(tp, req, prev);
969         tcp_synq_removed(sk, req);
970
971         tcp_acceptq_queue(sk, req, child);
972         return child;
973
974 listen_overflow:
975         if (!sysctl_tcp_abort_on_overflow) {
976                 inet_rsk(req)->acked = 1;
977                 return NULL;
978         }
979
980 embryonic_reset:
981         NET_INC_STATS_BH(LINUX_MIB_EMBRYONICRSTS);
982         if (!(flg & TCP_FLAG_RST))
983                 req->rsk_ops->send_reset(skb);
984
985         tcp_synq_drop(sk, req, prev);
986         return NULL;
987 }
988
989 /*
990  * Queue segment on the new socket if the new socket is active,
991  * otherwise we just shortcircuit this and continue with
992  * the new socket.
993  */
994
995 int tcp_child_process(struct sock *parent, struct sock *child,
996                       struct sk_buff *skb)
997 {
998         int ret = 0;
999         int state = child->sk_state;
1000
1001         if (!sock_owned_by_user(child)) {
1002                 ret = tcp_rcv_state_process(child, skb, skb->h.th, skb->len);
1003
1004                 /* Wakeup parent, send SIGIO */
1005                 if (state == TCP_SYN_RECV && child->sk_state != state)
1006                         parent->sk_data_ready(parent, 0);
1007         } else {
1008                 /* Alas, it is possible again, because we do the lookup
1009                  * in the main socket hash table and the lock on the listening
1010                  * socket no longer protects us.
1011                  */
1012                 sk_add_backlog(child, skb);
1013         }
1014
1015         bh_unlock_sock(child);
1016         sock_put(child);
1017         return ret;
1018 }
1019
1020 EXPORT_SYMBOL(tcp_check_req);
1021 EXPORT_SYMBOL(tcp_child_process);
1022 EXPORT_SYMBOL(tcp_create_openreq_child);
1023 EXPORT_SYMBOL(tcp_timewait_state_process);
1024 EXPORT_SYMBOL(tcp_tw_deschedule);