1 /* Connection state tracking for netfilter.  This is separated from,
2    but required by, the NAT layer; it can also be used by an iptables
3    extension. */
4
5 /* (C) 1999-2001 Paul `Rusty' Russell  
6  * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 as
10  * published by the Free Software Foundation.
11  *
12  * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13  *      - new API and handling of conntrack/nat helpers
14  *      - now capable of multiple expectations for one master
15  * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16  *      - add usage/reference counts to ip_conntrack_expect
17  *      - export ip_conntrack[_expect]_{find_get,put} functions
18  * */
19
20 #include <linux/config.h>
21 #include <linux/types.h>
22 #include <linux/icmp.h>
23 #include <linux/ip.h>
24 #include <linux/netfilter.h>
25 #include <linux/netfilter_ipv4.h>
26 #include <linux/module.h>
27 #include <linux/skbuff.h>
28 #include <linux/proc_fs.h>
29 #include <linux/vmalloc.h>
30 #include <net/checksum.h>
31 #include <net/ip.h>
32 #include <linux/stddef.h>
33 #include <linux/sysctl.h>
34 #include <linux/slab.h>
35 #include <linux/random.h>
36 #include <linux/jhash.h>
37 #include <linux/err.h>
38 #include <linux/percpu.h>
39 #include <linux/moduleparam.h>
40 #include <linux/notifier.h>
41
42 /* ip_conntrack_lock protects the main hash table, protocol/helper/expectation
43    registrations and conntrack timers. */
44 #define ASSERT_READ_LOCK(x)
45 #define ASSERT_WRITE_LOCK(x)
46
47 #include <linux/netfilter_ipv4/ip_conntrack.h>
48 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
49 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
50 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
51 #include <linux/netfilter_ipv4/listhelp.h>
52
53 #define IP_CONNTRACK_VERSION    "2.3"
54
55 #if 0
56 #define DEBUGP printk
57 #else
58 #define DEBUGP(format, args...)
59 #endif
60
61 DEFINE_RWLOCK(ip_conntrack_lock);
62
63 /* ip_conntrack_standalone needs this */
64 atomic_t ip_conntrack_count = ATOMIC_INIT(0);
65
66 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
67 LIST_HEAD(ip_conntrack_expect_list);
68 struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
69 static LIST_HEAD(helpers);
70 unsigned int ip_conntrack_htable_size = 0;
71 int ip_conntrack_max;
72 struct list_head *ip_conntrack_hash;
73 static kmem_cache_t *ip_conntrack_cachep;
74 static kmem_cache_t *ip_conntrack_expect_cachep;
75 struct ip_conntrack ip_conntrack_untracked;
76 unsigned int ip_ct_log_invalid;
77 static LIST_HEAD(unconfirmed);
78 static int ip_conntrack_vmalloc;
79
80 static unsigned int ip_conntrack_next_id = 1;
81 static unsigned int ip_conntrack_expect_next_id = 1;
82 #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
83 struct notifier_block *ip_conntrack_chain;
84 struct notifier_block *ip_conntrack_expect_chain;
85
86 DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
87
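/* Per-CPU event cache: while a packet is being processed, events for the
 * conntrack attached to that skb are only accumulated in ecache->events.
 * They are flushed to the notifier chain in a single notifier_call_chain()
 * call once the next packet belongs to a different conntrack, or when
 * delivery is requested explicitly (see
 * ip_conntrack_deliver_cached_events_for() below), so several events per
 * packet collapse into one notification. */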
88 static inline void __deliver_cached_events(struct ip_conntrack_ecache *ecache)
89 {
90         if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
91                 notifier_call_chain(&ip_conntrack_chain, ecache->events,
92                                     ecache->ct);
93         ecache->events = 0;
94 }
95
96 void __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
97 {
98         __deliver_cached_events(ecache);
99 }
100
101 /* Deliver all cached events for a particular conntrack. This is called
102  * by code prior to async packet handling or freeing the skb */
103 void 
104 ip_conntrack_deliver_cached_events_for(const struct ip_conntrack *ct)
105 {
106         struct ip_conntrack_ecache *ecache = 
107                                         &__get_cpu_var(ip_conntrack_ecache);
108
109         if (!ct)
110                 return;
111
112         if (ecache->ct == ct) {
113                 DEBUGP("ecache: delivering event for %p\n", ct);
114                 __deliver_cached_events(ecache);
115         } else {
116                 if (net_ratelimit())
117                         printk(KERN_WARNING "ecache: want to deliver for %p, "
118                                 "but cache has %p\n", ct, ecache->ct);
119         }
120
121         /* signal that events have already been delivered */
122         ecache->ct = NULL;
123 }
124
125 /* Deliver any pending cached events if the current conntrack differs from the cached one */
126 void ip_conntrack_event_cache_init(const struct sk_buff *skb)
127 {
128         struct ip_conntrack *ct = (struct ip_conntrack *) skb->nfct;
129         struct ip_conntrack_ecache *ecache = 
130                                         &__get_cpu_var(ip_conntrack_ecache);
131
132         /* take care of delivering potentially old events */
133         if (ecache->ct != ct) {
134                 enum ip_conntrack_info ctinfo;
135                 /* we have to check, since at startup the cache is NULL */
136                 if (likely(ecache->ct)) {
137                         DEBUGP("ecache: entered for different conntrack: "
138                                "ecache->ct=%p, skb->nfct=%p. delivering "
139                                "events\n", ecache->ct, ct);
140                         __deliver_cached_events(ecache);
141                         ip_conntrack_put(ecache->ct);
142                 } else {
143                         DEBUGP("ecache: entered for conntrack %p, "
144                                 "cache was clean before\n", ct);
145                 }
146
147                 /* initialize for this conntrack/packet */
148                 ecache->ct = ip_conntrack_get(skb, &ctinfo);
149                 /* ecache->events cleared by __deliver_cached_events() */
150         } else {
151                 DEBUGP("ecache: re-entered for conntrack %p.\n", ct);
152         }
153 }
154
155 #endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
156
157 DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
158
159 static int ip_conntrack_hash_rnd_initted;
160 static unsigned int ip_conntrack_hash_rnd;
161
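/* Map a tuple to a hash bucket: jhash_3words() mixes the source address,
 * the destination address xor'd with the protocol number, and both
 * layer-4 port fields packed into one word, keyed with the lazily
 * initialised random value ip_conntrack_hash_rnd so that bucket placement
 * is hard to predict from outside; the result is reduced modulo
 * ip_conntrack_htable_size. */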
162 static u_int32_t
163 hash_conntrack(const struct ip_conntrack_tuple *tuple)
164 {
165 #if 0
166         dump_tuple(tuple);
167 #endif
168         return (jhash_3words(tuple->src.ip,
169                              (tuple->dst.ip ^ tuple->dst.protonum),
170                              (tuple->src.u.all | (tuple->dst.u.all << 16)),
171                              ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
172 }
173
174 int
175 ip_ct_get_tuple(const struct iphdr *iph,
176                 const struct sk_buff *skb,
177                 unsigned int dataoff,
178                 struct ip_conntrack_tuple *tuple,
179                 const struct ip_conntrack_protocol *protocol)
180 {
181         /* Should never happen */
182         if (iph->frag_off & htons(IP_OFFSET)) {
183                 printk("ip_conntrack_core: Frag of proto %u.\n",
184                        iph->protocol);
185                 return 0;
186         }
187
188         tuple->src.ip = iph->saddr;
189         tuple->dst.ip = iph->daddr;
190         tuple->dst.protonum = iph->protocol;
191         tuple->dst.dir = IP_CT_DIR_ORIGINAL;
192
193         return protocol->pkt_to_tuple(skb, dataoff, tuple);
194 }
195
196 int
197 ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
198                    const struct ip_conntrack_tuple *orig,
199                    const struct ip_conntrack_protocol *protocol)
200 {
201         inverse->src.ip = orig->dst.ip;
202         inverse->dst.ip = orig->src.ip;
203         inverse->dst.protonum = orig->dst.protonum;
204         inverse->dst.dir = !orig->dst.dir;
205
206         return protocol->invert_tuple(inverse, orig);
207 }
208
209
210 /* ip_conntrack_expect helper functions */
211 static void unlink_expect(struct ip_conntrack_expect *exp)
212 {
213         ASSERT_WRITE_LOCK(&ip_conntrack_lock);
214         IP_NF_ASSERT(!timer_pending(&exp->timeout));
215         list_del(&exp->list);
216         CONNTRACK_STAT_INC(expect_delete);
217         exp->master->expecting--;
218 }
219
220 void __ip_ct_expect_unlink_destroy(struct ip_conntrack_expect *exp)
221 {
222         unlink_expect(exp);
223         ip_conntrack_expect_put(exp);
224 }
225
226 static void expectation_timed_out(unsigned long ul_expect)
227 {
228         struct ip_conntrack_expect *exp = (void *)ul_expect;
229
230         write_lock_bh(&ip_conntrack_lock);
231         unlink_expect(exp);
232         write_unlock_bh(&ip_conntrack_lock);
233         ip_conntrack_expect_put(exp);
234 }
235
236 struct ip_conntrack_expect *
237 __ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
238 {
239         struct ip_conntrack_expect *i;
240         
241         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
242                 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
243                         atomic_inc(&i->use);
244                         return i;
245                 }
246         }
247         return NULL;
248 }
249
250 /* Just find an expectation corresponding to a tuple. */
251 struct ip_conntrack_expect *
252 ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
253 {
254         struct ip_conntrack_expect *i;
255         
256         read_lock_bh(&ip_conntrack_lock);
257         i = __ip_conntrack_expect_find(tuple);
258         read_unlock_bh(&ip_conntrack_lock);
259
260         return i;
261 }
262
263 /* If an expectation for this connection is found, it is deleted from the
264  * global list and then returned. */
265 static struct ip_conntrack_expect *
266 find_expectation(const struct ip_conntrack_tuple *tuple)
267 {
268         struct ip_conntrack_expect *i;
269
270         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
271                 /* If the master is not in the hash table yet (i.e. the packet hasn't
272                    left this machine yet), how can the other end know about the expectation?
273                    Hence these are not the droids you are looking for (if
274                    master ct never got confirmed, we'd hold a reference to it
275                    and weird things would happen to future packets). */
276                 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
277                     && is_confirmed(i->master)
278                     && del_timer(&i->timeout)) {
279                         unlink_expect(i);
280                         return i;
281                 }
282         }
283         return NULL;
284 }
285
286 /* delete all expectations for this conntrack */
287 void ip_ct_remove_expectations(struct ip_conntrack *ct)
288 {
289         struct ip_conntrack_expect *i, *tmp;
290
291         /* Optimization: most connections never expect any others. */
292         if (ct->expecting == 0)
293                 return;
294
295         list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
296                 if (i->master == ct && del_timer(&i->timeout)) {
297                         unlink_expect(i);
298                         ip_conntrack_expect_put(i);
299                 }
300         }
301 }
302
303 static void
304 clean_from_lists(struct ip_conntrack *ct)
305 {
306         unsigned int ho, hr;
307         
308         DEBUGP("clean_from_lists(%p)\n", ct);
309         ASSERT_WRITE_LOCK(&ip_conntrack_lock);
310
311         ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
312         hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
313         LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
314         LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
315
316         /* Destroy all pending expectations */
317         ip_ct_remove_expectations(ct);
318 }
319
320 static void
321 destroy_conntrack(struct nf_conntrack *nfct)
322 {
323         struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
324         struct ip_conntrack_protocol *proto;
325
326         DEBUGP("destroy_conntrack(%p)\n", ct);
327         IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
328         IP_NF_ASSERT(!timer_pending(&ct->timeout));
329
330         set_bit(IPS_DYING_BIT, &ct->status);
331
332         /* To make sure we don't get any weird locking issues here:
333          * destroy_conntrack() MUST NOT be called with a write lock
334          * to ip_conntrack_lock!!! -HW */
335         proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
336         if (proto && proto->destroy)
337                 proto->destroy(ct);
338
339         if (ip_conntrack_destroyed)
340                 ip_conntrack_destroyed(ct);
341
342         write_lock_bh(&ip_conntrack_lock);
343         /* Expectations will have been removed in clean_from_lists,
344          * except TFTP can create an expectation on the first packet,
345          * before the connection is in the list, so we need to clean here,
346          * too. */
347         ip_ct_remove_expectations(ct);
348
349         /* We overload first tuple to link into unconfirmed list. */
350         if (!is_confirmed(ct)) {
351                 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
352                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
353         }
354
355         CONNTRACK_STAT_INC(delete);
356         write_unlock_bh(&ip_conntrack_lock);
357
358         if (ct->master)
359                 ip_conntrack_put(ct->master);
360
361         DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
362         ip_conntrack_free(ct);
363 }
364
365 static void death_by_timeout(unsigned long ul_conntrack)
366 {
367         struct ip_conntrack *ct = (void *)ul_conntrack;
368
369         ip_conntrack_event(IPCT_DESTROY, ct);
370         write_lock_bh(&ip_conntrack_lock);
371         /* Inside lock so preempt is disabled on module removal path.
372          * Otherwise we can get spurious warnings. */
373         CONNTRACK_STAT_INC(delete_list);
374         clean_from_lists(ct);
375         write_unlock_bh(&ip_conntrack_lock);
376         ip_conntrack_put(ct);
377 }
378
379 static inline int
380 conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
381                     const struct ip_conntrack_tuple *tuple,
382                     const struct ip_conntrack *ignored_conntrack)
383 {
384         ASSERT_READ_LOCK(&ip_conntrack_lock);
385         return tuplehash_to_ctrack(i) != ignored_conntrack
386                 && ip_ct_tuple_equal(tuple, &i->tuple);
387 }
388
389 struct ip_conntrack_tuple_hash *
390 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
391                     const struct ip_conntrack *ignored_conntrack)
392 {
393         struct ip_conntrack_tuple_hash *h;
394         unsigned int hash = hash_conntrack(tuple);
395
396         ASSERT_READ_LOCK(&ip_conntrack_lock);
397         list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
398                 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
399                         CONNTRACK_STAT_INC(found);
400                         return h;
401                 }
402                 CONNTRACK_STAT_INC(searched);
403         }
404
405         return NULL;
406 }
407
408 /* Find a connection corresponding to a tuple. */
409 struct ip_conntrack_tuple_hash *
410 ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
411                       const struct ip_conntrack *ignored_conntrack)
412 {
413         struct ip_conntrack_tuple_hash *h;
414
415         read_lock_bh(&ip_conntrack_lock);
416         h = __ip_conntrack_find(tuple, ignored_conntrack);
417         if (h)
418                 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
419         read_unlock_bh(&ip_conntrack_lock);
420
421         return h;
422 }
423
424 static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
425                                         unsigned int hash,
426                                         unsigned int repl_hash) 
427 {
428         ct->id = ++ip_conntrack_next_id;
429         list_prepend(&ip_conntrack_hash[hash],
430                      &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
431         list_prepend(&ip_conntrack_hash[repl_hash],
432                      &ct->tuplehash[IP_CT_DIR_REPLY].list);
433 }
434
435 void ip_conntrack_hash_insert(struct ip_conntrack *ct)
436 {
437         unsigned int hash, repl_hash;
438
439         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
440         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
441
442         write_lock_bh(&ip_conntrack_lock);
443         __ip_conntrack_hash_insert(ct, hash, repl_hash);
444         write_unlock_bh(&ip_conntrack_lock);
445 }
446
447 /* Confirm a connection given skb; places it in hash table */
448 int
449 __ip_conntrack_confirm(struct sk_buff **pskb)
450 {
451         unsigned int hash, repl_hash;
452         struct ip_conntrack *ct;
453         enum ip_conntrack_info ctinfo;
454
455         ct = ip_conntrack_get(*pskb, &ctinfo);
456
457         /* ipt_REJECT uses ip_conntrack_attach to attach related
458            ICMP/TCP RST packets in other direction.  Actual packet
459            which created connection will be IP_CT_NEW or for an
460            expected connection, IP_CT_RELATED. */
461         if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
462                 return NF_ACCEPT;
463
464         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
465         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
466
467         /* We're not in hash table, and we refuse to set up related
468            connections for unconfirmed conns.  But packet copies and
469            REJECT will give spurious warnings here. */
470         /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
471
472         /* No external references means no one else could have
473            confirmed us. */
474         IP_NF_ASSERT(!is_confirmed(ct));
475         DEBUGP("Confirming conntrack %p\n", ct);
476
477         write_lock_bh(&ip_conntrack_lock);
478
479         /* See if there's one in the list already, including reverse:
480            NAT could have grabbed it without realizing, since we're
481            not in the hash.  If there is, we lost the race. */
482         if (!LIST_FIND(&ip_conntrack_hash[hash],
483                        conntrack_tuple_cmp,
484                        struct ip_conntrack_tuple_hash *,
485                        &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
486             && !LIST_FIND(&ip_conntrack_hash[repl_hash],
487                           conntrack_tuple_cmp,
488                           struct ip_conntrack_tuple_hash *,
489                           &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
490                 /* Remove from unconfirmed list */
491                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
492
493                 __ip_conntrack_hash_insert(ct, hash, repl_hash);
494                 /* Timer relative to confirmation time, not original
495                    setting time, otherwise we'd get timer wrap in
496                    weird delay cases. */
497                 ct->timeout.expires += jiffies;
498                 add_timer(&ct->timeout);
499                 atomic_inc(&ct->ct_general.use);
500                 set_bit(IPS_CONFIRMED_BIT, &ct->status);
501                 CONNTRACK_STAT_INC(insert);
502                 write_unlock_bh(&ip_conntrack_lock);
503                 if (ct->helper)
504                         ip_conntrack_event_cache(IPCT_HELPER, *pskb);
505 #ifdef CONFIG_IP_NF_NAT_NEEDED
506                 if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
507                     test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
508                         ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
509 #endif
510                 ip_conntrack_event_cache(master_ct(ct) ?
511                                          IPCT_RELATED : IPCT_NEW, *pskb);
512
513                 return NF_ACCEPT;
514         }
515
516         CONNTRACK_STAT_INC(insert_failed);
517         write_unlock_bh(&ip_conntrack_lock);
518
519         return NF_DROP;
520 }
521
522 /* Returns true if a connection corresponds to the tuple (required
523    for NAT). */
524 int
525 ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
526                          const struct ip_conntrack *ignored_conntrack)
527 {
528         struct ip_conntrack_tuple_hash *h;
529
530         read_lock_bh(&ip_conntrack_lock);
531         h = __ip_conntrack_find(tuple, ignored_conntrack);
532         read_unlock_bh(&ip_conntrack_lock);
533
534         return h != NULL;
535 }
536
537 /* There's a small race here where we may free a just-assured
538    connection.  Too bad: we're in trouble anyway. */
539 static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
540 {
541         return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status));
542 }
543
544 static int early_drop(struct list_head *chain)
545 {
546         /* Traverse backwards: gives us oldest, which is roughly LRU */
547         struct ip_conntrack_tuple_hash *h;
548         struct ip_conntrack *ct = NULL;
549         int dropped = 0;
550
551         read_lock_bh(&ip_conntrack_lock);
552         h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
553         if (h) {
554                 ct = tuplehash_to_ctrack(h);
555                 atomic_inc(&ct->ct_general.use);
556         }
557         read_unlock_bh(&ip_conntrack_lock);
558
559         if (!ct)
560                 return dropped;
561
562         if (del_timer(&ct->timeout)) {
563                 death_by_timeout((unsigned long)ct);
564                 dropped = 1;
565                 CONNTRACK_STAT_INC(early_drop);
566         }
567         ip_conntrack_put(ct);
568         return dropped;
569 }
570
571 static inline int helper_cmp(const struct ip_conntrack_helper *i,
572                              const struct ip_conntrack_tuple *rtuple)
573 {
574         return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
575 }
576
577 static struct ip_conntrack_helper *
578 __ip_conntrack_helper_find( const struct ip_conntrack_tuple *tuple)
579 {
580         return LIST_FIND(&helpers, helper_cmp,
581                          struct ip_conntrack_helper *,
582                          tuple);
583 }
584
585 struct ip_conntrack_helper *
586 ip_conntrack_helper_find_get( const struct ip_conntrack_tuple *tuple)
587 {
588         struct ip_conntrack_helper *helper;
589
590         /* need ip_conntrack_lock to ensure that the helper exists until
591          * try_module_get() is called */
592         read_lock_bh(&ip_conntrack_lock);
593
594         helper = __ip_conntrack_helper_find(tuple);
595         if (helper) {
596                 /* need to increase module usage count to assure helper will
597                  * not go away while the caller is e.g. busy putting a
598                  * conntrack in the hash that uses the helper */
599                 if (!try_module_get(helper->me))
600                         helper = NULL;
601         }
602
603         read_unlock_bh(&ip_conntrack_lock);
604
605         return helper;
606 }
607
608 void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
609 {
610         module_put(helper->me);
611 }
612
613 struct ip_conntrack_protocol *
614 __ip_conntrack_proto_find(u_int8_t protocol)
615 {
616         return ip_ct_protos[protocol];
617 }
618
619 /* this is guaranteed to always return a valid protocol helper, since
620  * it falls back to generic_protocol */
621 struct ip_conntrack_protocol *
622 ip_conntrack_proto_find_get(u_int8_t protocol)
623 {
624         struct ip_conntrack_protocol *p;
625
626         preempt_disable();
627         p = __ip_conntrack_proto_find(protocol);
628         if (p) {
629                 if (!try_module_get(p->me))
630                         p = &ip_conntrack_generic_protocol;
631         }
632         preempt_enable();
633         
634         return p;
635 }
636
637 void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
638 {
639         module_put(p->me);
640 }
641
642 struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
643                                         struct ip_conntrack_tuple *repl)
644 {
645         struct ip_conntrack *conntrack;
646
647         if (!ip_conntrack_hash_rnd_initted) {
648                 get_random_bytes(&ip_conntrack_hash_rnd, 4);
649                 ip_conntrack_hash_rnd_initted = 1;
650         }
651
652         if (ip_conntrack_max
653             && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
654                 unsigned int hash = hash_conntrack(orig);
655                 /* Try dropping from this hash chain. */
656                 if (!early_drop(&ip_conntrack_hash[hash])) {
657                         if (net_ratelimit())
658                                 printk(KERN_WARNING
659                                        "ip_conntrack: table full, dropping"
660                                        " packet.\n");
661                         return ERR_PTR(-ENOMEM);
662                 }
663         }
664
665         conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
666         if (!conntrack) {
667                 DEBUGP("Can't allocate conntrack.\n");
668                 return NULL;
669         }
670
671         memset(conntrack, 0, sizeof(*conntrack));
672         atomic_set(&conntrack->ct_general.use, 1);
673         conntrack->ct_general.destroy = destroy_conntrack;
674         conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
675         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
676         /* Don't set timer yet: wait for confirmation */
677         init_timer(&conntrack->timeout);
678         conntrack->timeout.data = (unsigned long)conntrack;
679         conntrack->timeout.function = death_by_timeout;
680
681         atomic_inc(&ip_conntrack_count);
682
683         return conntrack;
684 }
685
686 void
687 ip_conntrack_free(struct ip_conntrack *conntrack)
688 {
689         atomic_dec(&ip_conntrack_count);
690         kmem_cache_free(ip_conntrack_cachep, conntrack);
691 }
692
693 /* Allocate a new conntrack: we return -ENOMEM if classification
694  * failed due to stress.   Otherwise it really is unclassifiable */
695 static struct ip_conntrack_tuple_hash *
696 init_conntrack(struct ip_conntrack_tuple *tuple,
697                struct ip_conntrack_protocol *protocol,
698                struct sk_buff *skb)
699 {
700         struct ip_conntrack *conntrack;
701         struct ip_conntrack_tuple repl_tuple;
702         struct ip_conntrack_expect *exp;
703
704         if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
705                 DEBUGP("Can't invert tuple.\n");
706                 return NULL;
707         }
708
709         if (!(conntrack = ip_conntrack_alloc(tuple, &repl_tuple)))
710                 return NULL;
711
712         if (!protocol->new(conntrack, skb)) {
713                 ip_conntrack_free(conntrack);
714                 return NULL;
715         }
716
717         write_lock_bh(&ip_conntrack_lock);
718         exp = find_expectation(tuple);
719
720         if (exp) {
721                 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
722                         conntrack, exp);
723                 /* Welcome, Mr. Bond.  We've been expecting you... */
724                 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
725                 conntrack->master = exp->master;
726 #ifdef CONFIG_IP_NF_CONNTRACK_MARK
727                 conntrack->mark = exp->master->mark;
728 #endif
729 #if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
730     defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
731                 /* this is ugly, but there is no other place to put it */
732                 conntrack->nat.masq_index = exp->master->nat.masq_index;
733 #endif
734                 nf_conntrack_get(&conntrack->master->ct_general);
735                 CONNTRACK_STAT_INC(expect_new);
736         } else {
737                 conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);
738
739                 CONNTRACK_STAT_INC(new);
740         }
741
742         /* Overload tuple linked list to put us in unconfirmed list. */
743         list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
744
745         write_unlock_bh(&ip_conntrack_lock);
746
747         if (exp) {
748                 if (exp->expectfn)
749                         exp->expectfn(conntrack, exp);
750                 ip_conntrack_expect_put(exp);
751         }
752
753         return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
754 }
755
756 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
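/* ctinfo classification as implemented below: packets in the reply
 * direction get IP_CT_ESTABLISHED + IP_CT_IS_REPLY; packets in the
 * original direction get IP_CT_ESTABLISHED once a reply has been seen,
 * IP_CT_RELATED if the connection was expected, and IP_CT_NEW otherwise. */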
757 static inline struct ip_conntrack *
758 resolve_normal_ct(struct sk_buff *skb,
759                   struct ip_conntrack_protocol *proto,
760                   int *set_reply,
761                   unsigned int hooknum,
762                   enum ip_conntrack_info *ctinfo)
763 {
764         struct ip_conntrack_tuple tuple;
765         struct ip_conntrack_tuple_hash *h;
766         struct ip_conntrack *ct;
767
768         IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
769
770         if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4, 
771                                 &tuple,proto))
772                 return NULL;
773
774         /* look for tuple match */
775         h = ip_conntrack_find_get(&tuple, NULL);
776         if (!h) {
777                 h = init_conntrack(&tuple, proto, skb);
778                 if (!h)
779                         return NULL;
780                 if (IS_ERR(h))
781                         return (void *)h;
782         }
783         ct = tuplehash_to_ctrack(h);
784
785         /* It exists; we have (non-exclusive) reference. */
786         if (DIRECTION(h) == IP_CT_DIR_REPLY) {
787                 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
788                 /* Tell the caller to set the reply bit if this packet is OK */
789                 *set_reply = 1;
790         } else {
791                 /* Once we've had two way comms, always ESTABLISHED. */
792                 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
793                         DEBUGP("ip_conntrack_in: normal packet for %p\n",
794                                ct);
795                         *ctinfo = IP_CT_ESTABLISHED;
796                 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
797                         DEBUGP("ip_conntrack_in: related packet for %p\n",
798                                ct);
799                         *ctinfo = IP_CT_RELATED;
800                 } else {
801                         DEBUGP("ip_conntrack_in: new packet for %p\n",
802                                ct);
803                         *ctinfo = IP_CT_NEW;
804                 }
805                 *set_reply = 0;
806         }
807         skb->nfct = &ct->ct_general;
808         skb->nfctinfo = *ctinfo;
809         return ct;
810 }
811
812 /* Netfilter hook itself. */
813 unsigned int ip_conntrack_in(unsigned int hooknum,
814                              struct sk_buff **pskb,
815                              const struct net_device *in,
816                              const struct net_device *out,
817                              int (*okfn)(struct sk_buff *))
818 {
819         struct ip_conntrack *ct;
820         enum ip_conntrack_info ctinfo;
821         struct ip_conntrack_protocol *proto;
822         int set_reply = 0;
823         int ret;
824
825         /* Previously seen (loopback or untracked)?  Ignore. */
826         if ((*pskb)->nfct) {
827                 CONNTRACK_STAT_INC(ignore);
828                 return NF_ACCEPT;
829         }
830
831         /* Should never happen */
832         if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
833                 if (net_ratelimit()) {
834                 printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
835                        (*pskb)->nh.iph->protocol, hooknum);
836                 }
837                 return NF_DROP;
838         }
839
840 /* Doesn't cover locally-generated broadcast, so not worth it. */
841 #if 0
842         /* Ignore broadcast: no `connection'. */
843         if ((*pskb)->pkt_type == PACKET_BROADCAST) {
844                 printk("Broadcast packet!\n");
845                 return NF_ACCEPT;
846         } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF)) 
847                    == htonl(0x000000FF)) {
848                 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
849                        NIPQUAD((*pskb)->nh.iph->saddr),
850                        NIPQUAD((*pskb)->nh.iph->daddr),
851                        (*pskb)->sk, (*pskb)->pkt_type);
852         }
853 #endif
854
855         proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);
856
857         /* It may be a special packet, error, unclean...
858          * the inverse of the return code tells the netfilter
859          * core what to do with the packet. */
860         if (proto->error != NULL 
861             && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
862                 CONNTRACK_STAT_INC(error);
863                 CONNTRACK_STAT_INC(invalid);
864                 return -ret;
865         }
866
867         if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo))) {
868                 /* Not valid part of a connection */
869                 CONNTRACK_STAT_INC(invalid);
870                 return NF_ACCEPT;
871         }
872
873         if (IS_ERR(ct)) {
874                 /* Too stressed to deal. */
875                 CONNTRACK_STAT_INC(drop);
876                 return NF_DROP;
877         }
878
879         IP_NF_ASSERT((*pskb)->nfct);
880
881         ip_conntrack_event_cache_init(*pskb);
882
883         ret = proto->packet(ct, *pskb, ctinfo);
884         if (ret < 0) {
885                 /* Invalid: inverse of the return code tells
886                  * the netfilter core what to do*/
887                 nf_conntrack_put((*pskb)->nfct);
888                 (*pskb)->nfct = NULL;
889                 CONNTRACK_STAT_INC(invalid);
890                 return -ret;
891         }
892
893         if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
894                 ip_conntrack_event_cache(IPCT_STATUS, *pskb);
895
896         return ret;
897 }
898
899 int invert_tuplepr(struct ip_conntrack_tuple *inverse,
900                    const struct ip_conntrack_tuple *orig)
901 {
902         return ip_ct_invert_tuple(inverse, orig, 
903                                   __ip_conntrack_proto_find(orig->dst.protonum));
904 }
905
906 /* Would two expected things clash? */
907 static inline int expect_clash(const struct ip_conntrack_expect *a,
908                                const struct ip_conntrack_expect *b)
909 {
910         /* Part covered by intersection of masks must be unequal,
911            otherwise they clash */
912         struct ip_conntrack_tuple intersect_mask
913                 = { { a->mask.src.ip & b->mask.src.ip,
914                       { a->mask.src.u.all & b->mask.src.u.all } },
915                     { a->mask.dst.ip & b->mask.dst.ip,
916                       { a->mask.dst.u.all & b->mask.dst.u.all },
917                       a->mask.dst.protonum & b->mask.dst.protonum } };
918
919         return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
920 }
921
922 static inline int expect_matches(const struct ip_conntrack_expect *a,
923                                  const struct ip_conntrack_expect *b)
924 {
925         return a->master == b->master
926                 && ip_ct_tuple_equal(&a->tuple, &b->tuple)
927                 && ip_ct_tuple_equal(&a->mask, &b->mask);
928 }
929
930 /* Generally a bad idea to call this: could have matched already. */
931 void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
932 {
933         struct ip_conntrack_expect *i;
934
935         write_lock_bh(&ip_conntrack_lock);
936         /* choose the oldest expectation to evict */
937         list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
938                 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
939                         unlink_expect(i);
940                         write_unlock_bh(&ip_conntrack_lock);
941                         ip_conntrack_expect_put(i);
942                         return;
943                 }
944         }
945         write_unlock_bh(&ip_conntrack_lock);
946 }
947
948 struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
949 {
950         struct ip_conntrack_expect *new;
951
952         new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
953         if (!new) {
954                 DEBUGP("expect_related: OOM allocating expect\n");
955                 return NULL;
956         }
957         new->master = me;
958         atomic_inc(&new->master->ct_general.use);
959         atomic_set(&new->use, 1);
960         return new;
961 }
962
963 void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
964 {
965         if (atomic_dec_and_test(&exp->use)) {
966                 ip_conntrack_put(exp->master);
967                 kmem_cache_free(ip_conntrack_expect_cachep, exp);
968         }
969 }
970
971 static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
972 {
973         atomic_inc(&exp->use);
974         exp->master->expecting++;
975         list_add(&exp->list, &ip_conntrack_expect_list);
976
977         init_timer(&exp->timeout);
978         exp->timeout.data = (unsigned long)exp;
979         exp->timeout.function = expectation_timed_out;
980         exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
981         add_timer(&exp->timeout);
982
983         exp->id = ++ip_conntrack_expect_next_id;
984         atomic_inc(&exp->use);
985         CONNTRACK_STAT_INC(expect_create);
986 }
987
988 /* Race with expectations being used means we could have none to find; OK. */
989 static void evict_oldest_expect(struct ip_conntrack *master)
990 {
991         struct ip_conntrack_expect *i;
992
993         list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
994                 if (i->master == master) {
995                         if (del_timer(&i->timeout)) {
996                                 unlink_expect(i);
997                                 ip_conntrack_expect_put(i);
998                         }
999                         break;
1000                 }
1001         }
1002 }
1003
1004 static inline int refresh_timer(struct ip_conntrack_expect *i)
1005 {
1006         if (!del_timer(&i->timeout))
1007                 return 0;
1008
1009         i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
1010         add_timer(&i->timeout);
1011         return 1;
1012 }
1013
1014 int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
1015 {
1016         struct ip_conntrack_expect *i;
1017         int ret;
1018
1019         DEBUGP("ip_conntrack_expect_related %p\n", expect);
1020         DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
1021         DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);
1022
1023         write_lock_bh(&ip_conntrack_lock);
1024         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
1025                 if (expect_matches(i, expect)) {
1026                         /* Refresh timer: if it's dying, ignore.. */
1027                         if (refresh_timer(i)) {
1028                                 ret = 0;
1029                                 goto out;
1030                         }
1031                 } else if (expect_clash(i, expect)) {
1032                         ret = -EBUSY;
1033                         goto out;
1034                 }
1035         }
1036
1037         /* Will be over limit? */
1038         if (expect->master->helper->max_expected && 
1039             expect->master->expecting >= expect->master->helper->max_expected)
1040                 evict_oldest_expect(expect->master);
1041
1042         ip_conntrack_expect_insert(expect);
1043         ip_conntrack_expect_event(IPEXP_NEW, expect);
1044         ret = 0;
1045 out:
1046         write_unlock_bh(&ip_conntrack_lock);
1047         return ret;
1048 }
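
/* For illustration only (not part of this file): a connection tracking
 * helper such as the FTP helper typically drives the expectation API
 * roughly as sketched below.  The surrounding context (ct, a TCP port
 * learned from the payload) is assumed and error handling is elided;
 * field names match the structures used in this file.
 *
 *        struct ip_conntrack_expect *exp = ip_conntrack_expect_alloc(ct);
 *        if (exp == NULL)
 *                return NF_DROP;
 *        exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
 *        exp->tuple.src.u.tcp.port = 0;
 *        exp->tuple.dst.u.tcp.port = htons(port);
 *        memset(&exp->mask, 0, sizeof(exp->mask));
 *        exp->mask.src.ip = 0xFFFFFFFF;
 *        exp->mask.dst.ip = 0xFFFFFFFF;
 *        exp->mask.dst.u.tcp.port = 0xFFFF;
 *        exp->mask.dst.protonum = 0xFF;
 *        exp->expectfn = NULL;
 *        if (ip_conntrack_expect_related(exp) != 0)
 *                ret = NF_DROP;        (clash with an existing expectation)
 *        ip_conntrack_expect_put(exp);
 */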
1049
1050 /* Alter reply tuple (maybe alter helper).  This is for NAT, and is
1051    implicitly racy: see __ip_conntrack_confirm */
1052 void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
1053                               const struct ip_conntrack_tuple *newreply)
1054 {
1055         write_lock_bh(&ip_conntrack_lock);
1056         /* Should be unconfirmed, so not in hash table yet */
1057         IP_NF_ASSERT(!is_confirmed(conntrack));
1058
1059         DEBUGP("Altering reply tuple of %p to ", conntrack);
1060         DUMP_TUPLE(newreply);
1061
1062         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1063         if (!conntrack->master && conntrack->expecting == 0)
1064                 conntrack->helper = __ip_conntrack_helper_find(newreply);
1065         write_unlock_bh(&ip_conntrack_lock);
1066 }
1067
1068 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
1069 {
1070         BUG_ON(me->timeout == 0);
1071         write_lock_bh(&ip_conntrack_lock);
1072         list_prepend(&helpers, me);
1073         write_unlock_bh(&ip_conntrack_lock);
1074
1075         return 0;
1076 }
1077
1078 struct ip_conntrack_helper *
1079 __ip_conntrack_helper_find_byname(const char *name)
1080 {
1081         struct ip_conntrack_helper *h;
1082
1083         list_for_each_entry(h, &helpers, list) {
1084                 if (!strcmp(h->name, name))
1085                         return h;
1086         }
1087
1088         return NULL;
1089 }
1090
1091 static inline int unhelp(struct ip_conntrack_tuple_hash *i,
1092                          const struct ip_conntrack_helper *me)
1093 {
1094         if (tuplehash_to_ctrack(i)->helper == me) {
1095                 ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
1096                 tuplehash_to_ctrack(i)->helper = NULL;
1097         }
1098         return 0;
1099 }
1100
1101 void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
1102 {
1103         unsigned int i;
1104         struct ip_conntrack_expect *exp, *tmp;
1105
1106         /* Need write lock here, to delete helper. */
1107         write_lock_bh(&ip_conntrack_lock);
1108         LIST_DELETE(&helpers, me);
1109
1110         /* Get rid of expectations */
1111         list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
1112                 if (exp->master->helper == me && del_timer(&exp->timeout)) {
1113                         unlink_expect(exp);
1114                         ip_conntrack_expect_put(exp);
1115                 }
1116         }
1117         /* Get rid of expecteds, set helpers to NULL. */
1118         LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
1119         for (i = 0; i < ip_conntrack_htable_size; i++)
1120                 LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
1121                             struct ip_conntrack_tuple_hash *, me);
1122         write_unlock_bh(&ip_conntrack_lock);
1123
1124         /* Someone could still be looking at the helper in a bh. */
1125         synchronize_net();
1126 }
1127
1128 static inline void ct_add_counters(struct ip_conntrack *ct,
1129                                    enum ip_conntrack_info ctinfo,
1130                                    const struct sk_buff *skb)
1131 {
1132 #ifdef CONFIG_IP_NF_CT_ACCT
1133         if (skb) {
1134                 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1135                 ct->counters[CTINFO2DIR(ctinfo)].bytes += 
1136                                         ntohs(skb->nh.iph->tot_len);
1137         }
1138 #endif
1139 }
1140
1141 /* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */
1142 void ip_ct_refresh_acct(struct ip_conntrack *ct, 
1143                         enum ip_conntrack_info ctinfo,
1144                         const struct sk_buff *skb,
1145                         unsigned long extra_jiffies)
1146 {
1147         IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
1148
1149         /* If not in hash table, timer will not be active yet */
1150         if (!is_confirmed(ct)) {
1151                 ct->timeout.expires = extra_jiffies;
1152                 ct_add_counters(ct, ctinfo, skb);
1153         } else {
1154                 write_lock_bh(&ip_conntrack_lock);
1155                 /* Need del_timer for race avoidance (may already be dying). */
1156                 if (del_timer(&ct->timeout)) {
1157                         ct->timeout.expires = jiffies + extra_jiffies;
1158                         add_timer(&ct->timeout);
1159                         ip_conntrack_event_cache(IPCT_REFRESH, skb);
1160                 }
1161                 ct_add_counters(ct, ctinfo, skb);
1162                 write_unlock_bh(&ip_conntrack_lock);
1163         }
1164 }
1165
1166 #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
1167     defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
1168 /* Generic function for tcp/udp/sctp/dccp and the like. This needs to be
1169  * in ip_conntrack_core, since we don't want the protocols to autoload
1170  * or depend on ctnetlink */
1171 int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1172                                const struct ip_conntrack_tuple *tuple)
1173 {
1174         NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
1175                 &tuple->src.u.tcp.port);
1176         NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
1177                 &tuple->dst.u.tcp.port);
1178         return 0;
1179
1180 nfattr_failure:
1181         return -1;
1182 }
1183
1184 int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1185                                struct ip_conntrack_tuple *t)
1186 {
1187         if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1188                 return -EINVAL;
1189
1190         t->src.u.tcp.port =
1191                 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
1192         t->dst.u.tcp.port =
1193                 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
1194
1195         return 0;
1196 }
1197 #endif
1198
1199 /* Returns new sk_buff, or NULL */
1200 struct sk_buff *
1201 ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
1202 {
1203         skb_orphan(skb);
1204
1205         local_bh_disable(); 
1206         skb = ip_defrag(skb, user);
1207         local_bh_enable();
1208
1209         if (skb)
1210                 ip_send_check(skb->nh.iph);
1211         return skb;
1212 }
1213
1214 /* Used by ipt_REJECT. */
1215 static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1216 {
1217         struct ip_conntrack *ct;
1218         enum ip_conntrack_info ctinfo;
1219
1220         /* This ICMP is in reverse direction to the packet which caused it */
1221         ct = ip_conntrack_get(skb, &ctinfo);
1222         
1223         if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1224                 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1225         else
1226                 ctinfo = IP_CT_RELATED;
1227
1228         /* Attach to new skbuff, and increment count */
1229         nskb->nfct = &ct->ct_general;
1230         nskb->nfctinfo = ctinfo;
1231         nf_conntrack_get(nskb->nfct);
1232 }
1233
1234 static inline int
1235 do_iter(const struct ip_conntrack_tuple_hash *i,
1236         int (*iter)(struct ip_conntrack *i, void *data),
1237         void *data)
1238 {
1239         return iter(tuplehash_to_ctrack(i), data);
1240 }
1241
1242 /* Bring out ya dead! */
1243 static struct ip_conntrack_tuple_hash *
1244 get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1245                 void *data, unsigned int *bucket)
1246 {
1247         struct ip_conntrack_tuple_hash *h = NULL;
1248
1249         write_lock_bh(&ip_conntrack_lock);
1250         for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
1251                 h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
1252                                 struct ip_conntrack_tuple_hash *, iter, data);
1253                 if (h)
1254                         break;
1255         }
1256         if (!h)
1257                 h = LIST_FIND_W(&unconfirmed, do_iter,
1258                                 struct ip_conntrack_tuple_hash *, iter, data);
1259         if (h)
1260                 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
1261         write_unlock_bh(&ip_conntrack_lock);
1262
1263         return h;
1264 }
1265
1266 void
1267 ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
1268 {
1269         struct ip_conntrack_tuple_hash *h;
1270         unsigned int bucket = 0;
1271
1272         while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
1273                 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1274                 /* Time to push up daisies... */
1275                 if (del_timer(&ct->timeout))
1276                         death_by_timeout((unsigned long)ct);
1277                 /* ... else the timer will get him soon. */
1278
1279                 ip_conntrack_put(ct);
1280         }
1281
1282 #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
1283         {
1284                 /* we need to deliver all cached events in order to drop
1285                  * the reference counts */
1286                 int cpu;
1287                 for_each_cpu(cpu) {
1288                         struct ip_conntrack_ecache *ecache = 
1289                                         &per_cpu(ip_conntrack_ecache, cpu);
1290                         if (ecache->ct) {
1291                                 __ip_ct_deliver_cached_events(ecache);
1292                                 ip_conntrack_put(ecache->ct);
1293                                 ecache->ct = NULL;
1294                         }
1295                 }
1296         }
1297 #endif
1298 }
1299
1300 /* Fast function for those who don't want to parse /proc (and I don't
1301    blame them). */
1302 /* Reversing the socket's dst/src point of view gives us the reply
1303    mapping. */
1304 static int
1305 getorigdst(struct sock *sk, int optval, void __user *user, int *len)
1306 {
1307         struct inet_sock *inet = inet_sk(sk);
1308         struct ip_conntrack_tuple_hash *h;
1309         struct ip_conntrack_tuple tuple;
1310         
1311         IP_CT_TUPLE_U_BLANK(&tuple);
1312         tuple.src.ip = inet->rcv_saddr;
1313         tuple.src.u.tcp.port = inet->sport;
1314         tuple.dst.ip = inet->daddr;
1315         tuple.dst.u.tcp.port = inet->dport;
1316         tuple.dst.protonum = IPPROTO_TCP;
1317
1318         /* We only do TCP at the moment: is there a better way? */
1319         if (strcmp(sk->sk_prot->name, "TCP")) {
1320                 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1321                 return -ENOPROTOOPT;
1322         }
1323
1324         if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1325                 DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1326                        *len, sizeof(struct sockaddr_in));
1327                 return -EINVAL;
1328         }
1329
1330         h = ip_conntrack_find_get(&tuple, NULL);
1331         if (h) {
1332                 struct sockaddr_in sin;
1333                 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1334
1335                 sin.sin_family = AF_INET;
1336                 sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1337                         .tuple.dst.u.tcp.port;
1338                 sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1339                         .tuple.dst.ip;
1340
1341                 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1342                        NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1343                 ip_conntrack_put(ct);
1344                 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1345                         return -EFAULT;
1346                 else
1347                         return 0;
1348         }
1349         DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1350                NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1351                NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1352         return -ENOENT;
1353 }
1354
1355 static struct nf_sockopt_ops so_getorigdst = {
1356         .pf             = PF_INET,
1357         .get_optmin     = SO_ORIGINAL_DST,
1358         .get_optmax     = SO_ORIGINAL_DST+1,
1359         .get            = &getorigdst,
1360 };
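
/* For reference (userspace side, not part of this file): a transparent
 * proxy that accepted a redirected TCP connection would typically recover
 * the pre-NAT destination via this socket option, e.g.:
 *
 *        struct sockaddr_in orig;
 *        socklen_t len = sizeof(orig);
 *        if (getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, &orig, &len) == 0)
 *                printf("%s:%u\n", inet_ntoa(orig.sin_addr),
 *                       ntohs(orig.sin_port));
 *
 * getorigdst() above fills in the original tuple's destination address and
 * port in network byte order. */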
1361
1362 static int kill_all(struct ip_conntrack *i, void *data)
1363 {
1364         return 1;
1365 }
1366
1367 static void free_conntrack_hash(void)
1368 {
1369         if (ip_conntrack_vmalloc)
1370                 vfree(ip_conntrack_hash);
1371         else
1372                 free_pages((unsigned long)ip_conntrack_hash, 
1373                            get_order(sizeof(struct list_head)
1374                                      * ip_conntrack_htable_size));
1375 }
1376
1377 void ip_conntrack_flush(void)
1378 {
1379         /* This makes sure all current packets have passed through
1380            netfilter framework.  Roll on, two-stage module
1381            delete... */
1382         synchronize_net();
1383
1384  i_see_dead_people:
1385         ip_ct_iterate_cleanup(kill_all, NULL);
1386         if (atomic_read(&ip_conntrack_count) != 0) {
1387                 schedule();
1388                 goto i_see_dead_people;
1389         }
1390         /* wait until all references to ip_conntrack_untracked are dropped */
1391         while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
1392                 schedule();
1393 }
1394
1395 /* Mishearing the voices in his head, our hero wonders how he's
1396    supposed to kill the mall. */
1397 void ip_conntrack_cleanup(void)
1398 {
1399         ip_ct_attach = NULL;
1400         ip_conntrack_flush();
1401         kmem_cache_destroy(ip_conntrack_cachep);
1402         kmem_cache_destroy(ip_conntrack_expect_cachep);
1403         free_conntrack_hash();
1404         nf_unregister_sockopt(&so_getorigdst);
1405 }
1406
1407 static int hashsize;
1408 module_param(hashsize, int, 0400);
1409
1410 int __init ip_conntrack_init(void)
1411 {
1412         unsigned int i;
1413         int ret;
1414
1415         /* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
1416          * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
1417         if (hashsize) {
1418                 ip_conntrack_htable_size = hashsize;
1419         } else {
1420                 ip_conntrack_htable_size
1421                         = (((num_physpages << PAGE_SHIFT) / 16384)
1422                            / sizeof(struct list_head));
1423                 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1424                         ip_conntrack_htable_size = 8192;
1425                 if (ip_conntrack_htable_size < 16)
1426                         ip_conntrack_htable_size = 16;
1427         }
1428         ip_conntrack_max = 8 * ip_conntrack_htable_size;
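
        /* Worked example of the sizing above, assuming 32-bit pointers
         * (sizeof(struct list_head) == 8): a 32MB machine gives
         * 33554432 / 16384 = 2048 bytes of index, i.e. 2048 / 8 = 256
         * buckets, and ip_conntrack_max = 8 * 256 = 2048 connections. */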
1429
1430         printk("ip_conntrack version %s (%u buckets, %d max)"
1431                " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1432                ip_conntrack_htable_size, ip_conntrack_max,
1433                sizeof(struct ip_conntrack));
1434
1435         ret = nf_register_sockopt(&so_getorigdst);
1436         if (ret != 0) {
1437                 printk(KERN_ERR "Unable to register netfilter socket option\n");
1438                 return ret;
1439         }
1440
1441         /* AK: the hash table is twice as big as needed because it
1442            uses list_head.  It would be much nicer for the caches to use a
1443            single-pointer list head here. */
1444         ip_conntrack_vmalloc = 0; 
1445         ip_conntrack_hash 
1446                 =(void*)__get_free_pages(GFP_KERNEL, 
1447                                          get_order(sizeof(struct list_head)
1448                                                    *ip_conntrack_htable_size));
1449         if (!ip_conntrack_hash) { 
1450                 ip_conntrack_vmalloc = 1;
1451                 printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
1452                 ip_conntrack_hash = vmalloc(sizeof(struct list_head)
1453                                             * ip_conntrack_htable_size);
1454         }
1455         if (!ip_conntrack_hash) {
1456                 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1457                 goto err_unreg_sockopt;
1458         }
1459
1460         ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1461                                                 sizeof(struct ip_conntrack), 0,
1462                                                 0, NULL, NULL);
1463         if (!ip_conntrack_cachep) {
1464                 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1465                 goto err_free_hash;
1466         }
1467
1468         ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
1469                                         sizeof(struct ip_conntrack_expect),
1470                                         0, 0, NULL, NULL);
1471         if (!ip_conntrack_expect_cachep) {
1472                 printk(KERN_ERR "Unable to create ip_expect slab cache\n");
1473                 goto err_free_conntrack_slab;
1474         }
1475
1476         /* Don't NEED lock here, but good form anyway. */
1477         write_lock_bh(&ip_conntrack_lock);
1478         for (i = 0; i < MAX_IP_CT_PROTO; i++)
1479                 ip_ct_protos[i] = &ip_conntrack_generic_protocol;
1480         /* Sew in builtin protocols. */
1481         ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
1482         ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
1483         ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
1484         write_unlock_bh(&ip_conntrack_lock);
1485
1486         for (i = 0; i < ip_conntrack_htable_size; i++)
1487                 INIT_LIST_HEAD(&ip_conntrack_hash[i]);
1488
1489         /* For use by ipt_REJECT */
1490         ip_ct_attach = ip_conntrack_attach;
1491
1492         /* Set up fake conntrack:
1493             - to never be deleted, not in any hashes */
1494         atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
1495         /*  - and make it look like a confirmed connection */
1496         set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
1497
1498         return ret;
1499
1500 err_free_conntrack_slab:
1501         kmem_cache_destroy(ip_conntrack_cachep);
1502 err_free_hash:
1503         free_conntrack_hash();
1504 err_unreg_sockopt:
1505         nf_unregister_sockopt(&so_getorigdst);
1506
1507         return -ENOMEM;
1508 }