/*
 *      IP multicast routing support for mrouted 3.6/3.8
 *
 *              (c) 1995 Alan Cox, <alan@redhat.com>
 *        Linux Consultancy and Custom Driver Development
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *      Version: $Id: ipmr.c,v 1.65 2001/10/31 21:55:54 davem Exp $
 *
 *      Fixes:
 *      Michael Chastain        :       Incorrect size of copying.
 *      Alan Cox                :       Added the cache manager code
 *      Alan Cox                :       Fixed the clone/copy bug and device race.
 *      Mike McLagan            :       Routing by source
 *      Malcolm Beattie         :       Buffer handling fixes.
 *      Alexey Kuznetsov        :       Double buffer free and other fixes.
 *      SVR Anand               :       Fixed several multicast bugs and problems.
 *      Alexey Kuznetsov        :       Status, optimisations and more.
 *      Brad Parker             :       Better behaviour on mrouted upcall
 *                                      overflow.
 *      Carlos Picoto           :       PIMv1 Support
 *      Pavlin Ivanov Radoslavov:       PIMv2 Registers must checksum only PIM header
 *                                      Relax this requirement to work with older peers.
 *
 */

#include <asm/system.h>
#include <asm/uaccess.h>
#include <linux/types.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/if_ether.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/route.h>
#include <net/sock.h>
#include <net/icmp.h>
#include <net/udp.h>
#include <net/raw.h>
#include <linux/notifier.h>
#include <linux/if_arp.h>
#include <linux/netfilter_ipv4.h>
#include <net/ipip.h>
#include <net/checksum.h>
#include <net/netlink.h>

#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
#define CONFIG_IP_PIMSM 1
#endif

static struct sock *mroute_socket;


/* Big lock, protecting the vif table, mrt cache and mroute socket state.
   Note that changes are serialized via rtnl_lock.
 */

static DEFINE_RWLOCK(mrt_lock);

/*
 *      Multicast router control variables
 */

static struct vif_device vif_table[MAXVIFS];            /* Devices              */
static int maxvif;

#define VIF_EXISTS(idx) (vif_table[idx].dev != NULL)

static int mroute_do_assert;                            /* Set in PIM assert    */
static int mroute_do_pim;

static struct mfc_cache *mfc_cache_array[MFC_LINES];    /* Forwarding cache     */

static struct mfc_cache *mfc_unres_queue;               /* Queue of unresolved entries */
static atomic_t cache_resolve_queue_len;                /* Size of unresolved   */

/* Special spinlock for queue of unresolved entries */
static DEFINE_SPINLOCK(mfc_unres_lock);

/* We return to Alan's original scheme. The hash table of resolved
   entries is changed only in process context and is protected by
   the weak lock mrt_lock. The queue of unresolved entries is
   protected by the strong spinlock mfc_unres_lock.

   This way the data path is entirely free of exclusive locks.
 */
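
/* An illustrative read-side sketch of this scheme (this mirrors what
   the data path below actually does; shown only for orientation):

        read_lock(&mrt_lock);
        cache = ipmr_cache_find(iph->saddr, iph->daddr);
        if (cache)
                ip_mr_forward(skb, cache, 0);
        read_unlock(&mrt_lock);

   Writers such as vif_add() and ipmr_mfc_add() take write_lock_bh()
   instead, and only from process context under rtnl_lock.
 */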

static struct kmem_cache *mrt_cachep __read_mostly;

static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);

#ifdef CONFIG_IP_PIMSM_V2
static struct net_protocol pim_protocol;
#endif

static struct timer_list ipmr_expire_timer;

/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */

static
struct net_device *ipmr_new_tunnel(struct vifctl *v)
{
        struct net_device  *dev;

        dev = __dev_get_by_name(&init_net, "tunl0");

        if (dev) {
                int err;
                struct ifreq ifr;
                mm_segment_t    oldfs;
                struct ip_tunnel_parm p;
                struct in_device  *in_dev;

                memset(&p, 0, sizeof(p));
                p.iph.daddr = v->vifc_rmt_addr.s_addr;
                p.iph.saddr = v->vifc_lcl_addr.s_addr;
                p.iph.version = 4;
                p.iph.ihl = 5;
                p.iph.protocol = IPPROTO_IPIP;
                sprintf(p.name, "dvmrp%d", v->vifc_vifi);
                ifr.ifr_ifru.ifru_data = (__force void __user *)&p;

                oldfs = get_fs(); set_fs(KERNEL_DS);
                err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL);
                set_fs(oldfs);

                dev = NULL;

                if (err == 0 && (dev = __dev_get_by_name(&init_net, p.name)) != NULL) {
                        dev->flags |= IFF_MULTICAST;

                        in_dev = __in_dev_get_rtnl(dev);
                        if (in_dev == NULL)
                                goto failure;

                        ipv4_devconf_setall(in_dev);
                        IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;

                        if (dev_open(dev))
                                goto failure;
                }
        }
        return dev;

failure:
        /* allow the register to be completed before unregistering. */
        rtnl_unlock();
        rtnl_lock();

        unregister_netdevice(dev);
        return NULL;
}

#ifdef CONFIG_IP_PIMSM

static int reg_vif_num = -1;

static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
{
        read_lock(&mrt_lock);
        dev->stats.tx_bytes += skb->len;
        dev->stats.tx_packets++;
        ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
        read_unlock(&mrt_lock);
        kfree_skb(skb);
        return 0;
}

static void reg_vif_setup(struct net_device *dev)
{
        dev->type               = ARPHRD_PIMREG;
        dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
        dev->flags              = IFF_NOARP;
        dev->hard_start_xmit    = reg_vif_xmit;
        dev->destructor         = free_netdev;
}

static struct net_device *ipmr_reg_vif(void)
{
        struct net_device *dev;
        struct in_device *in_dev;

        dev = alloc_netdev(0, "pimreg", reg_vif_setup);

        if (dev == NULL)
                return NULL;

        if (register_netdevice(dev)) {
                free_netdev(dev);
                return NULL;
        }
        dev->iflink = 0;

        rcu_read_lock();
        if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
                rcu_read_unlock();
                goto failure;
        }

        ipv4_devconf_setall(in_dev);
        IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
        rcu_read_unlock();

        if (dev_open(dev))
                goto failure;

        return dev;

failure:
        /* allow the register to be completed before unregistering. */
        rtnl_unlock();
        rtnl_lock();

        unregister_netdevice(dev);
        return NULL;
}
#endif

/*
 *      Delete a VIF entry
 */

static int vif_delete(int vifi)
{
        struct vif_device *v;
        struct net_device *dev;
        struct in_device *in_dev;

        if (vifi < 0 || vifi >= maxvif)
                return -EADDRNOTAVAIL;

        v = &vif_table[vifi];

        write_lock_bh(&mrt_lock);
        dev = v->dev;
        v->dev = NULL;

        if (!dev) {
                write_unlock_bh(&mrt_lock);
                return -EADDRNOTAVAIL;
        }

#ifdef CONFIG_IP_PIMSM
        if (vifi == reg_vif_num)
                reg_vif_num = -1;
#endif

        if (vifi+1 == maxvif) {
                int tmp;
                for (tmp=vifi-1; tmp>=0; tmp--) {
                        if (VIF_EXISTS(tmp))
                                break;
                }
                maxvif = tmp+1;
        }

        write_unlock_bh(&mrt_lock);

        dev_set_allmulti(dev, -1);

        if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
                IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
                ip_rt_multicast_event(in_dev);
        }

        if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
                unregister_netdevice(dev);

        dev_put(dev);
        return 0;
}

/* Destroy an unresolved cache entry, killing queued skbs
   and reporting error to netlink readers.
 */

static void ipmr_destroy_unres(struct mfc_cache *c)
{
        struct sk_buff *skb;
        struct nlmsgerr *e;

        atomic_dec(&cache_resolve_queue_len);

        while ((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) {
                if (ip_hdr(skb)->version == 0) {
                        struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
                        nlh->nlmsg_type = NLMSG_ERROR;
                        nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
                        skb_trim(skb, nlh->nlmsg_len);
                        e = NLMSG_DATA(nlh);
                        e->error = -ETIMEDOUT;
                        memset(&e->msg, 0, sizeof(e->msg));

                        rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid);
                } else
                        kfree_skb(skb);
        }

        kmem_cache_free(mrt_cachep, c);
}


/* Single timer process for all the unresolved queue. */

static void ipmr_expire_process(unsigned long dummy)
{
        unsigned long now;
        unsigned long expires;
        struct mfc_cache *c, **cp;

        if (!spin_trylock(&mfc_unres_lock)) {
                mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
                return;
        }

        if (atomic_read(&cache_resolve_queue_len) == 0)
                goto out;

        now = jiffies;
        expires = 10*HZ;
        cp = &mfc_unres_queue;

        while ((c=*cp) != NULL) {
                if (time_after(c->mfc_un.unres.expires, now)) {
                        unsigned long interval = c->mfc_un.unres.expires - now;
                        if (interval < expires)
                                expires = interval;
                        cp = &c->next;
                        continue;
                }

                *cp = c->next;

                ipmr_destroy_unres(c);
        }

        if (atomic_read(&cache_resolve_queue_len))
                mod_timer(&ipmr_expire_timer, jiffies + expires);

out:
        spin_unlock(&mfc_unres_lock);
}

/* Fill the oifs list. Called with mrt_lock held for writing. */

static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
{
        int vifi;

        cache->mfc_un.res.minvif = MAXVIFS;
        cache->mfc_un.res.maxvif = 0;
        memset(cache->mfc_un.res.ttls, 255, MAXVIFS);

        for (vifi=0; vifi<maxvif; vifi++) {
                if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) {
                        cache->mfc_un.res.ttls[vifi] = ttls[vifi];
                        if (cache->mfc_un.res.minvif > vifi)
                                cache->mfc_un.res.minvif = vifi;
                        if (cache->mfc_un.res.maxvif <= vifi)
                                cache->mfc_un.res.maxvif = vifi + 1;
                }
        }
}
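
/* A worked example (illustrative only): if vifs 0 and 2 exist and
   ttls = { 2, 255, 4, 0, ... }, the loop above records vifs 0 and 2
   as output interfaces with thresholds 2 and 4, giving minvif = 0
   and maxvif = 3. ip_mr_forward() below then copies a packet to
   vif N only when its TTL exceeds ttls[N].
 */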

static int vif_add(struct vifctl *vifc, int mrtsock)
{
        int vifi = vifc->vifc_vifi;
        struct vif_device *v = &vif_table[vifi];
        struct net_device *dev;
        struct in_device *in_dev;

        /* Is vif busy ? */
        if (VIF_EXISTS(vifi))
                return -EADDRINUSE;

        switch (vifc->vifc_flags) {
#ifdef CONFIG_IP_PIMSM
        case VIFF_REGISTER:
                /*
                 * Special Purpose VIF in PIM
                 * All the packets will be sent to the daemon
                 */
                if (reg_vif_num >= 0)
                        return -EADDRINUSE;
                dev = ipmr_reg_vif();
                if (!dev)
                        return -ENOBUFS;
                break;
#endif
        case VIFF_TUNNEL:
                dev = ipmr_new_tunnel(vifc);
                if (!dev)
                        return -ENOBUFS;
                break;
        case 0:
                dev = ip_dev_find(&init_net, vifc->vifc_lcl_addr.s_addr);
                if (!dev)
                        return -EADDRNOTAVAIL;
                dev_put(dev);
                break;
        default:
                return -EINVAL;
        }

        if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
                return -EADDRNOTAVAIL;
        IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
        dev_set_allmulti(dev, +1);
        ip_rt_multicast_event(in_dev);

        /*
         *      Fill in the VIF structures
         */
        v->rate_limit=vifc->vifc_rate_limit;
        v->local=vifc->vifc_lcl_addr.s_addr;
        v->remote=vifc->vifc_rmt_addr.s_addr;
        v->flags=vifc->vifc_flags;
        if (!mrtsock)
                v->flags |= VIFF_STATIC;
        v->threshold=vifc->vifc_threshold;
        v->bytes_in = 0;
        v->bytes_out = 0;
        v->pkt_in = 0;
        v->pkt_out = 0;
        v->link = dev->ifindex;
        if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
                v->link = dev->iflink;

        /* And finish update writing critical data */
        write_lock_bh(&mrt_lock);
        dev_hold(dev);
        v->dev=dev;
#ifdef CONFIG_IP_PIMSM
        if (v->flags&VIFF_REGISTER)
                reg_vif_num = vifi;
#endif
        if (vifi+1 > maxvif)
                maxvif = vifi+1;
        write_unlock_bh(&mrt_lock);
        return 0;
}

static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp)
{
        int line=MFC_HASH(mcastgrp,origin);
        struct mfc_cache *c;

        for (c=mfc_cache_array[line]; c; c = c->next) {
                if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
                        break;
        }
        return c;
}

/*
 *      Allocate a multicast cache entry
 */
static struct mfc_cache *ipmr_cache_alloc(void)
{
        struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
        if (c==NULL)
                return NULL;
        c->mfc_un.res.minvif = MAXVIFS;
        return c;
}

static struct mfc_cache *ipmr_cache_alloc_unres(void)
{
        struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
        if (c==NULL)
                return NULL;
        skb_queue_head_init(&c->mfc_un.unres.unresolved);
        c->mfc_un.unres.expires = jiffies + 10*HZ;
        return c;
}

/*
 *      A cache entry has gone into a resolved state from queued
 */

static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
{
        struct sk_buff *skb;
        struct nlmsgerr *e;

        /*
         *      Play the pending entries through our router
         */

        while ((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) {
                if (ip_hdr(skb)->version == 0) {
                        struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));

                        if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
                                nlh->nlmsg_len = (skb_tail_pointer(skb) -
                                                  (u8 *)nlh);
                        } else {
                                nlh->nlmsg_type = NLMSG_ERROR;
                                nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
                                skb_trim(skb, nlh->nlmsg_len);
                                e = NLMSG_DATA(nlh);
                                e->error = -EMSGSIZE;
                                memset(&e->msg, 0, sizeof(e->msg));
                        }

                        rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid);
                } else
                        ip_mr_forward(skb, c, 0);
        }
}

/*
 *      Bounce a cache query up to mrouted. We could use netlink for this but mrouted
 *      expects the following bizarre scheme.
 *
 *      Called under mrt_lock.
 */

static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
{
        struct sk_buff *skb;
        const int ihl = ip_hdrlen(pkt);
        struct igmphdr *igmp;
        struct igmpmsg *msg;
        int ret;

#ifdef CONFIG_IP_PIMSM
        if (assert == IGMPMSG_WHOLEPKT)
                skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
        else
#endif
                skb = alloc_skb(128, GFP_ATOMIC);

        if (!skb)
                return -ENOBUFS;

#ifdef CONFIG_IP_PIMSM
        if (assert == IGMPMSG_WHOLEPKT) {
                /* Ugly, but we have no choice with this interface.
                   Duplicate old header, fix ihl, length etc.
                   And all this only to mangle msg->im_msgtype and
                   to set msg->im_mbz to "mbz" :-)
                 */
                skb_push(skb, sizeof(struct iphdr));
                skb_reset_network_header(skb);
                skb_reset_transport_header(skb);
                msg = (struct igmpmsg *)skb_network_header(skb);
                memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
                msg->im_msgtype = IGMPMSG_WHOLEPKT;
                msg->im_mbz = 0;
                msg->im_vif = reg_vif_num;
                ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
                ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
                                             sizeof(struct iphdr));
        } else
#endif
        {

        /*
         *      Copy the IP header
         */

        skb->network_header = skb->tail;
        skb_put(skb, ihl);
        skb_copy_to_linear_data(skb, pkt->data, ihl);
        ip_hdr(skb)->protocol = 0;                      /* Flag to the kernel this is a route add */
        msg = (struct igmpmsg *)skb_network_header(skb);
        msg->im_vif = vifi;
        skb->dst = dst_clone(pkt->dst);

        /*
         *      Add our header
         */

        igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr));
        igmp->type      =
        msg->im_msgtype = assert;
        igmp->code      =       0;
        ip_hdr(skb)->tot_len = htons(skb->len);                 /* Fix the length */
        skb->transport_header = skb->network_header;
        }

        if (mroute_socket == NULL) {
                kfree_skb(skb);
                return -EINVAL;
        }

        /*
         *      Deliver to mrouted
         */
        if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) {
                if (net_ratelimit())
                        printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
                kfree_skb(skb);
        }

        return ret;
}
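
/* Rough sketch of the user-space side of this upcall (illustrative
   only, not taken from mrouted itself): the daemon reads the skb
   queued above from its raw IGMP socket and overlays struct igmpmsg
   from <linux/mroute.h> on it; the zeroed protocol field and
   im_mbz == 0 distinguish the upcall from a genuine IGMP packet.

        char buf[2048];
        struct igmpmsg *im = (struct igmpmsg *)buf;
        ssize_t n = recv(mrouted_sock, buf, sizeof(buf), 0);

        if (n >= (ssize_t)sizeof(*im) && im->im_mbz == 0 &&
            im->im_msgtype == IGMPMSG_NOCACHE) {
                ... resolve (im->im_src, im->im_dst), then install
                    the route with setsockopt(MRT_ADD_MFC) ...
        }
 */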

/*
 *      Queue a packet for resolution; the skb is appended to an
 *      unresolved cache entry under mfc_unres_lock.
 */

static int
ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
{
        int err;
        struct mfc_cache *c;
        const struct iphdr *iph = ip_hdr(skb);

        spin_lock_bh(&mfc_unres_lock);
        for (c=mfc_unres_queue; c; c=c->next) {
                if (c->mfc_mcastgrp == iph->daddr &&
                    c->mfc_origin == iph->saddr)
                        break;
        }

        if (c == NULL) {
                /*
                 *      Create a new entry if allowable
                 */

                if (atomic_read(&cache_resolve_queue_len)>=10 ||
                    (c=ipmr_cache_alloc_unres())==NULL) {
                        spin_unlock_bh(&mfc_unres_lock);

                        kfree_skb(skb);
                        return -ENOBUFS;
                }

                /*
                 *      Fill in the new cache entry
                 */
                c->mfc_parent   = -1;
                c->mfc_origin   = iph->saddr;
                c->mfc_mcastgrp = iph->daddr;

                /*
                 *      Reflect first query at mrouted.
                 */
                if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
                        /* If the report failed throw the cache entry
                           out - Brad Parker
                         */
                        spin_unlock_bh(&mfc_unres_lock);

                        kmem_cache_free(mrt_cachep, c);
                        kfree_skb(skb);
                        return err;
                }

                atomic_inc(&cache_resolve_queue_len);
                c->next = mfc_unres_queue;
                mfc_unres_queue = c;

                mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
        }

        /*
         *      See if we can append the packet
         */
        if (c->mfc_un.unres.unresolved.qlen>3) {
                kfree_skb(skb);
                err = -ENOBUFS;
        } else {
                skb_queue_tail(&c->mfc_un.unres.unresolved,skb);
                err = 0;
        }

        spin_unlock_bh(&mfc_unres_lock);
        return err;
}

/*
 *      MFC cache manipulation by user space mroute daemon
 */

static int ipmr_mfc_delete(struct mfcctl *mfc)
{
        int line;
        struct mfc_cache *c, **cp;

        line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);

        for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
                if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
                    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
                        write_lock_bh(&mrt_lock);
                        *cp = c->next;
                        write_unlock_bh(&mrt_lock);

                        kmem_cache_free(mrt_cachep, c);
                        return 0;
                }
        }
        return -ENOENT;
}

static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
{
        int line;
        struct mfc_cache *uc, *c, **cp;

        line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);

        for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
                if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
                    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
                        break;
        }

        if (c != NULL) {
                write_lock_bh(&mrt_lock);
                c->mfc_parent = mfc->mfcc_parent;
                ipmr_update_thresholds(c, mfc->mfcc_ttls);
                if (!mrtsock)
                        c->mfc_flags |= MFC_STATIC;
                write_unlock_bh(&mrt_lock);
                return 0;
        }

        if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
                return -EINVAL;

        c=ipmr_cache_alloc();
        if (c==NULL)
                return -ENOMEM;

        c->mfc_origin=mfc->mfcc_origin.s_addr;
        c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
        c->mfc_parent=mfc->mfcc_parent;
        ipmr_update_thresholds(c, mfc->mfcc_ttls);
        if (!mrtsock)
                c->mfc_flags |= MFC_STATIC;

        write_lock_bh(&mrt_lock);
        c->next = mfc_cache_array[line];
        mfc_cache_array[line] = c;
        write_unlock_bh(&mrt_lock);

        /*
         *      Check to see if we resolved a queued list. If so we
         *      need to send on the frames and tidy up.
         */
        spin_lock_bh(&mfc_unres_lock);
        for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
             cp = &uc->next) {
                if (uc->mfc_origin == c->mfc_origin &&
                    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
                        *cp = uc->next;
                        if (atomic_dec_and_test(&cache_resolve_queue_len))
                                del_timer(&ipmr_expire_timer);
                        break;
                }
        }
        spin_unlock_bh(&mfc_unres_lock);

        if (uc) {
                ipmr_cache_resolve(uc, c);
                kmem_cache_free(mrt_cachep, uc);
        }
        return 0;
}

/*
 *      Close the multicast socket, and clear the vif tables etc
 */

static void mroute_clean_tables(struct sock *sk)
{
        int i;

        /*
         *      Shut down all active vif entries
         */
        for (i=0; i<maxvif; i++) {
                if (!(vif_table[i].flags&VIFF_STATIC))
                        vif_delete(i);
        }

        /*
         *      Wipe the cache
         */
        for (i=0;i<MFC_LINES;i++) {
                struct mfc_cache *c, **cp;

                cp = &mfc_cache_array[i];
                while ((c = *cp) != NULL) {
                        if (c->mfc_flags&MFC_STATIC) {
                                cp = &c->next;
                                continue;
                        }
                        write_lock_bh(&mrt_lock);
                        *cp = c->next;
                        write_unlock_bh(&mrt_lock);

                        kmem_cache_free(mrt_cachep, c);
                }
        }

        if (atomic_read(&cache_resolve_queue_len) != 0) {
                struct mfc_cache *c;

                spin_lock_bh(&mfc_unres_lock);
                while (mfc_unres_queue != NULL) {
                        c = mfc_unres_queue;
                        mfc_unres_queue = c->next;
                        spin_unlock_bh(&mfc_unres_lock);

                        ipmr_destroy_unres(c);

                        spin_lock_bh(&mfc_unres_lock);
                }
                spin_unlock_bh(&mfc_unres_lock);
        }
}

static void mrtsock_destruct(struct sock *sk)
{
        rtnl_lock();
        if (sk == mroute_socket) {
                IPV4_DEVCONF_ALL(sock_net(sk), MC_FORWARDING)--;

                write_lock_bh(&mrt_lock);
                mroute_socket=NULL;
                write_unlock_bh(&mrt_lock);

                mroute_clean_tables(sk);
        }
        rtnl_unlock();
}

/*
 *      Socket options and virtual interface manipulation. The whole
 *      virtual interface system is a complete heap, but unfortunately
 *      that's how BSD mrouted happens to think. Maybe one day with a proper
 *      MOSPF/PIM router set up we can clean this up.
 */

int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int optlen)
{
        int ret;
        struct vifctl vif;
        struct mfcctl mfc;

        if (optname != MRT_INIT) {
                if (sk != mroute_socket && !capable(CAP_NET_ADMIN))
                        return -EACCES;
        }

        switch (optname) {
        case MRT_INIT:
                if (sk->sk_type != SOCK_RAW ||
                    inet_sk(sk)->num != IPPROTO_IGMP)
                        return -EOPNOTSUPP;
                if (optlen!=sizeof(int))
                        return -ENOPROTOOPT;

                rtnl_lock();
                if (mroute_socket) {
                        rtnl_unlock();
                        return -EADDRINUSE;
                }

                ret = ip_ra_control(sk, 1, mrtsock_destruct);
                if (ret == 0) {
                        write_lock_bh(&mrt_lock);
                        mroute_socket=sk;
                        write_unlock_bh(&mrt_lock);

                        IPV4_DEVCONF_ALL(sock_net(sk), MC_FORWARDING)++;
                }
                rtnl_unlock();
                return ret;
        case MRT_DONE:
                if (sk!=mroute_socket)
                        return -EACCES;
                return ip_ra_control(sk, 0, NULL);
        case MRT_ADD_VIF:
        case MRT_DEL_VIF:
                if (optlen!=sizeof(vif))
                        return -EINVAL;
                if (copy_from_user(&vif,optval,sizeof(vif)))
                        return -EFAULT;
                if (vif.vifc_vifi >= MAXVIFS)
                        return -ENFILE;
                rtnl_lock();
                if (optname==MRT_ADD_VIF) {
                        ret = vif_add(&vif, sk==mroute_socket);
                } else {
                        ret = vif_delete(vif.vifc_vifi);
                }
                rtnl_unlock();
                return ret;

                /*
                 *      Manipulate the forwarding caches. These live
                 *      in a sort of kernel/user symbiosis.
                 */
        case MRT_ADD_MFC:
        case MRT_DEL_MFC:
                if (optlen!=sizeof(mfc))
                        return -EINVAL;
                if (copy_from_user(&mfc,optval, sizeof(mfc)))
                        return -EFAULT;
                rtnl_lock();
                if (optname==MRT_DEL_MFC)
                        ret = ipmr_mfc_delete(&mfc);
                else
                        ret = ipmr_mfc_add(&mfc, sk==mroute_socket);
                rtnl_unlock();
                return ret;
                /*
                 *      Control PIM assert.
                 */
        case MRT_ASSERT:
        {
                int v;
                if (get_user(v,(int __user *)optval))
                        return -EFAULT;
                mroute_do_assert=(v)?1:0;
                return 0;
        }
#ifdef CONFIG_IP_PIMSM
        case MRT_PIM:
        {
                int v;

                if (get_user(v,(int __user *)optval))
                        return -EFAULT;
                v = (v) ? 1 : 0;

                rtnl_lock();
                ret = 0;
                if (v != mroute_do_pim) {
                        mroute_do_pim = v;
                        mroute_do_assert = v;
#ifdef CONFIG_IP_PIMSM_V2
                        if (mroute_do_pim)
                                ret = inet_add_protocol(&pim_protocol,
                                                        IPPROTO_PIM);
                        else
                                ret = inet_del_protocol(&pim_protocol,
                                                        IPPROTO_PIM);
                        if (ret < 0)
                                ret = -EAGAIN;
#endif
                }
                rtnl_unlock();
                return ret;
        }
#endif
        /*
         *      Spurious command, or MRT_VERSION which you cannot
         *      set.
         */
        default:
                return -ENOPROTOOPT;
        }
}
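
/* A minimal user-space sketch of driving the interface above (assumed
   usage mirroring the option handling; error checks omitted, and
   local_if_addr/source/group are placeholders):

        int s = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
        int one = 1;
        struct vifctl vc;
        struct mfcctl mc;

        setsockopt(s, IPPROTO_IP, MRT_INIT, &one, sizeof(one));

        memset(&vc, 0, sizeof(vc));
        vc.vifc_vifi = 0;
        vc.vifc_threshold = 1;
        vc.vifc_lcl_addr.s_addr = local_if_addr;
        setsockopt(s, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc));

        memset(&mc, 0, sizeof(mc));
        mc.mfcc_origin.s_addr = source;
        mc.mfcc_mcastgrp.s_addr = group;
        mc.mfcc_parent = 0;
        mc.mfcc_ttls[1] = 1;
        setsockopt(s, IPPROTO_IP, MRT_ADD_MFC, &mc, sizeof(mc));

   Each nonzero mfcc_ttls[n] below 255 marks vif n as an outgoing
   interface with that TTL threshold, per ipmr_update_thresholds().
   Only a raw IGMP socket may issue MRT_INIT, and a single such socket
   owns the tables at a time, exactly as enforced above.
 */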

/*
 *      Getsockopt support for the multicast routing system.
 */

int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __user *optlen)
{
        int olr;
        int val;

        if (optname!=MRT_VERSION &&
#ifdef CONFIG_IP_PIMSM
           optname!=MRT_PIM &&
#endif
           optname!=MRT_ASSERT)
                return -ENOPROTOOPT;

        if (get_user(olr, optlen))
                return -EFAULT;

        olr = min_t(unsigned int, olr, sizeof(int));
        if (olr < 0)
                return -EINVAL;

        if (put_user(olr,optlen))
                return -EFAULT;
        if (optname==MRT_VERSION)
                val=0x0305;
#ifdef CONFIG_IP_PIMSM
        else if (optname==MRT_PIM)
                val=mroute_do_pim;
#endif
        else
                val=mroute_do_assert;
        if (copy_to_user(optval,&val,olr))
                return -EFAULT;
        return 0;
}

/*
 *      The IP multicast ioctl support routines.
 */

int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
{
        struct sioc_sg_req sr;
        struct sioc_vif_req vr;
        struct vif_device *vif;
        struct mfc_cache *c;

        switch (cmd) {
        case SIOCGETVIFCNT:
                if (copy_from_user(&vr,arg,sizeof(vr)))
                        return -EFAULT;
                if (vr.vifi>=maxvif)
                        return -EINVAL;
                read_lock(&mrt_lock);
                vif=&vif_table[vr.vifi];
                if (VIF_EXISTS(vr.vifi))        {
                        vr.icount=vif->pkt_in;
                        vr.ocount=vif->pkt_out;
                        vr.ibytes=vif->bytes_in;
                        vr.obytes=vif->bytes_out;
                        read_unlock(&mrt_lock);

                        if (copy_to_user(arg,&vr,sizeof(vr)))
                                return -EFAULT;
                        return 0;
                }
                read_unlock(&mrt_lock);
                return -EADDRNOTAVAIL;
        case SIOCGETSGCNT:
                if (copy_from_user(&sr,arg,sizeof(sr)))
                        return -EFAULT;

                read_lock(&mrt_lock);
                c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
                if (c) {
                        sr.pktcnt = c->mfc_un.res.pkt;
                        sr.bytecnt = c->mfc_un.res.bytes;
                        sr.wrong_if = c->mfc_un.res.wrong_if;
                        read_unlock(&mrt_lock);

                        if (copy_to_user(arg,&sr,sizeof(sr)))
                                return -EFAULT;
                        return 0;
                }
                read_unlock(&mrt_lock);
                return -EADDRNOTAVAIL;
        default:
                return -ENOIOCTLCMD;
        }
}
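
/* Matching user-space queries for the two ioctls above (a sketch; the
   request structs come from <linux/mroute.h>, and source/group are
   placeholders):

        struct sioc_vif_req vr;
        struct sioc_sg_req sr;

        vr.vifi = 0;
        ioctl(s, SIOCGETVIFCNT, &vr);
           ... vr.icount / vr.ocount / vr.ibytes / vr.obytes now hold
               the per-vif counters ...

        sr.src.s_addr = source;
        sr.grp.s_addr = group;
        ioctl(s, SIOCGETSGCNT, &sr);
           ... sr.pktcnt / sr.bytecnt / sr.wrong_if per (S,G) entry ...
 */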


static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
{
        struct net_device *dev = ptr;
        struct vif_device *v;
        int ct;

        if (dev_net(dev) != &init_net)
                return NOTIFY_DONE;

        if (event != NETDEV_UNREGISTER)
                return NOTIFY_DONE;
        v=&vif_table[0];
        for (ct=0;ct<maxvif;ct++,v++) {
                if (v->dev==dev)
                        vif_delete(ct);
        }
        return NOTIFY_DONE;
}


static struct notifier_block ip_mr_notifier={
        .notifier_call = ipmr_device_event,
};

/*
 *      Encapsulate a packet by attaching a valid IPIP header to it.
 *      This avoids tunnel drivers and other mess and gives us the speed so
 *      important for multicast video.
 */

static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
        struct iphdr *iph;
        struct iphdr *old_iph = ip_hdr(skb);

        skb_push(skb, sizeof(struct iphdr));
        skb->transport_header = skb->network_header;
        skb_reset_network_header(skb);
        iph = ip_hdr(skb);

        iph->version    =       4;
        iph->tos        =       old_iph->tos;
        iph->ttl        =       old_iph->ttl;
        iph->frag_off   =       0;
        iph->daddr      =       daddr;
        iph->saddr      =       saddr;
        iph->protocol   =       IPPROTO_IPIP;
        iph->ihl        =       5;
        iph->tot_len    =       htons(skb->len);
        ip_select_ident(iph, skb->dst, NULL);
        ip_send_check(iph);

        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
        nf_reset(skb);
}
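
/* Resulting datagram layout after ip_encap() (sketch):

        +--------------------+--------------------+---------+
        | new outer iphdr    | original iphdr     | payload |
        | proto IPPROTO_IPIP | untouched          |         |
        +--------------------+--------------------+---------+

   i.e. a plain IPIP (RFC 2003 style) packet, so the receiving end can
   decapsulate it with an ordinary tunl device even though no tunnel
   driver ran on the sending side.
 */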

static inline int ipmr_forward_finish(struct sk_buff *skb)
{
        struct ip_options * opt = &(IPCB(skb)->opt);

        IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);

        if (unlikely(opt->optlen))
                ip_forward_options(skb);

        return dst_output(skb);
}

/*
 *      Processing handlers for ipmr_forward
 */

static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
{
        const struct iphdr *iph = ip_hdr(skb);
        struct vif_device *vif = &vif_table[vifi];
        struct net_device *dev;
        struct rtable *rt;
        int    encap = 0;

        if (vif->dev == NULL)
                goto out_free;

#ifdef CONFIG_IP_PIMSM
        if (vif->flags & VIFF_REGISTER) {
                vif->pkt_out++;
                vif->bytes_out+=skb->len;
                vif->dev->stats.tx_bytes += skb->len;
                vif->dev->stats.tx_packets++;
                ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
                kfree_skb(skb);
                return;
        }
#endif

        if (vif->flags&VIFF_TUNNEL) {
                struct flowi fl = { .oif = vif->link,
                                    .nl_u = { .ip4_u =
                                              { .daddr = vif->remote,
                                                .saddr = vif->local,
                                                .tos = RT_TOS(iph->tos) } },
                                    .proto = IPPROTO_IPIP };
                if (ip_route_output_key(&init_net, &rt, &fl))
                        goto out_free;
                encap = sizeof(struct iphdr);
        } else {
                struct flowi fl = { .oif = vif->link,
                                    .nl_u = { .ip4_u =
                                              { .daddr = iph->daddr,
                                                .tos = RT_TOS(iph->tos) } },
                                    .proto = IPPROTO_IPIP };
                if (ip_route_output_key(&init_net, &rt, &fl))
                        goto out_free;
        }

        dev = rt->u.dst.dev;

        if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
                /* Do not fragment multicasts. Alas, IPv4 does not
                   allow us to send ICMP here, so such packets will
                   silently disappear into a black hole.
                 */

                IP_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
                ip_rt_put(rt);
                goto out_free;
        }

        encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;

        if (skb_cow(skb, encap)) {
                ip_rt_put(rt);
                goto out_free;
        }

        vif->pkt_out++;
        vif->bytes_out+=skb->len;

        dst_release(skb->dst);
        skb->dst = &rt->u.dst;
        ip_decrease_ttl(ip_hdr(skb));

        /* FIXME: forward and output firewalls used to be called here.
         * What do we do with netfilter? -- RR */
        if (vif->flags & VIFF_TUNNEL) {
                ip_encap(skb, vif->local, vif->remote);
                /* FIXME: extra output firewall step used to be here. --RR */
                vif->dev->stats.tx_packets++;
                vif->dev->stats.tx_bytes += skb->len;
        }

        IPCB(skb)->flags |= IPSKB_FORWARDED;

        /*
         * RFC 1584 teaches that a DVMRP/PIM router must deliver packets
         * locally not only before forwarding, but also after forwarding
         * on all output interfaces. Clearly, if the mrouter runs a
         * multicasting program, that program should receive packets
         * regardless of which interface it joined on. If we did not do
         * this, the program would have to join on all interfaces. On
         * the other hand, a multihomed host (or router, but not an
         * mrouter) cannot join on more than one interface - that would
         * result in receiving multiple copies of each packet.
         */
        NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, dev,
                ipmr_forward_finish);
        return;

out_free:
        kfree_skb(skb);
        return;
}

static int ipmr_find_vif(struct net_device *dev)
{
        int ct;
        for (ct=maxvif-1; ct>=0; ct--) {
                if (vif_table[ct].dev == dev)
                        break;
        }
        return ct;
}

/* "local" means that we should preserve one skb (for local delivery) */

static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
{
        int psend = -1;
        int vif, ct;

        vif = cache->mfc_parent;
        cache->mfc_un.res.pkt++;
        cache->mfc_un.res.bytes += skb->len;

        /*
         * Wrong interface: drop packet and (maybe) send PIM assert.
         */
        if (vif_table[vif].dev != skb->dev) {
                int true_vifi;

                if (skb->rtable->fl.iif == 0) {
                        /* It is our own packet, looped back.
                           A very complicated situation...

                           The best workaround, until the routing daemons
                           are fixed, is not to redistribute a packet if it
                           was sent through the wrong interface. This means
                           that multicast applications WILL NOT work for
                           (S,G) entries whose default multicast route
                           points to the wrong oif. In any case, it is not
                           a good idea to run multicasting applications on
                           a router.
                         */
                        goto dont_forward;
                }

                cache->mfc_un.res.wrong_if++;
                true_vifi = ipmr_find_vif(skb->dev);

                if (true_vifi >= 0 && mroute_do_assert &&
                    /* pimsm uses asserts when switching from RPT to SPT,
                       so we cannot check that the packet arrived on an
                       oif. That is bad, but the alternative would be to
                       move a pretty large chunk of pimd into the kernel.
                       Ough... --ANK
                     */
                    (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
                    time_after(jiffies,
                               cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
                        cache->mfc_un.res.last_assert = jiffies;
                        ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
                }
                goto dont_forward;
        }

        vif_table[vif].pkt_in++;
        vif_table[vif].bytes_in+=skb->len;

        /*
         *      Forward the frame
         */
        for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
                if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
                        if (psend != -1) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        ipmr_queue_xmit(skb2, cache, psend);
                        }
                        psend=ct;
                }
        }
        if (psend != -1) {
                if (local) {
                        struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                        if (skb2)
                                ipmr_queue_xmit(skb2, cache, psend);
                } else {
                        ipmr_queue_xmit(skb, cache, psend);
                        return 0;
                }
        }

dont_forward:
        if (!local)
                kfree_skb(skb);
        return 0;
}


/*
 *      Multicast packets for forwarding arrive here
 */

int ip_mr_input(struct sk_buff *skb)
{
        struct mfc_cache *cache;
        int local = skb->rtable->rt_flags&RTCF_LOCAL;

        /* A packet that is looped back after forwarding should not be
           forwarded a second time, but it can still be delivered locally.
         */
        if (IPCB(skb)->flags&IPSKB_FORWARDED)
                goto dont_forward;

        if (!local) {
                    if (IPCB(skb)->opt.router_alert) {
                            if (ip_call_ra_chain(skb))
                                    return 0;
                    } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP){
                            /* IGMPv1 (and broken IGMPv2 implementations such
                               as Cisco IOS <= 11.2(8)) do not put the router
                               alert option into IGMP packets destined to
                               routable groups. That is very bad, because it
                               means we can forward NO IGMP messages.
                             */
                            read_lock(&mrt_lock);
                            if (mroute_socket) {
                                    nf_reset(skb);
                                    raw_rcv(mroute_socket, skb);
                                    read_unlock(&mrt_lock);
                                    return 0;
                            }
                            read_unlock(&mrt_lock);
                    }
        }

        read_lock(&mrt_lock);
        cache = ipmr_cache_find(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);

        /*
         *      No usable cache entry
         */
        if (cache==NULL) {
                int vif;

                if (local) {
                        struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                        ip_local_deliver(skb);
                        if (skb2 == NULL) {
                                read_unlock(&mrt_lock);
                                return -ENOBUFS;
                        }
                        skb = skb2;
                }

                vif = ipmr_find_vif(skb->dev);
                if (vif >= 0) {
                        int err = ipmr_cache_unresolved(vif, skb);
                        read_unlock(&mrt_lock);

                        return err;
                }
                read_unlock(&mrt_lock);
                kfree_skb(skb);
                return -ENODEV;
        }

        ip_mr_forward(skb, cache, local);

        read_unlock(&mrt_lock);

        if (local)
                return ip_local_deliver(skb);

        return 0;

dont_forward:
        if (local)
                return ip_local_deliver(skb);
        kfree_skb(skb);
        return 0;
}

#ifdef CONFIG_IP_PIMSM_V1
/*
 * Handle IGMP messages of PIMv1
 */

int pim_rcv_v1(struct sk_buff * skb)
{
        struct igmphdr *pim;
        struct iphdr   *encap;
        struct net_device  *reg_dev = NULL;

        if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
                goto drop;

        pim = igmp_hdr(skb);

        if (!mroute_do_pim ||
            skb->len < sizeof(*pim) + sizeof(*encap) ||
            pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
                goto drop;

        encap = (struct iphdr *)(skb_transport_header(skb) +
                                 sizeof(struct igmphdr));
        /*
           Check that:
           a. the packet is really destined to a multicast group
           b. the packet is not a NULL-REGISTER
           c. the packet is not truncated
         */
        if (!ipv4_is_multicast(encap->daddr) ||
            encap->tot_len == 0 ||
            ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
                goto drop;

        read_lock(&mrt_lock);
        if (reg_vif_num >= 0)
                reg_dev = vif_table[reg_vif_num].dev;
        if (reg_dev)
                dev_hold(reg_dev);
        read_unlock(&mrt_lock);

        if (reg_dev == NULL)
                goto drop;

        skb->mac_header = skb->network_header;
        skb_pull(skb, (u8*)encap - skb->data);
        skb_reset_network_header(skb);
        skb->dev = reg_dev;
        skb->protocol = htons(ETH_P_IP);
        skb->ip_summed = 0;
        skb->pkt_type = PACKET_HOST;
        dst_release(skb->dst);
        skb->dst = NULL;
        reg_dev->stats.rx_bytes += skb->len;
        reg_dev->stats.rx_packets++;
        nf_reset(skb);
        netif_rx(skb);
        dev_put(reg_dev);
        return 0;
 drop:
        kfree_skb(skb);
        return 0;
}
#endif

#ifdef CONFIG_IP_PIMSM_V2
static int pim_rcv(struct sk_buff * skb)
{
        struct pimreghdr *pim;
        struct iphdr   *encap;
        struct net_device  *reg_dev = NULL;

        if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
                goto drop;

        pim = (struct pimreghdr *)skb_transport_header(skb);
        if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
            (pim->flags&PIM_NULL_REGISTER) ||
            (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
             csum_fold(skb_checksum(skb, 0, skb->len, 0))))
                goto drop;

        /* check if the inner packet is destined to mcast group */
        encap = (struct iphdr *)(skb_transport_header(skb) +
                                 sizeof(struct pimreghdr));
        if (!ipv4_is_multicast(encap->daddr) ||
            encap->tot_len == 0 ||
            ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
                goto drop;

        read_lock(&mrt_lock);
        if (reg_vif_num >= 0)
                reg_dev = vif_table[reg_vif_num].dev;
        if (reg_dev)
                dev_hold(reg_dev);
        read_unlock(&mrt_lock);

        if (reg_dev == NULL)
                goto drop;

        skb->mac_header = skb->network_header;
        skb_pull(skb, (u8*)encap - skb->data);
        skb_reset_network_header(skb);
        skb->dev = reg_dev;
        skb->protocol = htons(ETH_P_IP);
        skb->ip_summed = 0;
        skb->pkt_type = PACKET_HOST;
        dst_release(skb->dst);
        reg_dev->stats.rx_bytes += skb->len;
        reg_dev->stats.rx_packets++;
        skb->dst = NULL;
        nf_reset(skb);
        netif_rx(skb);
        dev_put(reg_dev);
        return 0;
 drop:
        kfree_skb(skb);
        return 0;
}
#endif

static int
ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
{
        int ct;
        struct rtnexthop *nhp;
        struct net_device *dev = vif_table[c->mfc_parent].dev;
        u8 *b = skb_tail_pointer(skb);
        struct rtattr *mp_head;

        if (dev)
                RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);

        mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0));

        for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
                if (c->mfc_un.res.ttls[ct] < 255) {
                        if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
                                goto rtattr_failure;
                        nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
                        nhp->rtnh_flags = 0;
                        nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
                        nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
                        nhp->rtnh_len = sizeof(*nhp);
                }
        }
        mp_head->rta_type = RTA_MULTIPATH;
        mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
        rtm->rtm_type = RTN_MULTICAST;
        return 1;

rtattr_failure:
        nlmsg_trim(skb, b);
        return -EMSGSIZE;
}
1585
1586 int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1587 {
1588         int err;
1589         struct mfc_cache *cache;
1590         struct rtable *rt = skb->rtable;
1591
1592         read_lock(&mrt_lock);
1593         cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);
1594
1595         if (cache==NULL) {
1596                 struct sk_buff *skb2;
1597                 struct iphdr *iph;
1598                 struct net_device *dev;
1599                 int vif;
1600
1601                 if (nowait) {
1602                         read_unlock(&mrt_lock);
1603                         return -EAGAIN;
1604                 }
1605
1606                 dev = skb->dev;
1607                 if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
1608                         read_unlock(&mrt_lock);
1609                         return -ENODEV;
1610                 }
1611                 skb2 = skb_clone(skb, GFP_ATOMIC);
1612                 if (!skb2) {
1613                         read_unlock(&mrt_lock);
1614                         return -ENOMEM;
1615                 }
1616
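                /*
                 * Prepend a skeleton IPv4 header so the clone can be
                 * queued as an unresolved entry. version is set to 0 to
                 * mark the skb as a route-resolution request rather than
                 * a real packet: ipmr_cache_resolve() answers such skbs
                 * over netlink instead of forwarding them.
                 */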
1617                 skb_push(skb2, sizeof(struct iphdr));
1618                 skb_reset_network_header(skb2);
1619                 iph = ip_hdr(skb2);
1620                 iph->ihl = sizeof(struct iphdr) >> 2;
1621                 iph->saddr = rt->rt_src;
1622                 iph->daddr = rt->rt_dst;
1623                 iph->version = 0;
1624                 err = ipmr_cache_unresolved(vif, skb2);
1625                 read_unlock(&mrt_lock);
1626                 return err;
1627         }
1628
1629         if (!nowait && (rtm->rtm_flags & RTM_F_NOTIFY))
1630                 cache->mfc_flags |= MFC_NOTIFY;
1631         err = ipmr_fill_mroute(skb, cache, rtm);
1632         read_unlock(&mrt_lock);
1633         return err;
1634 }
1635
1636 #ifdef CONFIG_PROC_FS
1637 /*
1638  *      The /proc interfaces to multicast routing: /proc/net/ip_mr_vif and /proc/net/ip_mr_cache
1639  */
1640 struct ipmr_vif_iter {
1641         int ct;
1642 };
1643
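/*
 * The vif table is walked with the usual seq_file protocol: start()
 * takes mrt_lock for reading and returns SEQ_START_TOKEN for the
 * header row, next() skips empty slots, and stop() drops the lock.
 */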
1644 static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
1645                                            loff_t pos)
1646 {
1647         for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) {
1648                 if (!VIF_EXISTS(iter->ct))
1649                         continue;
1650                 if (pos-- == 0)
1651                         return &vif_table[iter->ct];
1652         }
1653         return NULL;
1654 }
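/* Repositioning is a linear rescan, which is fine: the table holds at
 * most MAXVIFS (32) entries, so each seek is cheap. */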
1655
1656 static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
1657         __acquires(mrt_lock)
1658 {
1659         read_lock(&mrt_lock);
1660         return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1)
1661                 : SEQ_START_TOKEN;
1662 }
1663
1664 static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1665 {
1666         struct ipmr_vif_iter *iter = seq->private;
1667
1668         ++*pos;
1669         if (v == SEQ_START_TOKEN)
1670                 return ipmr_vif_seq_idx(iter, 0);
1671
1672         while (++iter->ct < maxvif) {
1673                 if (!VIF_EXISTS(iter->ct))
1674                         continue;
1675                 return &vif_table[iter->ct];
1676         }
1677         return NULL;
1678 }
1679
1680 static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
1681         __releases(mrt_lock)
1682 {
1683         read_unlock(&mrt_lock);
1684 }
1685
1686 static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
1687 {
1688         if (v == SEQ_START_TOKEN) {
1689                 seq_puts(seq,
1690                          "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
1691         } else {
1692                 const struct vif_device *vif = v;
1693                 const char *name = vif->dev ? vif->dev->name : "none";
1694
1695                 seq_printf(seq,
1696                            "%2zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
1697                            vif - vif_table,
1698                            name, vif->bytes_in, vif->pkt_in,
1699                            vif->bytes_out, vif->pkt_out,
1700                            vif->flags, vif->local, vif->remote);
1701         }
1702         return 0;
1703 }
1704
1705 static const struct seq_operations ipmr_vif_seq_ops = {
1706         .start = ipmr_vif_seq_start,
1707         .next  = ipmr_vif_seq_next,
1708         .stop  = ipmr_vif_seq_stop,
1709         .show  = ipmr_vif_seq_show,
1710 };
1711
1712 static int ipmr_vif_open(struct inode *inode, struct file *file)
1713 {
1714         return seq_open_private(file, &ipmr_vif_seq_ops,
1715                         sizeof(struct ipmr_vif_iter));
1716 }
1717
1718 static const struct file_operations ipmr_vif_fops = {
1719         .owner   = THIS_MODULE,
1720         .open    = ipmr_vif_open,
1721         .read    = seq_read,
1722         .llseek  = seq_lseek,
1723         .release = seq_release_private,
1724 };
1725
1726 struct ipmr_mfc_iter {
1727         struct mfc_cache **cache;
1728         int ct;
1729 };
1730
1731
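/*
 * The MFC dump spans two structures guarded by different locks: the
 * resolved hash table (mrt_lock) and the unresolved queue
 * (mfc_unres_lock). it->cache records which of the two the iterator is
 * currently inside, and the locks are handed over in next()/stop(), so
 * exactly one of them is held at any point during the walk.
 */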
1732 static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
1733 {
1734         struct mfc_cache *mfc;
1735
1736         it->cache = mfc_cache_array;
1737         read_lock(&mrt_lock);
1738         for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
1739                 for (mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next)
1740                         if (pos-- == 0)
1741                                 return mfc;
1742         read_unlock(&mrt_lock);
1743
1744         it->cache = &mfc_unres_queue;
1745         spin_lock_bh(&mfc_unres_lock);
1746         for (mfc = mfc_unres_queue; mfc; mfc = mfc->next)
1747                 if (pos-- == 0)
1748                         return mfc;
1749         spin_unlock_bh(&mfc_unres_lock);
1750
1751         it->cache = NULL;
1752         return NULL;
1753 }
1754
1755
1756 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
1757 {
1758         struct ipmr_mfc_iter *it = seq->private;
1759         it->cache = NULL;
1760         it->ct = 0;
1761         return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1)
1762                 : SEQ_START_TOKEN;
1763 }
1764
1765 static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1766 {
1767         struct mfc_cache *mfc = v;
1768         struct ipmr_mfc_iter *it = seq->private;
1769
1770         ++*pos;
1771
1772         if (v == SEQ_START_TOKEN)
1773                 return ipmr_mfc_seq_idx(seq->private, 0);
1774
1775         if (mfc->next)
1776                 return mfc->next;
1777
1778         if (it->cache == &mfc_unres_queue)
1779                 goto end_of_list;
1780
1781         BUG_ON(it->cache != mfc_cache_array);
1782
1783         while (++it->ct < MFC_LINES) {
1784                 mfc = mfc_cache_array[it->ct];
1785                 if (mfc)
1786                         return mfc;
1787         }
1788
1789         /* exhausted cache_array, show unresolved */
1790         read_unlock(&mrt_lock);
1791         it->cache = &mfc_unres_queue;
1792         it->ct = 0;
1793
1794         spin_lock_bh(&mfc_unres_lock);
1795         mfc = mfc_unres_queue;
1796         if (mfc)
1797                 return mfc;
1798
1799  end_of_list:
1800         spin_unlock_bh(&mfc_unres_lock);
1801         it->cache = NULL;
1802
1803         return NULL;
1804 }
1805
1806 static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
1807 {
1808         struct ipmr_mfc_iter *it = seq->private;
1809
1810         if (it->cache == &mfc_unres_queue)
1811                 spin_unlock_bh(&mfc_unres_lock);
1812         else if (it->cache == mfc_cache_array)
1813                 read_unlock(&mrt_lock);
1814 }
1815
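/*
 * One row per cache entry; resolved entries append an "oif:ttl" column
 * for every forwarding target. A line might look like this (values
 * purely illustrative):
 *
 *   E10000FA C0A80101 1        12     1024        0  2:1   3:1
 */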
1816 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
1817 {
1818         int n;
1819
1820         if (v == SEQ_START_TOKEN) {
1821                 seq_puts(seq,
1822                  "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
1823         } else {
1824                 const struct mfc_cache *mfc = v;
1825                 const struct ipmr_mfc_iter *it = seq->private;
1826
1827                 seq_printf(seq, "%08lX %08lX %-3d %8ld %8ld %8ld",
1828                            (unsigned long) mfc->mfc_mcastgrp,
1829                            (unsigned long) mfc->mfc_origin,
1830                            mfc->mfc_parent,
1831                            mfc->mfc_un.res.pkt,
1832                            mfc->mfc_un.res.bytes,
1833                            mfc->mfc_un.res.wrong_if);
1834
1835                 if (it->cache != &mfc_unres_queue) {
1836                         for (n = mfc->mfc_un.res.minvif;
1837                              n < mfc->mfc_un.res.maxvif; n++) {
1838                                 if (VIF_EXISTS(n) &&
1839                                     mfc->mfc_un.res.ttls[n] < 255)
1840                                         seq_printf(seq,
1841                                                    " %2d:%-3d",
1842                                                    n, mfc->mfc_un.res.ttls[n]);
1843                         }
1844                 }
1845                 seq_putc(seq, '\n');
1846         }
1847         return 0;
1848 }
1849
1850 static const struct seq_operations ipmr_mfc_seq_ops = {
1851         .start = ipmr_mfc_seq_start,
1852         .next  = ipmr_mfc_seq_next,
1853         .stop  = ipmr_mfc_seq_stop,
1854         .show  = ipmr_mfc_seq_show,
1855 };
1856
1857 static int ipmr_mfc_open(struct inode *inode, struct file *file)
1858 {
1859         return seq_open_private(file, &ipmr_mfc_seq_ops,
1860                         sizeof(struct ipmr_mfc_iter));
1861 }
1862
1863 static const struct file_operations ipmr_mfc_fops = {
1864         .owner   = THIS_MODULE,
1865         .open    = ipmr_mfc_open,
1866         .read    = seq_read,
1867         .llseek  = seq_lseek,
1868         .release = seq_release_private,
1869 };
1870 #endif
1871
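/*
 * PIMv2 runs directly over IP as protocol 103 (IPPROTO_PIM). This
 * handler is attached with inet_add_protocol() elsewhere in this file,
 * once the routing daemon turns PIM on through the MRT_PIM socket
 * option.
 */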
1872 #ifdef CONFIG_IP_PIMSM_V2
1873 static struct net_protocol pim_protocol = {
1874         .handler        =       pim_rcv,
1875 };
1876 #endif
1877
1878
1879 /*
1880  *      Setup for IP multicast routing
1881  */
1882
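/*
 * SLAB_PANIC makes a failed cache creation fatal at boot, so the return
 * value needs no checking here; the netdevice notifier keeps the vif
 * table consistent when interfaces are unregistered.
 */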
1883 void __init ip_mr_init(void)
1884 {
1885         mrt_cachep = kmem_cache_create("ip_mrt_cache",
1886                                        sizeof(struct mfc_cache),
1887                                        0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
1888                                        NULL);
1889         setup_timer(&ipmr_expire_timer, ipmr_expire_process, 0);
1890         register_netdevice_notifier(&ip_mr_notifier);
1891 #ifdef CONFIG_PROC_FS
1892         proc_net_fops_create(&init_net, "ip_mr_vif", 0, &ipmr_vif_fops);
1893         proc_net_fops_create(&init_net, "ip_mr_cache", 0, &ipmr_mfc_fops);
1894 #endif
1895 }