2 * IP multicast routing support for mrouted 3.6/3.8
4 * (c) 1995 Alan Cox, <alan@redhat.com>
5 * Linux Consultancy and Custom Driver Development
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
12 * Version: $Id: ipmr.c,v 1.65 2001/10/31 21:55:54 davem Exp $
15 * Michael Chastain : Incorrect size of copying.
16 * Alan Cox : Added the cache manager code
17 * Alan Cox : Fixed the clone/copy bug and device race.
18 * Mike McLagan : Routing by source
19 * Malcolm Beattie : Buffer handling fixes.
20 * Alexey Kuznetsov : Double buffer free and other fixes.
21 * SVR Anand : Fixed several multicast bugs and problems.
22 * Alexey Kuznetsov : Status, optimisations and more.
23 * Brad Parker : Better behaviour on mrouted upcall
25 * Carlos Picoto : PIMv1 Support
26 * Pavlin Ivanov Radoslavov: PIMv2 Registers must checksum only PIM header
27 * Relax this requirement to work with older peers.
31 #include <linux/config.h>
32 #include <asm/system.h>
33 #include <asm/uaccess.h>
34 #include <linux/types.h>
35 #include <linux/sched.h>
36 #include <linux/errno.h>
37 #include <linux/timer.h>
39 #include <linux/kernel.h>
40 #include <linux/fcntl.h>
41 #include <linux/stat.h>
42 #include <linux/socket.h>
44 #include <linux/inet.h>
45 #include <linux/netdevice.h>
46 #include <linux/inetdevice.h>
47 #include <linux/igmp.h>
48 #include <linux/proc_fs.h>
49 #include <linux/seq_file.h>
50 #include <linux/mroute.h>
51 #include <linux/init.h>
53 #include <net/protocol.h>
54 #include <linux/skbuff.h>
59 #include <linux/notifier.h>
60 #include <linux/if_arp.h>
61 #include <linux/netfilter_ipv4.h>
63 #include <net/checksum.h>
65 #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
66 #define CONFIG_IP_PIMSM 1
69 static struct sock *mroute_socket;
72 /* Big lock, protecting vif table, mrt cache and mroute socket state.
73 Note that the changes themselves are serialized via rtnl_lock.
76 static DEFINE_RWLOCK(mrt_lock);
79 * Multicast router control variables
82 static struct vif_device vif_table[MAXVIFS]; /* Devices */
85 #define VIF_EXISTS(idx) (vif_table[idx].dev != NULL)
87 static int mroute_do_assert; /* Set in PIM assert */
88 static int mroute_do_pim;
90 static struct mfc_cache *mfc_cache_array[MFC_LINES]; /* Forwarding cache */
92 static struct mfc_cache *mfc_unres_queue; /* Queue of unresolved entries */
93 static atomic_t cache_resolve_queue_len; /* Size of unresolved */
95 /* Special spinlock for queue of unresolved entries */
96 static DEFINE_SPINLOCK(mfc_unres_lock);
98 /* We return to Alan's original scheme. The hash table of resolved
99 entries is changed only in process context and is protected
100 by the weak lock mrt_lock. The queue of unresolved entries is protected
101 by the strong spinlock mfc_unres_lock.
103 In this case the data path is entirely free of exclusive locks.
106 static kmem_cache_t *mrt_cachep;
108 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
109 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
110 static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);
112 #ifdef CONFIG_IP_PIMSM_V2
113 static struct net_protocol pim_protocol;
116 static struct timer_list ipmr_expire_timer;
118 /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
121 struct net_device *ipmr_new_tunnel(struct vifctl *v)
123 struct net_device *dev;
125 dev = __dev_get_by_name("tunl0");
131 struct ip_tunnel_parm p;
132 struct in_device *in_dev;
134 memset(&p, 0, sizeof(p));
135 p.iph.daddr = v->vifc_rmt_addr.s_addr;
136 p.iph.saddr = v->vifc_lcl_addr.s_addr;
139 p.iph.protocol = IPPROTO_IPIP;
140 sprintf(p.name, "dvmrp%d", v->vifc_vifi);
141 ifr.ifr_ifru.ifru_data = (void*)&p;
143 oldfs = get_fs(); set_fs(KERNEL_DS);
144 err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL);
149 if (err == 0 && (dev = __dev_get_by_name(p.name)) != NULL) {
150 dev->flags |= IFF_MULTICAST;
152 in_dev = __in_dev_get(dev);
153 if (in_dev == NULL && (in_dev = inetdev_init(dev)) == NULL)
155 in_dev->cnf.rp_filter = 0;
164 /* allow the register to be completed before unregistering. */
168 unregister_netdevice(dev);
172 #ifdef CONFIG_IP_PIMSM
174 static int reg_vif_num = -1;
176 static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
178 read_lock(&mrt_lock);
179 ((struct net_device_stats*)dev->priv)->tx_bytes += skb->len;
180 ((struct net_device_stats*)dev->priv)->tx_packets++;
181 ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
182 read_unlock(&mrt_lock);
187 static struct net_device_stats *reg_vif_get_stats(struct net_device *dev)
189 return (struct net_device_stats*)dev->priv;
192 static void reg_vif_setup(struct net_device *dev)
194 dev->type = ARPHRD_PIMREG;
195 dev->mtu = 1500 - sizeof(struct iphdr) - 8;
196 dev->flags = IFF_NOARP;
197 dev->hard_start_xmit = reg_vif_xmit;
198 dev->get_stats = reg_vif_get_stats;
199 dev->destructor = free_netdev;
202 static struct net_device *ipmr_reg_vif(void)
204 struct net_device *dev;
205 struct in_device *in_dev;
207 dev = alloc_netdev(sizeof(struct net_device_stats), "pimreg",
213 if (register_netdevice(dev)) {
219 if ((in_dev = inetdev_init(dev)) == NULL)
222 in_dev->cnf.rp_filter = 0;
230 /* allow the register to be completed before unregistering. */
234 unregister_netdevice(dev);
243 static int vif_delete(int vifi)
245 struct vif_device *v;
246 struct net_device *dev;
247 struct in_device *in_dev;
249 if (vifi < 0 || vifi >= maxvif)
250 return -EADDRNOTAVAIL;
252 v = &vif_table[vifi];
254 write_lock_bh(&mrt_lock);
259 write_unlock_bh(&mrt_lock);
260 return -EADDRNOTAVAIL;
263 #ifdef CONFIG_IP_PIMSM
264 if (vifi == reg_vif_num)
268 if (vifi+1 == maxvif) {
270 for (tmp=vifi-1; tmp>=0; tmp--) {
277 write_unlock_bh(&mrt_lock);
279 dev_set_allmulti(dev, -1);
281 if ((in_dev = __in_dev_get(dev)) != NULL) {
282 in_dev->cnf.mc_forwarding--;
283 ip_rt_multicast_event(in_dev);
286 if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
287 unregister_netdevice(dev);
293 /* Destroy an unresolved cache entry, killing queued skbs
294 and reporting error to netlink readers.
297 static void ipmr_destroy_unres(struct mfc_cache *c)
301 atomic_dec(&cache_resolve_queue_len);
303 while((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) {
304 if (skb->nh.iph->version == 0) {
305 struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
306 nlh->nlmsg_type = NLMSG_ERROR;
307 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
308 skb_trim(skb, nlh->nlmsg_len);
309 ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -ETIMEDOUT;
310 netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
315 kmem_cache_free(mrt_cachep, c);
319 /* Single timer process for all the unresolved queue. */
321 static void ipmr_expire_process(unsigned long dummy)
324 unsigned long expires;
325 struct mfc_cache *c, **cp;
327 if (!spin_trylock(&mfc_unres_lock)) {
328 mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
332 if (atomic_read(&cache_resolve_queue_len) == 0)
337 cp = &mfc_unres_queue;
339 while ((c=*cp) != NULL) {
340 if (time_after(c->mfc_un.unres.expires, now)) {
341 unsigned long interval = c->mfc_un.unres.expires - now;
342 if (interval < expires)
350 ipmr_destroy_unres(c);
353 if (atomic_read(&cache_resolve_queue_len))
354 mod_timer(&ipmr_expire_timer, jiffies + expires);
357 spin_unlock(&mfc_unres_lock);
360 /* Fill the oifs list. Called with mrt_lock write-locked. */
362 static void ipmr_update_threshoulds(struct mfc_cache *cache, unsigned char *ttls)
366 cache->mfc_un.res.minvif = MAXVIFS;
367 cache->mfc_un.res.maxvif = 0;
368 memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
370 for (vifi=0; vifi<maxvif; vifi++) {
371 if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) {
372 cache->mfc_un.res.ttls[vifi] = ttls[vifi];
373 if (cache->mfc_un.res.minvif > vifi)
374 cache->mfc_un.res.minvif = vifi;
375 if (cache->mfc_un.res.maxvif <= vifi)
376 cache->mfc_un.res.maxvif = vifi + 1;
381 static int vif_add(struct vifctl *vifc, int mrtsock)
383 int vifi = vifc->vifc_vifi;
384 struct vif_device *v = &vif_table[vifi];
385 struct net_device *dev;
386 struct in_device *in_dev;
389 if (VIF_EXISTS(vifi))
392 switch (vifc->vifc_flags) {
393 #ifdef CONFIG_IP_PIMSM
396 * Special Purpose VIF in PIM
397 * All the packets will be sent to the daemon
399 if (reg_vif_num >= 0)
401 dev = ipmr_reg_vif();
407 dev = ipmr_new_tunnel(vifc);
412 dev=ip_dev_find(vifc->vifc_lcl_addr.s_addr);
414 return -EADDRNOTAVAIL;
421 if ((in_dev = __in_dev_get(dev)) == NULL)
422 return -EADDRNOTAVAIL;
423 in_dev->cnf.mc_forwarding++;
424 dev_set_allmulti(dev, +1);
425 ip_rt_multicast_event(in_dev);
428 * Fill in the VIF structures
430 v->rate_limit=vifc->vifc_rate_limit;
431 v->local=vifc->vifc_lcl_addr.s_addr;
432 v->remote=vifc->vifc_rmt_addr.s_addr;
433 v->flags=vifc->vifc_flags;
435 v->flags |= VIFF_STATIC;
436 v->threshold=vifc->vifc_threshold;
441 v->link = dev->ifindex;
442 if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
443 v->link = dev->iflink;
445 /* And finish update writing critical data */
446 write_lock_bh(&mrt_lock);
449 #ifdef CONFIG_IP_PIMSM
450 if (v->flags&VIFF_REGISTER)
455 write_unlock_bh(&mrt_lock);
459 static struct mfc_cache *ipmr_cache_find(__u32 origin, __u32 mcastgrp)
461 int line=MFC_HASH(mcastgrp,origin);
464 for (c=mfc_cache_array[line]; c; c = c->next) {
465 if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
472 * Allocate a multicast cache entry
474 static struct mfc_cache *ipmr_cache_alloc(void)
476 struct mfc_cache *c=kmem_cache_alloc(mrt_cachep, GFP_KERNEL);
479 memset(c, 0, sizeof(*c));
480 c->mfc_un.res.minvif = MAXVIFS;
484 static struct mfc_cache *ipmr_cache_alloc_unres(void)
486 struct mfc_cache *c=kmem_cache_alloc(mrt_cachep, GFP_ATOMIC);
489 memset(c, 0, sizeof(*c));
490 skb_queue_head_init(&c->mfc_un.unres.unresolved);
491 c->mfc_un.unres.expires = jiffies + 10*HZ;
496 * A cache entry has gone into a resolved state from queued
499 static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
504 * Play the pending entries through our router
507 while((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) {
508 if (skb->nh.iph->version == 0) {
510 struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
512 if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
513 nlh->nlmsg_len = skb->tail - (u8*)nlh;
515 nlh->nlmsg_type = NLMSG_ERROR;
516 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
517 skb_trim(skb, nlh->nlmsg_len);
518 ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -EMSGSIZE;
520 err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
522 ip_mr_forward(skb, c, 0);
527 * Bounce a cache query up to mrouted. We could use netlink for this but mrouted
528 * expects the following bizarre scheme.
530 * Called under mrt_lock.
533 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
536 int ihl = pkt->nh.iph->ihl<<2;
537 struct igmphdr *igmp;
541 #ifdef CONFIG_IP_PIMSM
542 if (assert == IGMPMSG_WHOLEPKT)
543 skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
546 skb = alloc_skb(128, GFP_ATOMIC);
551 #ifdef CONFIG_IP_PIMSM
552 if (assert == IGMPMSG_WHOLEPKT) {
553 /* Ugly, but we have no choice with this interface.
554 Duplicate old header, fix ihl, length etc.
555 And all this only to mangle msg->im_msgtype and
556 to set msg->im_mbz to "mbz" :-)
558 msg = (struct igmpmsg*)skb_push(skb, sizeof(struct iphdr));
559 skb->nh.raw = skb->h.raw = (u8*)msg;
560 memcpy(msg, pkt->nh.raw, sizeof(struct iphdr));
561 msg->im_msgtype = IGMPMSG_WHOLEPKT;
563 msg->im_vif = reg_vif_num;
564 skb->nh.iph->ihl = sizeof(struct iphdr) >> 2;
565 skb->nh.iph->tot_len = htons(ntohs(pkt->nh.iph->tot_len) + sizeof(struct iphdr));
574 skb->nh.iph = (struct iphdr *)skb_put(skb, ihl);
575 memcpy(skb->data,pkt->data,ihl);
576 skb->nh.iph->protocol = 0; /* Flag to the kernel this is a route add */
577 msg = (struct igmpmsg*)skb->nh.iph;
579 skb->dst = dst_clone(pkt->dst);
585 igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr));
587 msg->im_msgtype = assert;
589 skb->nh.iph->tot_len=htons(skb->len); /* Fix the length */
590 skb->h.raw = skb->nh.raw;
593 if (mroute_socket == NULL) {
601 if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) {
603 printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
611 * Queue a packet for resolution. It gets locked cache entry!
615 ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
620 spin_lock_bh(&mfc_unres_lock);
621 for (c=mfc_unres_queue; c; c=c->next) {
622 if (c->mfc_mcastgrp == skb->nh.iph->daddr &&
623 c->mfc_origin == skb->nh.iph->saddr)
629 * Create a new entry if allowable
632 if (atomic_read(&cache_resolve_queue_len)>=10 ||
633 (c=ipmr_cache_alloc_unres())==NULL) {
634 spin_unlock_bh(&mfc_unres_lock);
641 * Fill in the new cache entry
644 c->mfc_origin=skb->nh.iph->saddr;
645 c->mfc_mcastgrp=skb->nh.iph->daddr;
648 * Reflect first query at mrouted.
650 if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
651 /* If the report failed throw the cache entry
654 spin_unlock_bh(&mfc_unres_lock);
656 kmem_cache_free(mrt_cachep, c);
661 atomic_inc(&cache_resolve_queue_len);
662 c->next = mfc_unres_queue;
665 mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
669 * See if we can append the packet
671 if (c->mfc_un.unres.unresolved.qlen>3) {
675 skb_queue_tail(&c->mfc_un.unres.unresolved,skb);
679 spin_unlock_bh(&mfc_unres_lock);
684 * MFC cache manipulation by user space mroute daemon
687 static int ipmr_mfc_delete(struct mfcctl *mfc)
690 struct mfc_cache *c, **cp;
692 line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
694 for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
695 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
696 c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
697 write_lock_bh(&mrt_lock);
699 write_unlock_bh(&mrt_lock);
701 kmem_cache_free(mrt_cachep, c);
708 static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
711 struct mfc_cache *uc, *c, **cp;
713 line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
715 for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
716 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
717 c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
722 write_lock_bh(&mrt_lock);
723 c->mfc_parent = mfc->mfcc_parent;
724 ipmr_update_threshoulds(c, mfc->mfcc_ttls);
726 c->mfc_flags |= MFC_STATIC;
727 write_unlock_bh(&mrt_lock);
731 if(!MULTICAST(mfc->mfcc_mcastgrp.s_addr))
734 c=ipmr_cache_alloc();
738 c->mfc_origin=mfc->mfcc_origin.s_addr;
739 c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
740 c->mfc_parent=mfc->mfcc_parent;
741 ipmr_update_threshoulds(c, mfc->mfcc_ttls);
743 c->mfc_flags |= MFC_STATIC;
745 write_lock_bh(&mrt_lock);
746 c->next = mfc_cache_array[line];
747 mfc_cache_array[line] = c;
748 write_unlock_bh(&mrt_lock);
751 * Check to see if we resolved a queued list. If so we
752 * need to send on the frames and tidy up.
754 spin_lock_bh(&mfc_unres_lock);
755 for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
757 if (uc->mfc_origin == c->mfc_origin &&
758 uc->mfc_mcastgrp == c->mfc_mcastgrp) {
760 if (atomic_dec_and_test(&cache_resolve_queue_len))
761 del_timer(&ipmr_expire_timer);
765 spin_unlock_bh(&mfc_unres_lock);
768 ipmr_cache_resolve(uc, c);
769 kmem_cache_free(mrt_cachep, uc);
775 * Close the multicast socket, and clear the vif tables etc
778 static void mroute_clean_tables(struct sock *sk)
783 * Shut down all active vif entries
785 for(i=0; i<maxvif; i++) {
786 if (!(vif_table[i].flags&VIFF_STATIC))
793 for (i=0;i<MFC_LINES;i++) {
794 struct mfc_cache *c, **cp;
796 cp = &mfc_cache_array[i];
797 while ((c = *cp) != NULL) {
798 if (c->mfc_flags&MFC_STATIC) {
802 write_lock_bh(&mrt_lock);
804 write_unlock_bh(&mrt_lock);
806 kmem_cache_free(mrt_cachep, c);
810 if (atomic_read(&cache_resolve_queue_len) != 0) {
813 spin_lock_bh(&mfc_unres_lock);
814 while (mfc_unres_queue != NULL) {
816 mfc_unres_queue = c->next;
817 spin_unlock_bh(&mfc_unres_lock);
819 ipmr_destroy_unres(c);
821 spin_lock_bh(&mfc_unres_lock);
823 spin_unlock_bh(&mfc_unres_lock);
827 static void mrtsock_destruct(struct sock *sk)
830 if (sk == mroute_socket) {
831 ipv4_devconf.mc_forwarding--;
833 write_lock_bh(&mrt_lock);
835 write_unlock_bh(&mrt_lock);
837 mroute_clean_tables(sk);
843 * Socket options and virtual interface manipulation. The whole
844 * virtual interface system is a complete heap, but unfortunately
845 * that's how BSD mrouted happens to think. Maybe one day with a proper
846 * MOSPF/PIM router set up we can clean this up.
849 int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int optlen)
855 if(optname!=MRT_INIT)
857 if(sk!=mroute_socket && !capable(CAP_NET_ADMIN))
864 if (sk->sk_type != SOCK_RAW ||
865 inet_sk(sk)->num != IPPROTO_IGMP)
867 if(optlen!=sizeof(int))
876 ret = ip_ra_control(sk, 1, mrtsock_destruct);
878 write_lock_bh(&mrt_lock);
880 write_unlock_bh(&mrt_lock);
882 ipv4_devconf.mc_forwarding++;
887 if (sk!=mroute_socket)
889 return ip_ra_control(sk, 0, NULL);
892 if(optlen!=sizeof(vif))
894 if (copy_from_user(&vif,optval,sizeof(vif)))
896 if(vif.vifc_vifi >= MAXVIFS)
899 if (optname==MRT_ADD_VIF) {
900 ret = vif_add(&vif, sk==mroute_socket);
902 ret = vif_delete(vif.vifc_vifi);
908 * Manipulate the forwarding caches. These live
909 * in a sort of kernel/user symbiosis.
913 if(optlen!=sizeof(mfc))
915 if (copy_from_user(&mfc,optval, sizeof(mfc)))
918 if (optname==MRT_DEL_MFC)
919 ret = ipmr_mfc_delete(&mfc);
921 ret = ipmr_mfc_add(&mfc, sk==mroute_socket);
925 * Control PIM assert.
930 if(get_user(v,(int __user *)optval))
932 mroute_do_assert=(v)?1:0;
935 #ifdef CONFIG_IP_PIMSM
939 if(get_user(v,(int __user *)optval))
944 if (v != mroute_do_pim) {
946 mroute_do_assert = v;
947 #ifdef CONFIG_IP_PIMSM_V2
949 ret = inet_add_protocol(&pim_protocol,
952 ret = inet_del_protocol(&pim_protocol,
963 * Spurious command, or MRT_VERSION which you cannot
972 * Getsock opt support for the multicast routing system.
975 int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __user *optlen)
980 if(optname!=MRT_VERSION &&
981 #ifdef CONFIG_IP_PIMSM
987 if (get_user(olr, optlen))
990 olr = min_t(unsigned int, olr, sizeof(int));
994 if(put_user(olr,optlen))
996 if(optname==MRT_VERSION)
998 #ifdef CONFIG_IP_PIMSM
999 else if(optname==MRT_PIM)
1003 val=mroute_do_assert;
1004 if(copy_to_user(optval,&val,olr))
1010 * The IP multicast ioctl support routines.
1013 int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1015 struct sioc_sg_req sr;
1016 struct sioc_vif_req vr;
1017 struct vif_device *vif;
1018 struct mfc_cache *c;
1023 if (copy_from_user(&vr,arg,sizeof(vr)))
1027 read_lock(&mrt_lock);
1028 vif=&vif_table[vr.vifi];
1029 if(VIF_EXISTS(vr.vifi)) {
1030 vr.icount=vif->pkt_in;
1031 vr.ocount=vif->pkt_out;
1032 vr.ibytes=vif->bytes_in;
1033 vr.obytes=vif->bytes_out;
1034 read_unlock(&mrt_lock);
1036 if (copy_to_user(arg,&vr,sizeof(vr)))
1040 read_unlock(&mrt_lock);
1041 return -EADDRNOTAVAIL;
1043 if (copy_from_user(&sr,arg,sizeof(sr)))
1046 read_lock(&mrt_lock);
1047 c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
1049 sr.pktcnt = c->mfc_un.res.pkt;
1050 sr.bytecnt = c->mfc_un.res.bytes;
1051 sr.wrong_if = c->mfc_un.res.wrong_if;
1052 read_unlock(&mrt_lock);
1054 if (copy_to_user(arg,&sr,sizeof(sr)))
1058 read_unlock(&mrt_lock);
1059 return -EADDRNOTAVAIL;
1061 return -ENOIOCTLCMD;
1066 static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
1068 struct vif_device *v;
1070 if (event != NETDEV_UNREGISTER)
1073 for(ct=0;ct<maxvif;ct++,v++) {
1081 static struct notifier_block ip_mr_notifier={
1082 .notifier_call = ipmr_device_event,
1086 * Encapsulate a packet by attaching a valid IPIP header to it.
1087 * This avoids tunnel drivers and other mess and gives us the speed so
1088 * important for multicast video.
1091 static void ip_encap(struct sk_buff *skb, u32 saddr, u32 daddr)
1093 struct iphdr *iph = (struct iphdr *)skb_push(skb,sizeof(struct iphdr));
1096 iph->tos = skb->nh.iph->tos;
1097 iph->ttl = skb->nh.iph->ttl;
1101 iph->protocol = IPPROTO_IPIP;
1103 iph->tot_len = htons(skb->len);
1104 ip_select_ident(iph, skb->dst, NULL);
1107 skb->h.ipiph = skb->nh.iph;
1109 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1113 static inline int ipmr_forward_finish(struct sk_buff *skb)
1115 struct ip_options * opt = &(IPCB(skb)->opt);
1117 IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
1119 if (unlikely(opt->optlen))
1120 ip_forward_options(skb);
1122 return dst_output(skb);
1126 * Processing handlers for ipmr_forward
1129 static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1131 struct iphdr *iph = skb->nh.iph;
1132 struct vif_device *vif = &vif_table[vifi];
1133 struct net_device *dev;
1137 if (vif->dev == NULL)
1140 #ifdef CONFIG_IP_PIMSM
1141 if (vif->flags & VIFF_REGISTER) {
1143 vif->bytes_out+=skb->len;
1144 ((struct net_device_stats*)vif->dev->priv)->tx_bytes += skb->len;
1145 ((struct net_device_stats*)vif->dev->priv)->tx_packets++;
1146 ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
1152 if (vif->flags&VIFF_TUNNEL) {
1153 struct flowi fl = { .oif = vif->link,
1155 { .daddr = vif->remote,
1156 .saddr = vif->local,
1157 .tos = RT_TOS(iph->tos) } },
1158 .proto = IPPROTO_IPIP };
1159 if (ip_route_output_key(&rt, &fl))
1161 encap = sizeof(struct iphdr);
1163 struct flowi fl = { .oif = vif->link,
1165 { .daddr = iph->daddr,
1166 .tos = RT_TOS(iph->tos) } },
1167 .proto = IPPROTO_IPIP };
1168 if (ip_route_output_key(&rt, &fl))
1172 dev = rt->u.dst.dev;
1174 if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
1175 /* Do not fragment multicasts. Alas, IPv4 does not
1176 allow sending ICMP here, so the packets will disappear
1180 IP_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
1185 encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;
1187 if (skb_cow(skb, encap)) {
1193 vif->bytes_out+=skb->len;
1195 dst_release(skb->dst);
1196 skb->dst = &rt->u.dst;
1198 ip_decrease_ttl(iph);
1200 /* FIXME: forward and output firewalls used to be called here.
1201 * What do we do with netfilter? -- RR */
1202 if (vif->flags & VIFF_TUNNEL) {
1203 ip_encap(skb, vif->local, vif->remote);
1204 /* FIXME: extra output firewall step used to be here. --RR */
1205 ((struct ip_tunnel *)vif->dev->priv)->stat.tx_packets++;
1206 ((struct ip_tunnel *)vif->dev->priv)->stat.tx_bytes+=skb->len;
1209 IPCB(skb)->flags |= IPSKB_FORWARDED;
1212 * RFC1584 teaches that a DVMRP/PIM router must deliver packets locally
1213 * not only before forwarding, but also after forwarding on all output
1214 * interfaces. Clearly, if the mrouter runs a multicasting
1215 * program, it should receive packets regardless of which interface
1216 * the program has joined on.
1217 * If we do not do this, the program will have to join on all
1218 * interfaces. On the other hand, a multihomed host (or router, but
1219 * not mrouter) cannot join on more than one interface - it would
1220 * result in receiving multiple packets.
1222 NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, dev,
1223 ipmr_forward_finish);
1231 static int ipmr_find_vif(struct net_device *dev)
1234 for (ct=maxvif-1; ct>=0; ct--) {
1235 if (vif_table[ct].dev == dev)
1241 /* "local" means that we should preserve one skb (for local delivery) */
1243 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
1248 vif = cache->mfc_parent;
1249 cache->mfc_un.res.pkt++;
1250 cache->mfc_un.res.bytes += skb->len;
1253 * Wrong interface: drop packet and (maybe) send PIM assert.
1255 if (vif_table[vif].dev != skb->dev) {
1258 if (((struct rtable*)skb->dst)->fl.iif == 0) {
1259 /* It is our own packet, looped back.
1260 Very complicated situation...
1262 The best workaround until routing daemons are
1263 fixed is not to redistribute the packet if it was
1264 sent through the wrong interface. This means that
1265 multicast applications WILL NOT work for
1266 (S,G) entries whose default multicast route points
1267 to the wrong oif. In any case, it is not a good
1268 idea to use multicasting applications on a router.
1273 cache->mfc_un.res.wrong_if++;
1274 true_vifi = ipmr_find_vif(skb->dev);
1276 if (true_vifi >= 0 && mroute_do_assert &&
1277 /* pimsm uses asserts when switching from RPT to SPT,
1278 so we cannot check that the packet arrived on an oif.
1279 It is bad, but otherwise we would need to move a pretty
1280 large chunk of pimd into the kernel. Ough... --ANK
1282 (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
1284 cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
1285 cache->mfc_un.res.last_assert = jiffies;
1286 ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
1291 vif_table[vif].pkt_in++;
1292 vif_table[vif].bytes_in+=skb->len;
1297 for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
1298 if (skb->nh.iph->ttl > cache->mfc_un.res.ttls[ct]) {
1300 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1302 ipmr_queue_xmit(skb2, cache, psend);
1309 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1311 ipmr_queue_xmit(skb2, cache, psend);
1313 ipmr_queue_xmit(skb, cache, psend);
1326 * Multicast packets for forwarding arrive here
1329 int ip_mr_input(struct sk_buff *skb)
1331 struct mfc_cache *cache;
1332 int local = ((struct rtable*)skb->dst)->rt_flags&RTCF_LOCAL;
1334 /* The packet is looped back after forwarding; it should not be
1335 forwarded a second time, but can still be delivered locally.
1337 if (IPCB(skb)->flags&IPSKB_FORWARDED)
1341 if (IPCB(skb)->opt.router_alert) {
1342 if (ip_call_ra_chain(skb))
1344 } else if (skb->nh.iph->protocol == IPPROTO_IGMP){
1345 /* IGMPv1 (and broken IGMPv2 implementations such as
1346 Cisco IOS <= 11.2(8)) do not put the router alert
1347 option in IGMP packets destined to routable
1348 groups. This is very bad, because it means
1349 that we can forward NO IGMP messages.
1351 read_lock(&mrt_lock);
1352 if (mroute_socket) {
1354 raw_rcv(mroute_socket, skb);
1355 read_unlock(&mrt_lock);
1358 read_unlock(&mrt_lock);
1362 read_lock(&mrt_lock);
1363 cache = ipmr_cache_find(skb->nh.iph->saddr, skb->nh.iph->daddr);
1366 * No usable cache entry
1372 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1373 ip_local_deliver(skb);
1375 read_unlock(&mrt_lock);
1381 vif = ipmr_find_vif(skb->dev);
1383 int err = ipmr_cache_unresolved(vif, skb);
1384 read_unlock(&mrt_lock);
1388 read_unlock(&mrt_lock);
1393 ip_mr_forward(skb, cache, local);
1395 read_unlock(&mrt_lock);
1398 return ip_local_deliver(skb);
1404 return ip_local_deliver(skb);
1409 #ifdef CONFIG_IP_PIMSM_V1
1411 * Handle IGMP messages of PIMv1
1414 int pim_rcv_v1(struct sk_buff * skb)
1416 struct igmphdr *pim;
1417 struct iphdr *encap;
1418 struct net_device *reg_dev = NULL;
1420 if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
1423 pim = (struct igmphdr*)skb->h.raw;
1425 if (!mroute_do_pim ||
1426 skb->len < sizeof(*pim) + sizeof(*encap) ||
1427 pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1430 encap = (struct iphdr*)(skb->h.raw + sizeof(struct igmphdr));
1433 a. packet is really destined to a multicast group
1434 b. packet is not a NULL-REGISTER
1435 c. packet is not truncated
1437 if (!MULTICAST(encap->daddr) ||
1438 encap->tot_len == 0 ||
1439 ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
1442 read_lock(&mrt_lock);
1443 if (reg_vif_num >= 0)
1444 reg_dev = vif_table[reg_vif_num].dev;
1447 read_unlock(&mrt_lock);
1449 if (reg_dev == NULL)
1452 skb->mac.raw = skb->nh.raw;
1453 skb_pull(skb, (u8*)encap - skb->data);
1454 skb->nh.iph = (struct iphdr *)skb->data;
1456 memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
1457 skb->protocol = htons(ETH_P_IP);
1459 skb->pkt_type = PACKET_HOST;
1460 dst_release(skb->dst);
1462 ((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len;
1463 ((struct net_device_stats*)reg_dev->priv)->rx_packets++;
1474 #ifdef CONFIG_IP_PIMSM_V2
1475 static int pim_rcv(struct sk_buff * skb)
1477 struct pimreghdr *pim;
1478 struct iphdr *encap;
1479 struct net_device *reg_dev = NULL;
1481 if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
1484 pim = (struct pimreghdr*)skb->h.raw;
1485 if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
1486 (pim->flags&PIM_NULL_REGISTER) ||
1487 (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
1488 (u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1491 /* check if the inner packet is destined to mcast group */
1492 encap = (struct iphdr*)(skb->h.raw + sizeof(struct pimreghdr));
1493 if (!MULTICAST(encap->daddr) ||
1494 encap->tot_len == 0 ||
1495 ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
1498 read_lock(&mrt_lock);
1499 if (reg_vif_num >= 0)
1500 reg_dev = vif_table[reg_vif_num].dev;
1503 read_unlock(&mrt_lock);
1505 if (reg_dev == NULL)
1508 skb->mac.raw = skb->nh.raw;
1509 skb_pull(skb, (u8*)encap - skb->data);
1510 skb->nh.iph = (struct iphdr *)skb->data;
1512 memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
1513 skb->protocol = htons(ETH_P_IP);
1515 skb->pkt_type = PACKET_HOST;
1516 dst_release(skb->dst);
1517 ((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len;
1518 ((struct net_device_stats*)reg_dev->priv)->rx_packets++;
1531 ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
1534 struct rtnexthop *nhp;
1535 struct net_device *dev = vif_table[c->mfc_parent].dev;
1537 struct rtattr *mp_head;
1540 RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
1542 mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0));
1544 for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
1545 if (c->mfc_un.res.ttls[ct] < 255) {
1546 if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
1547 goto rtattr_failure;
1548 nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
1549 nhp->rtnh_flags = 0;
1550 nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
1551 nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
1552 nhp->rtnh_len = sizeof(*nhp);
1555 mp_head->rta_type = RTA_MULTIPATH;
1556 mp_head->rta_len = skb->tail - (u8*)mp_head;
1557 rtm->rtm_type = RTN_MULTICAST;
1561 skb_trim(skb, b - skb->data);
1565 int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1568 struct mfc_cache *cache;
1569 struct rtable *rt = (struct rtable*)skb->dst;
1571 read_lock(&mrt_lock);
1572 cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);
1575 struct net_device *dev;
1579 read_unlock(&mrt_lock);
1584 if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
1585 read_unlock(&mrt_lock);
1588 skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
1589 skb->nh.iph->ihl = sizeof(struct iphdr)>>2;
1590 skb->nh.iph->saddr = rt->rt_src;
1591 skb->nh.iph->daddr = rt->rt_dst;
1592 skb->nh.iph->version = 0;
1593 err = ipmr_cache_unresolved(vif, skb);
1594 read_unlock(&mrt_lock);
1598 if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
1599 cache->mfc_flags |= MFC_NOTIFY;
1600 err = ipmr_fill_mroute(skb, cache, rtm);
1601 read_unlock(&mrt_lock);
1605 #ifdef CONFIG_PROC_FS
1607 * The /proc interfaces to multicast routing: /proc/ip_mr_cache, /proc/ip_mr_vif
1609 struct ipmr_vif_iter {
1613 static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
1616 for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) {
1617 if(!VIF_EXISTS(iter->ct))
1620 return &vif_table[iter->ct];
1625 static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
1627 read_lock(&mrt_lock);
1628 return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1)
1632 static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1634 struct ipmr_vif_iter *iter = seq->private;
1637 if (v == SEQ_START_TOKEN)
1638 return ipmr_vif_seq_idx(iter, 0);
1640 while (++iter->ct < maxvif) {
1641 if(!VIF_EXISTS(iter->ct))
1643 return &vif_table[iter->ct];
1648 static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
1650 read_unlock(&mrt_lock);
/*
 * ipmr_vif_seq_show - seq_file .show callback for the vif table.
 *
 * Emits the column header for the start token, otherwise one line for the
 * vif_device passed in v: device name, byte/packet counters in and out,
 * flags, and the local and remote addresses.  The first argument of the
 * per-vif format (for the leading "%2Zd" field) is not visible in this
 * excerpt.
 */
1653 static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
1655 if (v == SEQ_START_TOKEN) {
1657 "Interface BytesIn PktsIn BytesOut PktsOut Flags Local Remote\n");
1659 const struct vif_device *vif = v;
/* A vif without an attached device is reported under the name "none". */
1660 const char *name = vif->dev ? vif->dev->name : "none";
1663 "%2Zd %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n",
1665 name, vif->bytes_in, vif->pkt_in,
1666 vif->bytes_out, vif->pkt_out,
1667 vif->flags, vif->local, vif->remote);
/* seq_file callbacks for the vif table; passed to seq_open() in ipmr_vif_open(). */
1672 static struct seq_operations ipmr_vif_seq_ops = {
1673 .start = ipmr_vif_seq_start,
1674 .next = ipmr_vif_seq_next,
1675 .stop = ipmr_vif_seq_stop,
1676 .show = ipmr_vif_seq_show,
/*
 * ipmr_vif_open - open handler for the vif proc file.
 *
 * Allocates a struct ipmr_vif_iter with kmalloc(GFP_KERNEL), opens the
 * seq_file with ipmr_vif_seq_ops and then fetches the seq_file from
 * file->private_data.
 *
 * NOTE(review): the allocation-failure check, the seq_open() error path and
 * the store of the iterator into the seq_file are not visible here.  The
 * seq callbacks read the iterator from seq->private, so presumably it is
 * attached there -- confirm against the full source.
 */
1679 static int ipmr_vif_open(struct inode *inode, struct file *file)
1681 struct seq_file *seq;
1683 struct ipmr_vif_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1688 rc = seq_open(file, &ipmr_vif_seq_ops);
/* seq_open() has set up file->private_data; pick up the seq_file from it. */
1693 seq = file->private_data;
/*
 * File operations for the vif proc file, registered in ip_mr_init() under
 * the name "ip_mr_vif".  Release goes through seq_release_private.
 */
1703 static struct file_operations ipmr_vif_fops = {
1704 .owner = THIS_MODULE,
1705 .open = ipmr_vif_open,
1707 .llseek = seq_lseek,
1708 .release = seq_release_private,
/*
 * Per-open iterator state for the mfc cache seq_file.  Other members are not
 * visible in this excerpt; the seq functions in this file use it->ct as the
 * current index into mfc_cache_array.
 */
1711 struct ipmr_mfc_iter {
/*
 * Which list is being walked: set to mfc_cache_array or &mfc_unres_queue by
 * the iterator functions.  ipmr_mfc_seq_stop() uses it to pick the lock to
 * release.
 */
1712 struct mfc_cache **cache;
/*
 * ipmr_mfc_seq_idx - position the iterator within the mfc lists.
 *
 * First walks every chain of mfc_cache_array under read_lock(&mrt_lock),
 * then walks mfc_unres_queue under spin_lock_bh(&mfc_unres_lock).
 * it->cache records which of the two lists is currently being walked.
 *
 * NOTE(review): the position test and the return statements are not visible
 * here.  Presumably a match returns with the lock of the current list still
 * held (ipmr_mfc_seq_stop() releases a lock based on it->cache) -- confirm
 * against the full source.
 */
1717 static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
1719 struct mfc_cache *mfc;
/* Pass 1: the resolved cache, one hash line at a time. */
1721 it->cache = mfc_cache_array;
1722 read_lock(&mrt_lock);
1723 for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
1724 for(mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next)
1727 read_unlock(&mrt_lock);
/* Pass 2: the queue of unresolved entries, under its own lock. */
1729 it->cache = &mfc_unres_queue;
1730 spin_lock_bh(&mfc_unres_lock);
1731 for(mfc = mfc_unres_queue; mfc; mfc = mfc->next)
1734 spin_unlock_bh(&mfc_unres_lock);
/*
 * ipmr_mfc_seq_start - seq_file .start callback for the mfc cache.
 *
 * A non-zero *pos is mapped to entry *pos - 1 via ipmr_mfc_seq_idx(), which
 * does its own locking; no lock is taken in the lines visible here.  The
 * initialisation of the iterator and the value returned for *pos == 0 are
 * not visible in this excerpt (presumably SEQ_START_TOKEN).
 */
1741 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
1743 struct ipmr_mfc_iter *it = seq->private;
1746 return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1)
/*
 * ipmr_mfc_seq_next - seq_file .next callback for the mfc cache.
 *
 * After the start token, returns the first entry via ipmr_mfc_seq_idx().
 * Otherwise continues from the current entry: while in mfc_cache_array it
 * moves on through the remaining hash lines, and once those are exhausted
 * it drops mrt_lock, switches it->cache to the unresolved queue and takes
 * mfc_unres_lock instead.
 *
 * NOTE(review): the chain-following steps, the returns and the condition
 * guarding the final spin_unlock_bh() are not visible in this excerpt --
 * confirm the exact lock hand-over against the full source.
 */
1750 static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1752 struct mfc_cache *mfc = v;
1753 struct ipmr_mfc_iter *it = seq->private;
1757 if (v == SEQ_START_TOKEN)
1758 return ipmr_mfc_seq_idx(seq->private, 0);
1763 if (it->cache == &mfc_unres_queue)
/* Past this point the iterator must be walking the resolved cache. */
1766 BUG_ON(it->cache != mfc_cache_array);
/* Pre-increment: look at the hash lines after the current one. */
1768 while (++it->ct < MFC_LINES) {
1769 mfc = mfc_cache_array[it->ct];
1774 /* exhausted cache_array, show unresolved */
1775 read_unlock(&mrt_lock);
1776 it->cache = &mfc_unres_queue;
1779 spin_lock_bh(&mfc_unres_lock);
1780 mfc = mfc_unres_queue;
1785 spin_unlock_bh(&mfc_unres_lock);
/*
 * ipmr_mfc_seq_stop - seq_file .stop callback for the mfc cache.
 *
 * Releases whichever lock matches the list recorded in it->cache:
 * mfc_unres_lock for the unresolved queue, the read side of mrt_lock for
 * mfc_cache_array.  For any other value of it->cache nothing is released.
 */
1791 static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
1793 struct ipmr_mfc_iter *it = seq->private;
1795 if (it->cache == &mfc_unres_queue)
1796 spin_unlock_bh(&mfc_unres_lock);
1797 else if (it->cache == mfc_cache_array)
1798 read_unlock(&mrt_lock);
/*
 * ipmr_mfc_seq_show - seq_file .show callback for the mfc cache.
 *
 * Emits the column header for the start token, otherwise one line for the
 * mfc_cache entry in v: group, origin, packet/byte/wrong-interface counters
 * and, for entries that are not on the unresolved queue, a list of vif
 * index and TTL values for indices in [minvif, maxvif).  The argument for
 * the "%-3d" (Iif) field and the first half of the per-vif condition are
 * not visible in this excerpt.
 */
1801 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
1805 if (v == SEQ_START_TOKEN) {
1807 "Group Origin Iif Pkts Bytes Wrong Oifs\n");
1809 const struct mfc_cache *mfc = v;
1810 const struct ipmr_mfc_iter *it = seq->private;
/*
 * NOTE(review): the mfc_un.res.* counters are printed for every entry,
 * before the check on it->cache below.  If mfc_un is a union shared with
 * unresolved-entry state, entries from mfc_unres_queue would print
 * unrelated data here -- confirm against the struct definition.
 */
1812 seq_printf(seq, "%08lX %08lX %-3d %8ld %8ld %8ld",
1813 (unsigned long) mfc->mfc_mcastgrp,
1814 (unsigned long) mfc->mfc_origin,
1816 mfc->mfc_un.res.pkt,
1817 mfc->mfc_un.res.bytes,
1818 mfc->mfc_un.res.wrong_if);
/* Output interfaces are listed only for entries of the resolved cache. */
1820 if (it->cache != &mfc_unres_queue) {
1821 for(n = mfc->mfc_un.res.minvif;
1822 n < mfc->mfc_un.res.maxvif; n++ ) {
1824 && mfc->mfc_un.res.ttls[n] < 255)
1827 n, mfc->mfc_un.res.ttls[n]);
1830 seq_putc(seq, '\n');
/* seq_file callbacks for the mfc cache; passed to seq_open() in ipmr_mfc_open(). */
1835 static struct seq_operations ipmr_mfc_seq_ops = {
1836 .start = ipmr_mfc_seq_start,
1837 .next = ipmr_mfc_seq_next,
1838 .stop = ipmr_mfc_seq_stop,
1839 .show = ipmr_mfc_seq_show,
/*
 * ipmr_mfc_open - open handler for the mfc cache proc file.
 *
 * Allocates a struct ipmr_mfc_iter with kmalloc(GFP_KERNEL), opens the
 * seq_file with ipmr_mfc_seq_ops and then fetches the seq_file from
 * file->private_data.
 *
 * NOTE(review): the allocation-failure check, the seq_open() error path and
 * the store of the iterator into the seq_file are not visible here.  The
 * seq callbacks read the iterator from seq->private, so presumably it is
 * attached there -- confirm against the full source.
 */
1842 static int ipmr_mfc_open(struct inode *inode, struct file *file)
1844 struct seq_file *seq;
1846 struct ipmr_mfc_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1851 rc = seq_open(file, &ipmr_mfc_seq_ops);
/* seq_open() has set up file->private_data; pick up the seq_file from it. */
1855 seq = file->private_data;
/*
 * File operations for the mfc cache proc file, registered in ip_mr_init()
 * under the name "ip_mr_cache".  Release goes through seq_release_private.
 */
1865 static struct file_operations ipmr_mfc_fops = {
1866 .owner = THIS_MODULE,
1867 .open = ipmr_mfc_open,
1869 .llseek = seq_lseek,
1870 .release = seq_release_private,
1874 #ifdef CONFIG_IP_PIMSM_V2
/*
 * Protocol descriptor for PIM, compiled only with CONFIG_IP_PIMSM_V2.
 * The initializer members are not visible in this excerpt.
 */
1875 static struct net_protocol pim_protocol = {
1882 * Setup for IP multicast routing
/*
 * ip_mr_init - boot-time initialisation (visible portion).
 *
 * Creates the "ip_mrt_cache" slab cache for struct mfc_cache, sets up the
 * expiry timer, registers the netdevice notifier and, with CONFIG_PROC_FS,
 * creates the "ip_mr_vif" and "ip_mr_cache" proc entries.  The end of the
 * function is not visible in this excerpt.
 */
1885 void __init ip_mr_init(void)
1887 mrt_cachep = kmem_cache_create("ip_mrt_cache",
1888 sizeof(struct mfc_cache),
1889 0, SLAB_HWCACHE_ALIGN,
/* Reached on a path whose condition is not visible (presumably a failed cache creation); panics. */
1892 panic("cannot allocate ip_mrt_cache");
/* The expiry timer gets its handler here; no expiry time is set in the visible lines. */
1894 init_timer(&ipmr_expire_timer);
1895 ipmr_expire_timer.function=ipmr_expire_process;
1896 register_netdevice_notifier(&ip_mr_notifier);
1897 #ifdef CONFIG_PROC_FS
1898 proc_net_fops_create("ip_mr_vif", 0, &ipmr_vif_fops);
1899 proc_net_fops_create("ip_mr_cache", 0, &ipmr_mfc_fops);