/*
 *	IP multicast routing support for mrouted 3.6/3.8
 *
 *		(c) 1995 Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *	  Linux Consultancy and Custom Driver Development
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Fixes:
 *	Michael Chastain	:	Incorrect size of copying.
 *	Alan Cox		:	Added the cache manager code
 *	Alan Cox		:	Fixed the clone/copy bug and device race.
 *	Mike McLagan		:	Routing by source
 *	Malcolm Beattie		:	Buffer handling fixes.
 *	Alexey Kuznetsov	:	Double buffer free and other fixes.
 *	SVR Anand		:	Fixed several multicast bugs and problems.
 *	Alexey Kuznetsov	:	Status, optimisations and more.
 *	Brad Parker		:	Better behaviour on mrouted upcall
 *					overflow.
 *	Carlos Picoto		:	PIMv1 Support
 *	Pavlin Ivanov Radoslavov:	PIMv2 Registers must checksum only PIM header
 *					Relax this requirement to work with older peers.
 *
 */

#include <asm/system.h>
#include <asm/uaccess.h>
#include <linux/types.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/if_ether.h>
#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/route.h>
#include <net/sock.h>
#include <net/icmp.h>
#include <net/udp.h>
#include <net/raw.h>
#include <linux/notifier.h>
#include <linux/if_arp.h>
#include <linux/netfilter_ipv4.h>
#include <net/ipip.h>
#include <net/checksum.h>
#include <net/netlink.h>
#include <net/fib_rules.h>

#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
#define CONFIG_IP_PIMSM 1
#endif

struct mr_table {
	struct list_head	list;
#ifdef CONFIG_NET_NS
	struct net		*net;
#endif
	u32			id;
	struct sock		*mroute_sk;
	struct timer_list	ipmr_expire_timer;
	struct list_head	mfc_unres_queue;
	struct list_head	mfc_cache_array[MFC_LINES];
	struct vif_device	vif_table[MAXVIFS];
	int			maxvif;
	atomic_t		cache_resolve_queue_len;
	int			mroute_do_assert;
	int			mroute_do_pim;
#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
	int			mroute_reg_vif_num;
#endif
};

struct ipmr_rule {
	struct fib_rule		common;
};

struct ipmr_result {
	struct mr_table		*mrt;
};

/* Big lock, protecting vif table, mrt cache and mroute socket state.
   Note that the changes are semaphored via rtnl_lock.
 */

static DEFINE_RWLOCK(mrt_lock);

/*
 *	Multicast router control variables
 */

#define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL)

/* Special spinlock for queue of unresolved entries */
static DEFINE_SPINLOCK(mfc_unres_lock);

/* We return to Alan's original scheme.  The hash table of resolved
   entries is changed only in process context and protected by the
   weak lock mrt_lock.  The queue of unresolved entries is protected
   by the strong spinlock mfc_unres_lock.

   The data path is therefore entirely free of exclusive locks.
 */

static struct kmem_cache *mrt_cachep __read_mostly;

static struct mr_table *ipmr_new_table(struct net *net, u32 id);
static int ip_mr_forward(struct net *net, struct mr_table *mrt,
			 struct sk_buff *skb, struct mfc_cache *cache,
			 int local);
static int ipmr_cache_report(struct mr_table *mrt,
			     struct sk_buff *pkt, vifi_t vifi, int assert);
static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
			      struct mfc_cache *c, struct rtmsg *rtm);
static void ipmr_expire_process(unsigned long arg);

#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
#define ipmr_for_each_table(mrt, net) \
	list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list)

static struct mr_table *ipmr_get_table(struct net *net, u32 id)
{
	struct mr_table *mrt;

	ipmr_for_each_table(mrt, net) {
		if (mrt->id == id)
			return mrt;
	}
	return NULL;
}

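/* Resolve which mr_table handles a given flow by running the
 * multicast routing rules (RTNL_FAMILY_IPMR).  On success, *mrt is
 * set and 0 is returned; otherwise the fib_rules_lookup() error is
 * propagated.
 */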
static int ipmr_fib_lookup(struct net *net, struct flowi *flp,
			   struct mr_table **mrt)
{
	struct ipmr_result res;
	struct fib_lookup_arg arg = { .result = &res, };
	int err;

	err = fib_rules_lookup(net->ipv4.mr_rules_ops, flp, 0, &arg);
	if (err < 0)
		return err;
	*mrt = res.mrt;
	return 0;
}

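/* fib_rules ->action callback: map a matching rule onto its mr_table.
 * Only FR_ACT_TO_TBL resolves to a table; the remaining actions
 * translate into errors for the caller.
 */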
static int ipmr_rule_action(struct fib_rule *rule, struct flowi *flp,
			    int flags, struct fib_lookup_arg *arg)
{
	struct ipmr_result *res = arg->result;
	struct mr_table *mrt;

	switch (rule->action) {
	case FR_ACT_TO_TBL:
		break;
	case FR_ACT_UNREACHABLE:
		return -ENETUNREACH;
	case FR_ACT_PROHIBIT:
		return -EACCES;
	case FR_ACT_BLACKHOLE:
	default:
		return -EINVAL;
	}

	mrt = ipmr_get_table(rule->fr_net, rule->table);
	if (mrt == NULL)
		return -EAGAIN;
	res->mrt = mrt;
	return 0;
}

static int ipmr_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
{
	return 1;
}

static const struct nla_policy ipmr_rule_policy[FRA_MAX + 1] = {
	FRA_GENERIC_POLICY,
};

static int ipmr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
			       struct fib_rule_hdr *frh, struct nlattr **tb)
{
	return 0;
}

static int ipmr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
			     struct nlattr **tb)
{
	return 1;
}

static int ipmr_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
			  struct fib_rule_hdr *frh)
{
	frh->dst_len = 0;
	frh->src_len = 0;
	frh->tos     = 0;
	return 0;
}

static const struct fib_rules_ops __net_initdata ipmr_rules_ops_template = {
	.family		= RTNL_FAMILY_IPMR,
	.rule_size	= sizeof(struct ipmr_rule),
	.addr_size	= sizeof(u32),
	.action		= ipmr_rule_action,
	.match		= ipmr_rule_match,
	.configure	= ipmr_rule_configure,
	.compare	= ipmr_rule_compare,
	.default_pref	= fib_default_rule_pref,
	.fill		= ipmr_rule_fill,
	.nlgroup	= RTNLGRP_IPV4_RULE,
	.policy		= ipmr_rule_policy,
	.owner		= THIS_MODULE,
};

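/* Per-namespace setup for the multi-table case: register the IPMR
 * rules ops, create the default table and install a default rule
 * (pref 0x7fff) pointing at it.
 */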
static int __net_init ipmr_rules_init(struct net *net)
{
	struct fib_rules_ops *ops;
	struct mr_table *mrt;
	int err;

	ops = fib_rules_register(&ipmr_rules_ops_template, net);
	if (IS_ERR(ops))
		return PTR_ERR(ops);

	INIT_LIST_HEAD(&net->ipv4.mr_tables);

	mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
	if (mrt == NULL) {
		err = -ENOMEM;
		goto err1;
	}

	err = fib_default_rule_add(ops, 0x7fff, RT_TABLE_DEFAULT, 0);
	if (err < 0)
		goto err2;

	net->ipv4.mr_rules_ops = ops;
	return 0;

err2:
	kfree(mrt);
err1:
	fib_rules_unregister(ops);
	return err;
}

static void __net_exit ipmr_rules_exit(struct net *net)
{
	struct mr_table *mrt, *next;

	list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
		list_del(&mrt->list);
		kfree(mrt);
	}
	fib_rules_unregister(net->ipv4.mr_rules_ops);
}
#else
#define ipmr_for_each_table(mrt, net) \
	for (mrt = net->ipv4.mrt; mrt; mrt = NULL)

static struct mr_table *ipmr_get_table(struct net *net, u32 id)
{
	return net->ipv4.mrt;
}

static int ipmr_fib_lookup(struct net *net, struct flowi *flp,
			   struct mr_table **mrt)
{
	*mrt = net->ipv4.mrt;
	return 0;
}

static int __net_init ipmr_rules_init(struct net *net)
{
	net->ipv4.mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
	return net->ipv4.mrt ? 0 : -ENOMEM;
}

static void __net_exit ipmr_rules_exit(struct net *net)
{
	kfree(net->ipv4.mrt);
}
#endif

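/* Return the table with the given id, creating and initialising it
 * (forwarding cache, unresolved queue and expiry timer) on first use.
 */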
static struct mr_table *ipmr_new_table(struct net *net, u32 id)
{
	struct mr_table *mrt;
	unsigned int i;

	mrt = ipmr_get_table(net, id);
	if (mrt != NULL)
		return mrt;

	mrt = kzalloc(sizeof(*mrt), GFP_KERNEL);
	if (mrt == NULL)
		return NULL;
	write_pnet(&mrt->net, net);
	mrt->id = id;

	/* Forwarding cache */
	for (i = 0; i < MFC_LINES; i++)
		INIT_LIST_HEAD(&mrt->mfc_cache_array[i]);

	INIT_LIST_HEAD(&mrt->mfc_unres_queue);

	setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process,
		    (unsigned long)mrt);

#ifdef CONFIG_IP_PIMSM
	mrt->mroute_reg_vif_num = -1;
#endif
#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
	list_add_tail_rcu(&mrt->list, &net->ipv4.mr_tables);
#endif
	return mrt;
}

/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */

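/* Tear down the dvmrp%d tunnel backing a VIF by issuing SIOCDELTUNNEL
 * on the tunl0 control device, using the set_fs(KERNEL_DS) dance to
 * pass a kernel buffer through the user-space ioctl interface.
 */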
static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
{
	struct net *net = dev_net(dev);

	dev_close(dev);

	dev = __dev_get_by_name(net, "tunl0");
	if (dev) {
		const struct net_device_ops *ops = dev->netdev_ops;
		struct ifreq ifr;
		struct ip_tunnel_parm p;

		memset(&p, 0, sizeof(p));
		p.iph.daddr = v->vifc_rmt_addr.s_addr;
		p.iph.saddr = v->vifc_lcl_addr.s_addr;
		p.iph.version = 4;
		p.iph.ihl = 5;
		p.iph.protocol = IPPROTO_IPIP;
		sprintf(p.name, "dvmrp%d", v->vifc_vifi);
		ifr.ifr_ifru.ifru_data = (__force void __user *)&p;

		if (ops->ndo_do_ioctl) {
			mm_segment_t oldfs = get_fs();

			set_fs(KERNEL_DS);
			ops->ndo_do_ioctl(dev, &ifr, SIOCDELTUNNEL);
			set_fs(oldfs);
		}
	}
}

static
struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
{
	struct net_device  *dev;

	dev = __dev_get_by_name(net, "tunl0");

	if (dev) {
		const struct net_device_ops *ops = dev->netdev_ops;
		int err;
		struct ifreq ifr;
		struct ip_tunnel_parm p;
		struct in_device  *in_dev;

		memset(&p, 0, sizeof(p));
		p.iph.daddr = v->vifc_rmt_addr.s_addr;
		p.iph.saddr = v->vifc_lcl_addr.s_addr;
		p.iph.version = 4;
		p.iph.ihl = 5;
		p.iph.protocol = IPPROTO_IPIP;
		sprintf(p.name, "dvmrp%d", v->vifc_vifi);
		ifr.ifr_ifru.ifru_data = (__force void __user *)&p;

		if (ops->ndo_do_ioctl) {
			mm_segment_t oldfs = get_fs();

			set_fs(KERNEL_DS);
			err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
			set_fs(oldfs);
		} else
			err = -EOPNOTSUPP;

		dev = NULL;

		if (err == 0 &&
		    (dev = __dev_get_by_name(net, p.name)) != NULL) {
			dev->flags |= IFF_MULTICAST;

			in_dev = __in_dev_get_rtnl(dev);
			if (in_dev == NULL)
				goto failure;

			ipv4_devconf_setall(in_dev);
			IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;

			if (dev_open(dev))
				goto failure;
			dev_hold(dev);
		}
	}
	return dev;

failure:
	/* allow the register to be completed before unregistering. */
	rtnl_unlock();
	rtnl_lock();

	unregister_netdevice(dev);
	return NULL;
}

#ifdef CONFIG_IP_PIMSM

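/* Transmit handler for the PIM register device: packets sent here are
 * bounced whole to the user-space daemon as IGMPMSG_WHOLEPKT reports
 * and then freed; nothing is ever put on the wire.
 */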
static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct mr_table *mrt;
	struct flowi fl = {
		.oif		= dev->ifindex,
		.iif		= skb->skb_iif,
		.mark		= skb->mark,
	};
	int err;

	err = ipmr_fib_lookup(net, &fl, &mrt);
	if (err < 0) {
		kfree_skb(skb);
		return err;
	}

	read_lock(&mrt_lock);
	dev->stats.tx_bytes += skb->len;
	dev->stats.tx_packets++;
	ipmr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, IGMPMSG_WHOLEPKT);
	read_unlock(&mrt_lock);
	kfree_skb(skb);
	return NETDEV_TX_OK;
}

static const struct net_device_ops reg_vif_netdev_ops = {
	.ndo_start_xmit	= reg_vif_xmit,
};

static void reg_vif_setup(struct net_device *dev)
{
	dev->type		= ARPHRD_PIMREG;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 8;
	dev->flags		= IFF_NOARP;
	dev->netdev_ops		= &reg_vif_netdev_ops;
	dev->destructor		= free_netdev;
	dev->features		|= NETIF_F_NETNS_LOCAL;
}

static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
{
	struct net_device *dev;
	struct in_device *in_dev;
	char name[IFNAMSIZ];

	if (mrt->id == RT_TABLE_DEFAULT)
		sprintf(name, "pimreg");
	else
		sprintf(name, "pimreg%u", mrt->id);

	dev = alloc_netdev(0, name, reg_vif_setup);

	if (dev == NULL)
		return NULL;

	dev_net_set(dev, net);

	if (register_netdevice(dev)) {
		free_netdev(dev);
		return NULL;
	}
	dev->iflink = 0;

	rcu_read_lock();
	if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
		rcu_read_unlock();
		goto failure;
	}

	ipv4_devconf_setall(in_dev);
	IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
	rcu_read_unlock();

	if (dev_open(dev))
		goto failure;

	dev_hold(dev);

	return dev;

failure:
	/* allow the register to be completed before unregistering. */
	rtnl_unlock();
	rtnl_lock();

	unregister_netdevice(dev);
	return NULL;
}
#endif

/*
 *	Delete a VIF entry
 *	@notify: Set to 1 if the caller is a notifier_call
 */

static int vif_delete(struct mr_table *mrt, int vifi, int notify,
		      struct list_head *head)
{
	struct vif_device *v;
	struct net_device *dev;
	struct in_device *in_dev;

	if (vifi < 0 || vifi >= mrt->maxvif)
		return -EADDRNOTAVAIL;

	v = &mrt->vif_table[vifi];

	write_lock_bh(&mrt_lock);
	dev = v->dev;
	v->dev = NULL;

	if (!dev) {
		write_unlock_bh(&mrt_lock);
		return -EADDRNOTAVAIL;
	}

#ifdef CONFIG_IP_PIMSM
	if (vifi == mrt->mroute_reg_vif_num)
		mrt->mroute_reg_vif_num = -1;
#endif

	if (vifi + 1 == mrt->maxvif) {
		int tmp;
		for (tmp = vifi - 1; tmp >= 0; tmp--) {
			if (VIF_EXISTS(mrt, tmp))
				break;
		}
		mrt->maxvif = tmp + 1;
	}

	write_unlock_bh(&mrt_lock);

	dev_set_allmulti(dev, -1);

	if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
		IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
		ip_rt_multicast_event(in_dev);
	}

	if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER) && !notify)
		unregister_netdevice_queue(dev, head);

	dev_put(dev);
	return 0;
}

static inline void ipmr_cache_free(struct mfc_cache *c)
{
	kmem_cache_free(mrt_cachep, c);
}

/* Destroy an unresolved cache entry, killing queued skbs
   and reporting error to netlink readers.
 */

static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
{
	struct net *net = read_pnet(&mrt->net);
	struct sk_buff *skb;
	struct nlmsgerr *e;

	atomic_dec(&mrt->cache_resolve_queue_len);

	while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) {
		if (ip_hdr(skb)->version == 0) {
			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
			nlh->nlmsg_type = NLMSG_ERROR;
			nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
			skb_trim(skb, nlh->nlmsg_len);
			e = NLMSG_DATA(nlh);
			e->error = -ETIMEDOUT;
			memset(&e->msg, 0, sizeof(e->msg));

			rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
		} else
			kfree_skb(skb);
	}

	ipmr_cache_free(c);
}


/* Timer process for the unresolved queue. */

static void ipmr_expire_process(unsigned long arg)
{
	struct mr_table *mrt = (struct mr_table *)arg;
	unsigned long now;
	unsigned long expires;
	struct mfc_cache *c, *next;

	if (!spin_trylock(&mfc_unres_lock)) {
		mod_timer(&mrt->ipmr_expire_timer, jiffies + HZ/10);
		return;
	}

	if (list_empty(&mrt->mfc_unres_queue))
		goto out;

	now = jiffies;
	expires = 10*HZ;

	list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
		if (time_after(c->mfc_un.unres.expires, now)) {
			unsigned long interval = c->mfc_un.unres.expires - now;
			if (interval < expires)
				expires = interval;
			continue;
		}

		list_del(&c->list);
		ipmr_destroy_unres(mrt, c);
	}

	if (!list_empty(&mrt->mfc_unres_queue))
		mod_timer(&mrt->ipmr_expire_timer, jiffies + expires);

out:
	spin_unlock(&mfc_unres_lock);
}

/* Fill oifs list. It is called under write locked mrt_lock. */

static void ipmr_update_thresholds(struct mr_table *mrt, struct mfc_cache *cache,
				   unsigned char *ttls)
{
	int vifi;

	cache->mfc_un.res.minvif = MAXVIFS;
	cache->mfc_un.res.maxvif = 0;
	memset(cache->mfc_un.res.ttls, 255, MAXVIFS);

	for (vifi = 0; vifi < mrt->maxvif; vifi++) {
		if (VIF_EXISTS(mrt, vifi) &&
		    ttls[vifi] && ttls[vifi] < 255) {
			cache->mfc_un.res.ttls[vifi] = ttls[vifi];
			if (cache->mfc_un.res.minvif > vifi)
				cache->mfc_un.res.minvif = vifi;
			if (cache->mfc_un.res.maxvif <= vifi)
				cache->mfc_un.res.maxvif = vifi + 1;
		}
	}
}

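/* Install a new VIF.  Depending on vifc_flags, the underlying device
 * is the PIM register device (VIFF_REGISTER), a freshly created DVMRP
 * tunnel (VIFF_TUNNEL), or an existing device found by ifindex
 * (VIFF_USE_IFINDEX) or by local address.
 */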
static int vif_add(struct net *net, struct mr_table *mrt,
		   struct vifctl *vifc, int mrtsock)
{
	int vifi = vifc->vifc_vifi;
	struct vif_device *v = &mrt->vif_table[vifi];
	struct net_device *dev;
	struct in_device *in_dev;
	int err;

	/* Is vif busy ? */
	if (VIF_EXISTS(mrt, vifi))
		return -EADDRINUSE;

	switch (vifc->vifc_flags) {
#ifdef CONFIG_IP_PIMSM
	case VIFF_REGISTER:
		/*
		 * Special Purpose VIF in PIM
		 * All the packets will be sent to the daemon
		 */
		if (mrt->mroute_reg_vif_num >= 0)
			return -EADDRINUSE;
		dev = ipmr_reg_vif(net, mrt);
		if (!dev)
			return -ENOBUFS;
		err = dev_set_allmulti(dev, 1);
		if (err) {
			unregister_netdevice(dev);
			dev_put(dev);
			return err;
		}
		break;
#endif
	case VIFF_TUNNEL:
		dev = ipmr_new_tunnel(net, vifc);
		if (!dev)
			return -ENOBUFS;
		err = dev_set_allmulti(dev, 1);
		if (err) {
			ipmr_del_tunnel(dev, vifc);
			dev_put(dev);
			return err;
		}
		break;

	case VIFF_USE_IFINDEX:
	case 0:
		if (vifc->vifc_flags == VIFF_USE_IFINDEX) {
			dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex);
			if (dev && __in_dev_get_rtnl(dev) == NULL) {
				dev_put(dev);
				return -EADDRNOTAVAIL;
			}
		} else
			dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);

		if (!dev)
			return -EADDRNOTAVAIL;
		err = dev_set_allmulti(dev, 1);
		if (err) {
			dev_put(dev);
			return err;
		}
		break;
	default:
		return -EINVAL;
	}

	if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) {
		dev_put(dev);
		return -EADDRNOTAVAIL;
	}
	IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
	ip_rt_multicast_event(in_dev);

	/*
	 *	Fill in the VIF structures
	 */
	v->rate_limit = vifc->vifc_rate_limit;
	v->local = vifc->vifc_lcl_addr.s_addr;
	v->remote = vifc->vifc_rmt_addr.s_addr;
	v->flags = vifc->vifc_flags;
	if (!mrtsock)
		v->flags |= VIFF_STATIC;
	v->threshold = vifc->vifc_threshold;
	v->bytes_in = 0;
	v->bytes_out = 0;
	v->pkt_in = 0;
	v->pkt_out = 0;
	v->link = dev->ifindex;
	if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER))
		v->link = dev->iflink;

	/* And finish update writing critical data */
	write_lock_bh(&mrt_lock);
	v->dev = dev;
#ifdef CONFIG_IP_PIMSM
	if (v->flags & VIFF_REGISTER)
		mrt->mroute_reg_vif_num = vifi;
#endif
	if (vifi + 1 > mrt->maxvif)
		mrt->maxvif = vifi + 1;
	write_unlock_bh(&mrt_lock);
	return 0;
}

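/* Look up a resolved (origin, group) entry on its hash chain.  Per
 * the locking comment near the top of the file, the chains are only
 * modified under the write side of mrt_lock, so readers hold it for
 * reading.
 */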
static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
					 __be32 origin,
					 __be32 mcastgrp)
{
	int line = MFC_HASH(mcastgrp, origin);
	struct mfc_cache *c;

	list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {
		if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp)
			return c;
	}
	return NULL;
}

/*
 *	Allocate a multicast cache entry
 */
static struct mfc_cache *ipmr_cache_alloc(void)
{
	struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
	if (c == NULL)
		return NULL;
	c->mfc_un.res.minvif = MAXVIFS;
	return c;
}

static struct mfc_cache *ipmr_cache_alloc_unres(void)
{
	struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
	if (c == NULL)
		return NULL;
	skb_queue_head_init(&c->mfc_un.unres.unresolved);
	c->mfc_un.unres.expires = jiffies + 10*HZ;
	return c;
}

/*
 *	A cache entry has gone into a resolved state from queued
 */

static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
			       struct mfc_cache *uc, struct mfc_cache *c)
{
	struct sk_buff *skb;
	struct nlmsgerr *e;

	/*
	 *	Play the pending entries through our router
	 */

	while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
		if (ip_hdr(skb)->version == 0) {
			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));

			if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) {
				nlh->nlmsg_len = (skb_tail_pointer(skb) -
						  (u8 *)nlh);
			} else {
				nlh->nlmsg_type = NLMSG_ERROR;
				nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
				skb_trim(skb, nlh->nlmsg_len);
				e = NLMSG_DATA(nlh);
				e->error = -EMSGSIZE;
				memset(&e->msg, 0, sizeof(e->msg));
			}

			rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
		} else
			ip_mr_forward(net, mrt, skb, c, 0);
	}
}

/*
 *	Bounce a cache query up to mrouted. We could use netlink for this but mrouted
 *	expects the following bizarre scheme.
 *
 *	Called under mrt_lock.
 */

static int ipmr_cache_report(struct mr_table *mrt,
			     struct sk_buff *pkt, vifi_t vifi, int assert)
{
	struct sk_buff *skb;
	const int ihl = ip_hdrlen(pkt);
	struct igmphdr *igmp;
	struct igmpmsg *msg;
	int ret;

#ifdef CONFIG_IP_PIMSM
	if (assert == IGMPMSG_WHOLEPKT)
		skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
	else
#endif
		skb = alloc_skb(128, GFP_ATOMIC);

	if (!skb)
		return -ENOBUFS;

#ifdef CONFIG_IP_PIMSM
	if (assert == IGMPMSG_WHOLEPKT) {
		/* Ugly, but we have no choice with this interface.
		   Duplicate old header, fix ihl, length etc.
		   And all this only to mangle msg->im_msgtype and
		   to set msg->im_mbz to "mbz" :-)
		 */
		skb_push(skb, sizeof(struct iphdr));
		skb_reset_network_header(skb);
		skb_reset_transport_header(skb);
		msg = (struct igmpmsg *)skb_network_header(skb);
		memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
		msg->im_msgtype = IGMPMSG_WHOLEPKT;
		msg->im_mbz = 0;
		msg->im_vif = mrt->mroute_reg_vif_num;
		ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
		ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
					     sizeof(struct iphdr));
	} else
#endif
	{

	/*
	 *	Copy the IP header
	 */

	skb->network_header = skb->tail;
	skb_put(skb, ihl);
	skb_copy_to_linear_data(skb, pkt->data, ihl);
	ip_hdr(skb)->protocol = 0;			/* Flag to the kernel this is a route add */
	msg = (struct igmpmsg *)skb_network_header(skb);
	msg->im_vif = vifi;
	skb_dst_set(skb, dst_clone(skb_dst(pkt)));

	/*
	 *	Add our header
	 */

	igmp = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
	igmp->type	=
	msg->im_msgtype = assert;
	igmp->code	= 0;
	ip_hdr(skb)->tot_len = htons(skb->len);			/* Fix the length */
	skb->transport_header = skb->network_header;
	}

	if (mrt->mroute_sk == NULL) {
		kfree_skb(skb);
		return -EINVAL;
	}

	/*
	 *	Deliver to mrouted
	 */
	ret = sock_queue_rcv_skb(mrt->mroute_sk, skb);
	if (ret < 0) {
		if (net_ratelimit())
			printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
		kfree_skb(skb);
	}

	return ret;
}

/*
 *	Queue a packet for resolution, attaching it to the (locked)
 *	unresolved cache entry for its (S,G).
 */

static int
ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
{
	bool found = false;
	int err;
	struct mfc_cache *c;
	const struct iphdr *iph = ip_hdr(skb);

	spin_lock_bh(&mfc_unres_lock);
	list_for_each_entry(c, &mrt->mfc_unres_queue, list) {
		if (c->mfc_mcastgrp == iph->daddr &&
		    c->mfc_origin == iph->saddr) {
			found = true;
			break;
		}
	}

	if (!found) {
		/*
		 *	Create a new entry if allowable
		 */

		if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
		    (c = ipmr_cache_alloc_unres()) == NULL) {
			spin_unlock_bh(&mfc_unres_lock);

			kfree_skb(skb);
			return -ENOBUFS;
		}

		/*
		 *	Fill in the new cache entry
		 */
		c->mfc_parent	= -1;
		c->mfc_origin	= iph->saddr;
		c->mfc_mcastgrp	= iph->daddr;

		/*
		 *	Reflect first query at mrouted.
		 */
		err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE);
		if (err < 0) {
			/* If the report failed throw the cache entry
			   out - Brad Parker
			 */
			spin_unlock_bh(&mfc_unres_lock);

			ipmr_cache_free(c);
			kfree_skb(skb);
			return err;
		}

		atomic_inc(&mrt->cache_resolve_queue_len);
		list_add(&c->list, &mrt->mfc_unres_queue);

		if (atomic_read(&mrt->cache_resolve_queue_len) == 1)
			mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires);
	}

	/*
	 *	See if we can append the packet
	 */
	if (c->mfc_un.unres.unresolved.qlen > 3) {
		kfree_skb(skb);
		err = -ENOBUFS;
	} else {
		skb_queue_tail(&c->mfc_un.unres.unresolved, skb);
		err = 0;
	}

	spin_unlock_bh(&mfc_unres_lock);
	return err;
}


/*
 *	MFC cache manipulation by user space mroute daemon
 */

static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc)
{
	int line;
	struct mfc_cache *c, *next;

	line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);

	list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) {
		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
			write_lock_bh(&mrt_lock);
			list_del(&c->list);
			write_unlock_bh(&mrt_lock);

			ipmr_cache_free(c);
			return 0;
		}
	}
	return -ENOENT;
}

static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
			struct mfcctl *mfc, int mrtsock)
{
	bool found = false;
	int line;
	struct mfc_cache *uc, *c;

	if (mfc->mfcc_parent >= MAXVIFS)
		return -ENFILE;

	line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);

	list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {
		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
			found = true;
			break;
		}
	}

	if (found) {
		write_lock_bh(&mrt_lock);
		c->mfc_parent = mfc->mfcc_parent;
		ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
		if (!mrtsock)
			c->mfc_flags |= MFC_STATIC;
		write_unlock_bh(&mrt_lock);
		return 0;
	}

	if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
		return -EINVAL;

	c = ipmr_cache_alloc();
	if (c == NULL)
		return -ENOMEM;

	c->mfc_origin = mfc->mfcc_origin.s_addr;
	c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr;
	c->mfc_parent = mfc->mfcc_parent;
	ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
	if (!mrtsock)
		c->mfc_flags |= MFC_STATIC;

	write_lock_bh(&mrt_lock);
	list_add(&c->list, &mrt->mfc_cache_array[line]);
	write_unlock_bh(&mrt_lock);

	/*
	 *	Check to see if we resolved a queued list. If so we
	 *	need to send on the frames and tidy up.
	 */
	found = false;
	spin_lock_bh(&mfc_unres_lock);
	list_for_each_entry(uc, &mrt->mfc_unres_queue, list) {
		if (uc->mfc_origin == c->mfc_origin &&
		    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
			list_del(&uc->list);
			atomic_dec(&mrt->cache_resolve_queue_len);
			found = true;
			break;
		}
	}
	if (list_empty(&mrt->mfc_unres_queue))
		del_timer(&mrt->ipmr_expire_timer);
	spin_unlock_bh(&mfc_unres_lock);

	if (found) {
		ipmr_cache_resolve(net, mrt, uc, c);
		ipmr_cache_free(uc);
	}
	return 0;
}


/*
 *	Close the multicast socket, and clear the vif tables etc
 */

static void mroute_clean_tables(struct mr_table *mrt)
{
	int i;
	LIST_HEAD(list);
	struct mfc_cache *c, *next;

	/*
	 *	Shut down all active vif entries
	 */
	for (i = 0; i < mrt->maxvif; i++) {
		if (!(mrt->vif_table[i].flags & VIFF_STATIC))
			vif_delete(mrt, i, 0, &list);
	}
	unregister_netdevice_many(&list);

	/*
	 *	Wipe the cache
	 */
	for (i = 0; i < MFC_LINES; i++) {
		list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) {
			if (c->mfc_flags & MFC_STATIC)
				continue;
			write_lock_bh(&mrt_lock);
			list_del(&c->list);
			write_unlock_bh(&mrt_lock);

			ipmr_cache_free(c);
		}
	}

	if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
		spin_lock_bh(&mfc_unres_lock);
		list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
			list_del(&c->list);
			ipmr_destroy_unres(mrt, c);
		}
		spin_unlock_bh(&mfc_unres_lock);
	}
}

static void mrtsock_destruct(struct sock *sk)
{
	struct net *net = sock_net(sk);
	struct mr_table *mrt;

	rtnl_lock();
	ipmr_for_each_table(mrt, net) {
		if (sk == mrt->mroute_sk) {
			IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;

			write_lock_bh(&mrt_lock);
			mrt->mroute_sk = NULL;
			write_unlock_bh(&mrt_lock);

			mroute_clean_tables(mrt);
		}
	}
	rtnl_unlock();
}

/*
 *	Socket options and virtual interface manipulation. The whole
 *	virtual interface system is a complete heap, but unfortunately
 *	that's how BSD mrouted happens to think. Maybe one day with a proper
 *	MOSPF/PIM router set up we can clean this up.
 */


int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen)
{
	int ret;
	struct vifctl vif;
	struct mfcctl mfc;
	struct net *net = sock_net(sk);
	struct mr_table *mrt;

	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
	if (mrt == NULL)
		return -ENOENT;

	if (optname != MRT_INIT) {
		if (sk != mrt->mroute_sk && !capable(CAP_NET_ADMIN))
			return -EACCES;
	}

	switch (optname) {
	case MRT_INIT:
		if (sk->sk_type != SOCK_RAW ||
		    inet_sk(sk)->inet_num != IPPROTO_IGMP)
			return -EOPNOTSUPP;
		if (optlen != sizeof(int))
			return -ENOPROTOOPT;

		rtnl_lock();
		if (mrt->mroute_sk) {
			rtnl_unlock();
			return -EADDRINUSE;
		}

		ret = ip_ra_control(sk, 1, mrtsock_destruct);
		if (ret == 0) {
			write_lock_bh(&mrt_lock);
			mrt->mroute_sk = sk;
			write_unlock_bh(&mrt_lock);

			IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
		}
		rtnl_unlock();
		return ret;
	case MRT_DONE:
		if (sk != mrt->mroute_sk)
			return -EACCES;
		return ip_ra_control(sk, 0, NULL);
	case MRT_ADD_VIF:
	case MRT_DEL_VIF:
		if (optlen != sizeof(vif))
			return -EINVAL;
		if (copy_from_user(&vif, optval, sizeof(vif)))
			return -EFAULT;
		if (vif.vifc_vifi >= MAXVIFS)
			return -ENFILE;
		rtnl_lock();
		if (optname == MRT_ADD_VIF) {
			ret = vif_add(net, mrt, &vif, sk == mrt->mroute_sk);
		} else {
			ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL);
		}
		rtnl_unlock();
		return ret;

		/*
		 *	Manipulate the forwarding caches. These live
		 *	in a sort of kernel/user symbiosis.
		 */
	case MRT_ADD_MFC:
	case MRT_DEL_MFC:
		if (optlen != sizeof(mfc))
			return -EINVAL;
		if (copy_from_user(&mfc, optval, sizeof(mfc)))
			return -EFAULT;
		rtnl_lock();
		if (optname == MRT_DEL_MFC)
			ret = ipmr_mfc_delete(mrt, &mfc);
		else
			ret = ipmr_mfc_add(net, mrt, &mfc, sk == mrt->mroute_sk);
		rtnl_unlock();
		return ret;
		/*
		 *	Control PIM assert.
		 */
	case MRT_ASSERT:
	{
		int v;
		if (get_user(v, (int __user *)optval))
			return -EFAULT;
		mrt->mroute_do_assert = (v) ? 1 : 0;
		return 0;
	}
#ifdef CONFIG_IP_PIMSM
	case MRT_PIM:
	{
		int v;

		if (get_user(v, (int __user *)optval))
			return -EFAULT;
		v = (v) ? 1 : 0;

		rtnl_lock();
		ret = 0;
		if (v != mrt->mroute_do_pim) {
			mrt->mroute_do_pim = v;
			mrt->mroute_do_assert = v;
		}
		rtnl_unlock();
		return ret;
	}
#endif
#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
	case MRT_TABLE:
	{
		u32 v;

		if (optlen != sizeof(u32))
			return -EINVAL;
		if (get_user(v, (u32 __user *)optval))
			return -EFAULT;
		if (sk == mrt->mroute_sk)
			return -EBUSY;

		rtnl_lock();
		ret = 0;
		if (!ipmr_new_table(net, v))
			ret = -ENOMEM;
		raw_sk(sk)->ipmr_table = v;
		rtnl_unlock();
		return ret;
	}
#endif
	/*
	 *	Spurious command, or MRT_VERSION which you cannot
	 *	set.
	 */
	default:
		return -ENOPROTOOPT;
	}
}


/*
 *	Getsockopt support for the multicast routing system.
 */

int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen)
{
	int olr;
	int val;
	struct net *net = sock_net(sk);
	struct mr_table *mrt;

	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
	if (mrt == NULL)
		return -ENOENT;

	if (optname != MRT_VERSION &&
#ifdef CONFIG_IP_PIMSM
	    optname != MRT_PIM &&
#endif
	    optname != MRT_ASSERT)
		return -ENOPROTOOPT;

	if (get_user(olr, optlen))
		return -EFAULT;

	olr = min_t(unsigned int, olr, sizeof(int));
	if (olr < 0)
		return -EINVAL;

	if (put_user(olr, optlen))
		return -EFAULT;
	if (optname == MRT_VERSION)
		val = 0x0305;
#ifdef CONFIG_IP_PIMSM
	else if (optname == MRT_PIM)
		val = mrt->mroute_do_pim;
#endif
	else
		val = mrt->mroute_do_assert;
	if (copy_to_user(optval, &val, olr))
		return -EFAULT;
	return 0;
}


/*
 *	The IP multicast ioctl support routines.
 */

int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
{
	struct sioc_sg_req sr;
	struct sioc_vif_req vr;
	struct vif_device *vif;
	struct mfc_cache *c;
	struct net *net = sock_net(sk);
	struct mr_table *mrt;

	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
	if (mrt == NULL)
		return -ENOENT;

	switch (cmd) {
	case SIOCGETVIFCNT:
		if (copy_from_user(&vr, arg, sizeof(vr)))
			return -EFAULT;
		if (vr.vifi >= mrt->maxvif)
			return -EINVAL;
		read_lock(&mrt_lock);
		vif = &mrt->vif_table[vr.vifi];
		if (VIF_EXISTS(mrt, vr.vifi)) {
			vr.icount = vif->pkt_in;
			vr.ocount = vif->pkt_out;
			vr.ibytes = vif->bytes_in;
			vr.obytes = vif->bytes_out;
			read_unlock(&mrt_lock);

			if (copy_to_user(arg, &vr, sizeof(vr)))
				return -EFAULT;
			return 0;
		}
		read_unlock(&mrt_lock);
		return -EADDRNOTAVAIL;
	case SIOCGETSGCNT:
		if (copy_from_user(&sr, arg, sizeof(sr)))
			return -EFAULT;

		read_lock(&mrt_lock);
		c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
		if (c) {
			sr.pktcnt = c->mfc_un.res.pkt;
			sr.bytecnt = c->mfc_un.res.bytes;
			sr.wrong_if = c->mfc_un.res.wrong_if;
			read_unlock(&mrt_lock);

			if (copy_to_user(arg, &sr, sizeof(sr)))
				return -EFAULT;
			return 0;
		}
		read_unlock(&mrt_lock);
		return -EADDRNOTAVAIL;
	default:
		return -ENOIOCTLCMD;
	}
}

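/* netdevice notifier: on NETDEV_UNREGISTER, delete every VIF in every
 * table that references the disappearing device.
 */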
static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;
	struct net *net = dev_net(dev);
	struct mr_table *mrt;
	struct vif_device *v;
	int ct;
	LIST_HEAD(list);

	if (event != NETDEV_UNREGISTER)
		return NOTIFY_DONE;

	ipmr_for_each_table(mrt, net) {
		v = &mrt->vif_table[0];
		for (ct = 0; ct < mrt->maxvif; ct++, v++) {
			if (v->dev == dev)
				vif_delete(mrt, ct, 1, &list);
		}
	}
	unregister_netdevice_many(&list);
	return NOTIFY_DONE;
}


static struct notifier_block ip_mr_notifier = {
	.notifier_call = ipmr_device_event,
};

/*
 *	Encapsulate a packet by attaching a valid IPIP header to it.
 *	This avoids tunnel drivers and other mess and gives us the speed so
 *	important for multicast video.
 */

static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct iphdr *iph;
	struct iphdr *old_iph = ip_hdr(skb);

	skb_push(skb, sizeof(struct iphdr));
	skb->transport_header = skb->network_header;
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);

	iph->version	=	4;
	iph->tos	=	old_iph->tos;
	iph->ttl	=	old_iph->ttl;
	iph->frag_off	=	0;
	iph->daddr	=	daddr;
	iph->saddr	=	saddr;
	iph->protocol	=	IPPROTO_IPIP;
	iph->ihl	=	5;
	iph->tot_len	=	htons(skb->len);
	ip_select_ident(iph, skb_dst(skb), NULL);
	ip_send_check(iph);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	nf_reset(skb);
}

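/* Final output step for a forwarded packet: count it as forwarded,
 * process any IP options, then hand the skb to dst_output().
 */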
static inline int ipmr_forward_finish(struct sk_buff *skb)
{
	struct ip_options *opt = &(IPCB(skb)->opt);

	IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);

	if (unlikely(opt->optlen))
		ip_forward_options(skb);

	return dst_output(skb);
}

/*
 *	Processing handlers for ipmr_forward
 */

static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
			    struct sk_buff *skb, struct mfc_cache *c, int vifi)
{
	const struct iphdr *iph = ip_hdr(skb);
	struct vif_device *vif = &mrt->vif_table[vifi];
	struct net_device *dev;
	struct rtable *rt;
	int    encap = 0;

	if (vif->dev == NULL)
		goto out_free;

#ifdef CONFIG_IP_PIMSM
	if (vif->flags & VIFF_REGISTER) {
		vif->pkt_out++;
		vif->bytes_out += skb->len;
		vif->dev->stats.tx_bytes += skb->len;
		vif->dev->stats.tx_packets++;
		ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT);
		goto out_free;
	}
#endif

	if (vif->flags & VIFF_TUNNEL) {
		struct flowi fl = { .oif = vif->link,
				    .nl_u = { .ip4_u =
					      { .daddr = vif->remote,
						.saddr = vif->local,
						.tos = RT_TOS(iph->tos) } },
				    .proto = IPPROTO_IPIP };
		if (ip_route_output_key(net, &rt, &fl))
			goto out_free;
		encap = sizeof(struct iphdr);
	} else {
		struct flowi fl = { .oif = vif->link,
				    .nl_u = { .ip4_u =
					      { .daddr = iph->daddr,
						.tos = RT_TOS(iph->tos) } },
				    .proto = IPPROTO_IPIP };
		if (ip_route_output_key(net, &rt, &fl))
			goto out_free;
	}

	dev = rt->dst.dev;

	if (skb->len + encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) {
		/* Do not fragment multicasts.  Alas, IPv4 does not
		   allow us to send ICMP here, so such packets simply
		   disappear into a black hole.
		 */

		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		ip_rt_put(rt);
		goto out_free;
	}

	encap += LL_RESERVED_SPACE(dev) + rt->dst.header_len;

	if (skb_cow(skb, encap)) {
		ip_rt_put(rt);
		goto out_free;
	}

	vif->pkt_out++;
	vif->bytes_out += skb->len;

	skb_dst_drop(skb);
	skb_dst_set(skb, &rt->dst);
	ip_decrease_ttl(ip_hdr(skb));

	/* FIXME: forward and output firewalls used to be called here.
	 * What do we do with netfilter? -- RR */
	if (vif->flags & VIFF_TUNNEL) {
		ip_encap(skb, vif->local, vif->remote);
		/* FIXME: extra output firewall step used to be here. --RR */
		vif->dev->stats.tx_packets++;
		vif->dev->stats.tx_bytes += skb->len;
	}

	IPCB(skb)->flags |= IPSKB_FORWARDED;

	/*
	 * RFC 1584 teaches that a DVMRP/PIM router must deliver packets
	 * locally not only before forwarding, but also after forwarding
	 * on all output interfaces.  Clearly, if the mrouter runs a
	 * multicast program, that program should receive packets no
	 * matter which interface it joined on; otherwise it would have
	 * to join on all interfaces.  A multihomed host (or router, but
	 * not an mrouter), on the other hand, must not join on more than
	 * one interface, or it will receive duplicate packets.
	 */
1607         NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev, dev,
1608                 ipmr_forward_finish);
1609         return;
1610
1611 out_free:
1612         kfree_skb(skb);
1613 }
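/*
 * For context (an illustrative sketch, not part of the original file):
 * the vifs transmitted on above are created from userspace over the
 * mroute control socket, roughly as below. Field names are from
 * <linux/mroute.h>; "local_ip" is a placeholder:
 *
 *      int s = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
 *      int one = 1;
 *      struct vifctl vc = { 0 };
 *
 *      setsockopt(s, IPPROTO_IP, MRT_INIT, &one, sizeof(one));
 *      vc.vifc_vifi = 0;                    // slot in vif_table[]
 *      vc.vifc_flags = 0;                   // or VIFF_TUNNEL/VIFF_REGISTER
 *      vc.vifc_threshold = 1;               // TTL threshold for this vif
 *      vc.vifc_lcl_addr.s_addr = local_ip;  // local interface address
 *      setsockopt(s, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc));
 */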
1614
1615 static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev)
1616 {
1617         int ct;
1618
1619         for (ct = mrt->maxvif-1; ct >= 0; ct--) {
1620                 if (mrt->vif_table[ct].dev == dev)
1621                         break;
1622         }
1623         return ct;
1624 }
1625
1626 /* "local" means that we should preserve one skb (for local delivery) */
1627
1628 static int ip_mr_forward(struct net *net, struct mr_table *mrt,
1629                          struct sk_buff *skb, struct mfc_cache *cache,
1630                          int local)
1631 {
1632         int psend = -1;
1633         int vif, ct;
1634
1635         vif = cache->mfc_parent;
1636         cache->mfc_un.res.pkt++;
1637         cache->mfc_un.res.bytes += skb->len;
1638
1639         /*
1640          * Wrong interface: drop packet and (maybe) send PIM assert.
1641          */
1642         if (mrt->vif_table[vif].dev != skb->dev) {
1643                 int true_vifi;
1644
1645                 if (skb_rtable(skb)->fl.iif == 0) {
1646                         /* It is our own packet, looped back.
1647                            Very complicated situation...
1648
1649                            The best workaround, until the routing daemons
1650                            are fixed, is not to redistribute a packet that
1651                            was sent through the wrong interface. This means
1652                            multicast applications WILL NOT work for an
1653                            (S,G) whose default multicast route points to
1654                            the wrong oif. In any case, it is not a good
1655                            idea to run multicast applications on a router.
1656                          */
1657                         goto dont_forward;
1658                 }
1659
1660                 cache->mfc_un.res.wrong_if++;
1661                 true_vifi = ipmr_find_vif(mrt, skb->dev);
1662
1663                 if (true_vifi >= 0 && mrt->mroute_do_assert &&
1664                     /* PIM-SM uses asserts when switching from RPT to SPT,
1665                        so we cannot insist that the packet arrived on an oif.
1666                        It is bad, but otherwise we would need to move a
1667                        pretty large chunk of pimd into the kernel. Ough... --ANK
1668                      */
1669                     (mrt->mroute_do_pim ||
1670                      cache->mfc_un.res.ttls[true_vifi] < 255) &&
1671                     time_after(jiffies,
1672                                cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
1673                         cache->mfc_un.res.last_assert = jiffies;
1674                         ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF);
1675                 }
1676                 goto dont_forward;
1677         }
1678
1679         mrt->vif_table[vif].pkt_in++;
1680         mrt->vif_table[vif].bytes_in += skb->len;
1681
1682         /*
1683          *      Forward the frame
1684          */
1685         for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
1686                 if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
1687                         if (psend != -1) {
1688                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1689                                 if (skb2)
1690                                         ipmr_queue_xmit(net, mrt, skb2, cache,
1691                                                         psend);
1692                         }
1693                         psend = ct;
1694                 }
1695         }
1696         if (psend != -1) {
1697                 if (local) {
1698                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1699                         if (skb2)
1700                                 ipmr_queue_xmit(net, mrt, skb2, cache, psend);
1701                 } else {
1702                         ipmr_queue_xmit(net, mrt, skb, cache, psend);
1703                         return 0;
1704                 }
1705         }
1706
1707 dont_forward:
1708         if (!local)
1709                 kfree_skb(skb);
1710         return 0;
1711 }
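/*
 * A minimal sketch (not part of the original file): the ttls[] array
 * consulted by the forwarding loop above is installed via MRT_ADD_MFC.
 * 255 means "not an output vif"; a smaller value is the TTL a packet
 * must exceed to be forwarded there. "s" is the mroute socket from the
 * earlier sketch; "src_ip" and "grp_ip" are placeholders:
 *
 *      struct mfcctl mc = { 0 };
 *
 *      mc.mfcc_origin.s_addr   = src_ip;   // S of the (S,G) entry
 *      mc.mfcc_mcastgrp.s_addr = grp_ip;   // G of the (S,G) entry
 *      mc.mfcc_parent = 0;                 // expected input vif
 *      memset(mc.mfcc_ttls, 255, sizeof(mc.mfcc_ttls));
 *      mc.mfcc_ttls[1] = 1;                // forward to vif 1 if ttl > 1
 *      setsockopt(s, IPPROTO_IP, MRT_ADD_MFC, &mc, sizeof(mc));
 */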
1712
1713
1714 /*
1715  *      Multicast packets for forwarding arrive here
1716  */
1717
1718 int ip_mr_input(struct sk_buff *skb)
1719 {
1720         struct mfc_cache *cache;
1721         struct net *net = dev_net(skb->dev);
1722         int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
1723         struct mr_table *mrt;
1724         int err;
1725
1726         /* A packet that is looped back after forwarding must not be
1727            forwarded a second time, but it can still be delivered locally.
1728          */
1729         if (IPCB(skb)->flags & IPSKB_FORWARDED)
1730                 goto dont_forward;
1731
1732         err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt);
1733         if (err < 0) {
1734                 kfree_skb(skb);
1735                 return err;
1736         }
1737
1738         if (!local) {
1739                 if (IPCB(skb)->opt.router_alert) {
1740                         if (ip_call_ra_chain(skb))
1741                                 return 0;
1742                 } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) {
1743                         /* IGMPv1 (and broken IGMPv2 implementations such as
1744                            Cisco IOS <= 11.2(8)) do not put the router alert
1745                            option into IGMP packets destined to routable
1746                            groups. This is very bad, because it means
1747                            we cannot forward ANY such IGMP messages.
1748                          */
1749                         read_lock(&mrt_lock);
1750                         if (mrt->mroute_sk) {
1751                                 nf_reset(skb);
1752                                 raw_rcv(mrt->mroute_sk, skb);
1753                                 read_unlock(&mrt_lock);
1754                                 return 0;
1755                         }
1756                         read_unlock(&mrt_lock);
1757                 }
1758         }
1759
1760         read_lock(&mrt_lock);
1761         cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
1762
1763         /*
1764          *      No usable cache entry
1765          */
1766         if (cache == NULL) {
1767                 int vif;
1768
1769                 if (local) {
1770                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1771                         ip_local_deliver(skb);
1772                         if (skb2 == NULL) {
1773                                 read_unlock(&mrt_lock);
1774                                 return -ENOBUFS;
1775                         }
1776                         skb = skb2;
1777                 }
1778
1779                 vif = ipmr_find_vif(mrt, skb->dev);
1780                 if (vif >= 0) {
1781                         int err2 = ipmr_cache_unresolved(mrt, vif, skb);
1782                         read_unlock(&mrt_lock);
1783
1784                         return err2;
1785                 }
1786                 read_unlock(&mrt_lock);
1787                 kfree_skb(skb);
1788                 return -ENODEV;
1789         }
1790
1791         ip_mr_forward(net, mrt, skb, cache, local);
1792
1793         read_unlock(&mrt_lock);
1794
1795         if (local)
1796                 return ip_local_deliver(skb);
1797
1798         return 0;
1799
1800 dont_forward:
1801         if (local)
1802                 return ip_local_deliver(skb);
1803         kfree_skb(skb);
1804         return 0;
1805 }
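/*
 * For context (an illustrative sketch, not part of the original file):
 * the unresolved path above ends in an IGMPMSG_NOCACHE upcall via
 * ipmr_cache_unresolved(). The daemon reads it from the mroute socket
 * ("s" from the earlier sketch) as a struct igmpmsg overlaid on the
 * IP header:
 *
 *      struct igmpmsg m;
 *      ssize_t n = read(s, &m, sizeof(m));
 *
 *      if (n >= (ssize_t)sizeof(m) && m.im_mbz == 0) {
 *              // im_mbz overlays the IP protocol byte and is zeroed by
 *              // the kernel; that is how upcalls are told apart from
 *              // ordinary IGMP packets on the same socket.
 *              if (m.im_msgtype == IGMPMSG_NOCACHE)
 *                      resolve_and_add_mfc(m.im_src, m.im_dst);  // placeholder
 *      }
 */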
1806
1807 #ifdef CONFIG_IP_PIMSM
1808 /* called with rcu_read_lock() */
1809 static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
1810                      unsigned int pimlen)
1811 {
1812         struct net_device *reg_dev = NULL;
1813         struct iphdr *encap;
1814
1815         encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
1816         /*
1817            Check that:
1818            a. the packet is really destined to a multicast group
1819            b. the packet is not a NULL-REGISTER
1820            c. the packet is not truncated
1821          */
1822         if (!ipv4_is_multicast(encap->daddr) ||
1823             encap->tot_len == 0 ||
1824             ntohs(encap->tot_len) + pimlen > skb->len)
1825                 return 1;
1826
1827         read_lock(&mrt_lock);
1828         if (mrt->mroute_reg_vif_num >= 0)
1829                 reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev;
1830         read_unlock(&mrt_lock);
1831
1832         if (reg_dev == NULL)
1833                 return 1;
1834
1835         skb->mac_header = skb->network_header;
1836         skb_pull(skb, (u8 *)encap - skb->data);
1837         skb_reset_network_header(skb);
1838         skb->protocol = htons(ETH_P_IP);
1839         skb->ip_summed = CHECKSUM_NONE;
1840         skb->pkt_type = PACKET_HOST;
1841
1842         skb_tunnel_rx(skb, reg_dev);
1843
1844         netif_rx(skb);
1845
1846         return NET_RX_SUCCESS;
1847 }
1848 #endif
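/*
 * For reference, the register packet __pim_rcv() just unwrapped looks
 * like this on the wire (sketch):
 *
 *      +------------------------------+
 *      | outer IPv4, proto PIM        |
 *      +------------------------------+
 *      | PIM header ("pimlen" bytes)  |   <- transport header
 *      +------------------------------+
 *      | inner multicast IPv4 packet  |   <- "encap"
 *      +------------------------------+
 *
 * After the sanity checks only the inner packet survives; it is
 * re-injected on the register vif via netif_rx(), as if it had been
 * received on that device.
 */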
1849
1850 #ifdef CONFIG_IP_PIMSM_V1
1851 /*
1852  * Handle IGMP messages of PIMv1
1853  */
1854
1855 int pim_rcv_v1(struct sk_buff *skb)
1856 {
1857         struct igmphdr *pim;
1858         struct net *net = dev_net(skb->dev);
1859         struct mr_table *mrt;
1860
1861         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
1862                 goto drop;
1863
1864         pim = igmp_hdr(skb);
1865
1866         if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0)
1867                 goto drop;
1868
1869         if (!mrt->mroute_do_pim ||
1870             pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1871                 goto drop;
1872
1873         if (__pim_rcv(mrt, skb, sizeof(*pim))) {
1874 drop:
1875                 kfree_skb(skb);
1876         }
1877         return 0;
1878 }
1879 #endif
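/*
 * An illustrative sketch (not part of the original file): mroute_do_pim,
 * tested above, is switched on by the daemon on the mroute socket "s"
 * from the earlier sketch:
 *
 *      int v = 1;
 *
 *      setsockopt(s, IPPROTO_IP, MRT_PIM, &v, sizeof(v));
 *
 * Until then, pim_rcv_v1() drops every PIMv1 REGISTER it sees.
 */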
1880
1881 #ifdef CONFIG_IP_PIMSM_V2
1882 static int pim_rcv(struct sk_buff *skb)
1883 {
1884         struct pimreghdr *pim;
1885         struct net *net = dev_net(skb->dev);
1886         struct mr_table *mrt;
1887
1888         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
1889                 goto drop;
1890
1891         pim = (struct pimreghdr *)skb_transport_header(skb);
1892         if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
1893             (pim->flags & PIM_NULL_REGISTER) ||
1894             (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
1895              csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1896                 goto drop;
1897
1898         if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0)
1899                 goto drop;
1900
1901         if (__pim_rcv(mrt, skb, sizeof(*pim))) {
1902 drop:
1903                 kfree_skb(skb);
1904         }
1905         return 0;
1906 }
1907 #endif
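/*
 * The checksum test above accepts a checksum over the PIM header alone
 * or, failing that, over the whole packet. A sender producing the
 * header-only form could compute it with a plain Internet checksum
 * (sketch, not part of the original file):
 *
 *      static unsigned short inet_csum(const void *buf, int len)
 *      {
 *              const unsigned short *p = buf;
 *              unsigned long sum = 0;
 *
 *              while (len > 1) { sum += *p++; len -= 2; }
 *              if (len) sum += *(const unsigned char *)p;
 *              while (sum >> 16) sum = (sum & 0xffff) + (sum >> 16);
 *              return ~sum;
 *      }
 *
 *      pim->csum = 0;
 *      pim->csum = inet_csum(pim, sizeof(*pim));
 */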
1908
1909 static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
1910                               struct mfc_cache *c, struct rtmsg *rtm)
1911 {
1912         int ct;
1913         struct rtnexthop *nhp;
1914         u8 *b = skb_tail_pointer(skb);
1915         struct rtattr *mp_head;
1916
1917         /* If cache is unresolved, don't try to parse IIF and OIF */
1918         if (c->mfc_parent >= MAXVIFS)
1919                 return -ENOENT;
1920
1921         if (VIF_EXISTS(mrt, c->mfc_parent))
1922                 RTA_PUT(skb, RTA_IIF, 4, &mrt->vif_table[c->mfc_parent].dev->ifindex);
1923
1924         mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0));
1925
1926         for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
1927                 if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
1928                         if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
1929                                 goto rtattr_failure;
1930                         nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
1931                         nhp->rtnh_flags = 0;
1932                         nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
1933                         nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex;
1934                         nhp->rtnh_len = sizeof(*nhp);
1935                 }
1936         }
1937         mp_head->rta_type = RTA_MULTIPATH;
1938         mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
1939         rtm->rtm_type = RTN_MULTICAST;
1940         return 1;
1941
1942 rtattr_failure:
1943         nlmsg_trim(skb, b);
1944         return -EMSGSIZE;
1945 }
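/*
 * For context (an illustrative sketch, not part of the original file):
 * a dump consumer walks the RTA_MULTIPATH attribute built above as an
 * array of struct rtnexthop, one per output vif, with rtnh_hops carrying
 * the TTL threshold. Uses the <linux/rtnetlink.h> macros; "mp" is a
 * placeholder for the attribute:
 *
 *      struct rtnexthop *nh = RTA_DATA(mp);
 *      int len = RTA_PAYLOAD(mp);
 *
 *      while (RTNH_OK(nh, len)) {
 *              printf("oif=%d ttl-threshold=%d\n",
 *                     nh->rtnh_ifindex, nh->rtnh_hops);
 *              len -= RTNH_ALIGN(nh->rtnh_len);
 *              nh = RTNH_NEXT(nh);
 *      }
 */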
1946
1947 int ipmr_get_route(struct net *net,
1948                    struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1949 {
1950         int err;
1951         struct mr_table *mrt;
1952         struct mfc_cache *cache;
1953         struct rtable *rt = skb_rtable(skb);
1954
1955         mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
1956         if (mrt == NULL)
1957                 return -ENOENT;
1958
1959         read_lock(&mrt_lock);
1960         cache = ipmr_cache_find(mrt, rt->rt_src, rt->rt_dst);
1961
1962         if (cache == NULL) {
1963                 struct sk_buff *skb2;
1964                 struct iphdr *iph;
1965                 struct net_device *dev;
1966                 int vif;
1967
1968                 if (nowait) {
1969                         read_unlock(&mrt_lock);
1970                         return -EAGAIN;
1971                 }
1972
1973                 dev = skb->dev;
1974                 if (dev == NULL || (vif = ipmr_find_vif(mrt, dev)) < 0) {
1975                         read_unlock(&mrt_lock);
1976                         return -ENODEV;
1977                 }
1978                 skb2 = skb_clone(skb, GFP_ATOMIC);
1979                 if (!skb2) {
1980                         read_unlock(&mrt_lock);
1981                         return -ENOMEM;
1982                 }
1983
1984                 skb_push(skb2, sizeof(struct iphdr));
1985                 skb_reset_network_header(skb2);
1986                 iph = ip_hdr(skb2);
1987                 iph->ihl = sizeof(struct iphdr) >> 2;
1988                 iph->saddr = rt->rt_src;
1989                 iph->daddr = rt->rt_dst;
1990                 iph->version = 0;
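                /* skb2 now carries a minimal hand-built IPv4 header; only
                 * ihl, saddr and daddr matter for the upcall. Setting
                 * version to 0 marks the header as synthetic, so the
                 * resolve path can tell this netlink-generated request
                 * from a real queued packet and reply to it instead of
                 * forwarding it.
                 */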
1991                 err = ipmr_cache_unresolved(mrt, vif, skb2);
1992                 read_unlock(&mrt_lock);
1993                 return err;
1994         }
1995
1996         if (!nowait && (rtm->rtm_flags & RTM_F_NOTIFY))
1997                 cache->mfc_flags |= MFC_NOTIFY;
1998         err = __ipmr_fill_mroute(mrt, skb, cache, rtm);
1999         read_unlock(&mrt_lock);
2000         return err;
2001 }
2002
2003 static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
2004                             u32 pid, u32 seq, struct mfc_cache *c)
2005 {
2006         struct nlmsghdr *nlh;
2007         struct rtmsg *rtm;
2008
2009         nlh = nlmsg_put(skb, pid, seq, RTM_NEWROUTE, sizeof(*rtm), NLM_F_MULTI);
2010         if (nlh == NULL)
2011                 return -EMSGSIZE;
2012
2013         rtm = nlmsg_data(nlh);
2014         rtm->rtm_family   = RTNL_FAMILY_IPMR;
2015         rtm->rtm_dst_len  = 32;
2016         rtm->rtm_src_len  = 32;
2017         rtm->rtm_tos      = 0;
2018         rtm->rtm_table    = mrt->id;
2019         NLA_PUT_U32(skb, RTA_TABLE, mrt->id);
2020         rtm->rtm_type     = RTN_MULTICAST;
2021         rtm->rtm_scope    = RT_SCOPE_UNIVERSE;
2022         rtm->rtm_protocol = RTPROT_UNSPEC;
2023         rtm->rtm_flags    = 0;
2024
2025         NLA_PUT_BE32(skb, RTA_SRC, c->mfc_origin);
2026         NLA_PUT_BE32(skb, RTA_DST, c->mfc_mcastgrp);
2027
2028         if (__ipmr_fill_mroute(mrt, skb, c, rtm) < 0)
2029                 goto nla_put_failure;
2030
2031         return nlmsg_end(skb, nlh);
2032
2033 nla_put_failure:
2034         nlmsg_cancel(skb, nlh);
2035         return -EMSGSIZE;
2036 }
2037
2038 static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
2039 {
2040         struct net *net = sock_net(skb->sk);
2041         struct mr_table *mrt;
2042         struct mfc_cache *mfc;
2043         unsigned int t = 0, s_t;
2044         unsigned int h = 0, s_h;
2045         unsigned int e = 0, s_e;
2046
2047         s_t = cb->args[0];
2048         s_h = cb->args[1];
2049         s_e = cb->args[2];
2050
2051         read_lock(&mrt_lock);
2052         ipmr_for_each_table(mrt, net) {
2053                 if (t < s_t)
2054                         goto next_table;
2055                 if (t > s_t)
2056                         s_h = 0;
2057                 for (h = s_h; h < MFC_LINES; h++) {
2058                         list_for_each_entry(mfc, &mrt->mfc_cache_array[h], list) {
2059                                 if (e < s_e)
2060                                         goto next_entry;
2061                                 if (ipmr_fill_mroute(mrt, skb,
2062                                                      NETLINK_CB(cb->skb).pid,
2063                                                      cb->nlh->nlmsg_seq,
2064                                                      mfc) < 0)
2065                                         goto done;
2066 next_entry:
2067                                 e++;
2068                         }
2069                         e = s_e = 0;
2070                 }
2071                 s_h = 0;
2072 next_table:
2073                 t++;
2074         }
2075 done:
2076         read_unlock(&mrt_lock);
2077
2078         cb->args[2] = e;
2079         cb->args[1] = h;
2080         cb->args[0] = t;
2081
2082         return skb->len;
2083 }
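/*
 * A minimal sketch (not part of the original file): the dump above is
 * requested with an ordinary NETLINK_ROUTE RTM_GETROUTE message whose
 * family byte selects IPMR. Uses <linux/rtnetlink.h>; "nl_fd" is a
 * placeholder for a connected NETLINK_ROUTE socket:
 *
 *      struct {
 *              struct nlmsghdr nlh;
 *              struct rtmsg    rtm;
 *      } req;
 *
 *      memset(&req, 0, sizeof(req));
 *      req.nlh.nlmsg_len   = NLMSG_LENGTH(sizeof(req.rtm));
 *      req.nlh.nlmsg_type  = RTM_GETROUTE;
 *      req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
 *      req.rtm.rtm_family  = RTNL_FAMILY_IPMR;
 *      send(nl_fd, &req, req.nlh.nlmsg_len, 0);
 */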
2084
2085 #ifdef CONFIG_PROC_FS
2086 /*
2087  *      The /proc interfaces to multicast routing: /proc/net/ip_mr_cache and /proc/net/ip_mr_vif
2088  */
2089 struct ipmr_vif_iter {
2090         struct seq_net_private p;
2091         struct mr_table *mrt;
2092         int ct;
2093 };
2094
2095 static struct vif_device *ipmr_vif_seq_idx(struct net *net,
2096                                            struct ipmr_vif_iter *iter,
2097                                            loff_t pos)
2098 {
2099         struct mr_table *mrt = iter->mrt;
2100
2101         for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) {
2102                 if (!VIF_EXISTS(mrt, iter->ct))
2103                         continue;
2104                 if (pos-- == 0)
2105                         return &mrt->vif_table[iter->ct];
2106         }
2107         return NULL;
2108 }
2109
2110 static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
2111         __acquires(mrt_lock)
2112 {
2113         struct ipmr_vif_iter *iter = seq->private;
2114         struct net *net = seq_file_net(seq);
2115         struct mr_table *mrt;
2116
2117         mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
2118         if (mrt == NULL)
2119                 return ERR_PTR(-ENOENT);
2120
2121         iter->mrt = mrt;
2122
2123         read_lock(&mrt_lock);
2124         return *pos ? ipmr_vif_seq_idx(net, seq->private, *pos - 1)
2125                 : SEQ_START_TOKEN;
2126 }
2127
2128 static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2129 {
2130         struct ipmr_vif_iter *iter = seq->private;
2131         struct net *net = seq_file_net(seq);
2132         struct mr_table *mrt = iter->mrt;
2133
2134         ++*pos;
2135         if (v == SEQ_START_TOKEN)
2136                 return ipmr_vif_seq_idx(net, iter, 0);
2137
2138         while (++iter->ct < mrt->maxvif) {
2139                 if (!VIF_EXISTS(mrt, iter->ct))
2140                         continue;
2141                 return &mrt->vif_table[iter->ct];
2142         }
2143         return NULL;
2144 }
2145
2146 static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
2147         __releases(mrt_lock)
2148 {
2149         read_unlock(&mrt_lock);
2150 }
2151
2152 static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
2153 {
2154         struct ipmr_vif_iter *iter = seq->private;
2155         struct mr_table *mrt = iter->mrt;
2156
2157         if (v == SEQ_START_TOKEN) {
2158                 seq_puts(seq,
2159                          "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
2160         } else {
2161                 const struct vif_device *vif = v;
2162                 const char *name =  vif->dev ? vif->dev->name : "none";
2163
2164                 seq_printf(seq,
2165                            "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
2166                            vif - mrt->vif_table,
2167                            name, vif->bytes_in, vif->pkt_in,
2168                            vif->bytes_out, vif->pkt_out,
2169                            vif->flags, vif->local, vif->remote);
2170         }
2171         return 0;
2172 }
2173
2174 static const struct seq_operations ipmr_vif_seq_ops = {
2175         .start = ipmr_vif_seq_start,
2176         .next  = ipmr_vif_seq_next,
2177         .stop  = ipmr_vif_seq_stop,
2178         .show  = ipmr_vif_seq_show,
2179 };
2180
2181 static int ipmr_vif_open(struct inode *inode, struct file *file)
2182 {
2183         return seq_open_net(inode, file, &ipmr_vif_seq_ops,
2184                             sizeof(struct ipmr_vif_iter));
2185 }
2186
2187 static const struct file_operations ipmr_vif_fops = {
2188         .owner   = THIS_MODULE,
2189         .open    = ipmr_vif_open,
2190         .read    = seq_read,
2191         .llseek  = seq_lseek,
2192         .release = seq_release_net,
2193 };
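/*
 * An illustrative sketch (not part of the original file): the table
 * registered here can be watched with a trivial reader:
 *
 *      FILE *f = fopen("/proc/net/ip_mr_vif", "r");
 *      char line[256];
 *
 *      while (f && fgets(line, sizeof(line), f))
 *              fputs(line, stdout);
 *      if (f)
 *              fclose(f);
 */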
2194
2195 struct ipmr_mfc_iter {
2196         struct seq_net_private p;
2197         struct mr_table *mrt;
2198         struct list_head *cache;
2199         int ct;
2200 };
2201
2202
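/* The mfc iterators below juggle two locks: resolved entries in
 * mfc_cache_array[] are walked under mrt_lock, the unresolved queue
 * under mfc_unres_lock. ipmr_mfc_seq_idx() and _next() hand over from
 * the first lock to the second when the hash table is exhausted, and
 * ipmr_mfc_seq_stop() releases whichever lock it->cache shows is still
 * held.
 */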
2203 static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
2204                                           struct ipmr_mfc_iter *it, loff_t pos)
2205 {
2206         struct mr_table *mrt = it->mrt;
2207         struct mfc_cache *mfc;
2208
2209         read_lock(&mrt_lock);
2210         for (it->ct = 0; it->ct < MFC_LINES; it->ct++) {
2211                 it->cache = &mrt->mfc_cache_array[it->ct];
2212                 list_for_each_entry(mfc, it->cache, list)
2213                         if (pos-- == 0)
2214                                 return mfc;
2215         }
2216         read_unlock(&mrt_lock);
2217
2218         spin_lock_bh(&mfc_unres_lock);
2219         it->cache = &mrt->mfc_unres_queue;
2220         list_for_each_entry(mfc, it->cache, list)
2221                 if (pos-- == 0)
2222                         return mfc;
2223         spin_unlock_bh(&mfc_unres_lock);
2224
2225         it->cache = NULL;
2226         return NULL;
2227 }
2228
2229
2230 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
2231 {
2232         struct ipmr_mfc_iter *it = seq->private;
2233         struct net *net = seq_file_net(seq);
2234         struct mr_table *mrt;
2235
2236         mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
2237         if (mrt == NULL)
2238                 return ERR_PTR(-ENOENT);
2239
2240         it->mrt = mrt;
2241         it->cache = NULL;
2242         it->ct = 0;
2243         return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1)
2244                 : SEQ_START_TOKEN;
2245 }
2246
2247 static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2248 {
2249         struct mfc_cache *mfc = v;
2250         struct ipmr_mfc_iter *it = seq->private;
2251         struct net *net = seq_file_net(seq);
2252         struct mr_table *mrt = it->mrt;
2253
2254         ++*pos;
2255
2256         if (v == SEQ_START_TOKEN)
2257                 return ipmr_mfc_seq_idx(net, seq->private, 0);
2258
2259         if (mfc->list.next != it->cache)
2260                 return list_entry(mfc->list.next, struct mfc_cache, list);
2261
2262         if (it->cache == &mrt->mfc_unres_queue)
2263                 goto end_of_list;
2264
2265         BUG_ON(it->cache != &mrt->mfc_cache_array[it->ct]);
2266
2267         while (++it->ct < MFC_LINES) {
2268                 it->cache = &mrt->mfc_cache_array[it->ct];
2269                 if (list_empty(it->cache))
2270                         continue;
2271                 return list_first_entry(it->cache, struct mfc_cache, list);
2272         }
2273
2274         /* exhausted cache_array, show unresolved */
2275         read_unlock(&mrt_lock);
2276         it->cache = &mrt->mfc_unres_queue;
2277         it->ct = 0;
2278
2279         spin_lock_bh(&mfc_unres_lock);
2280         if (!list_empty(it->cache))
2281                 return list_first_entry(it->cache, struct mfc_cache, list);
2282
2283  end_of_list:
2284         spin_unlock_bh(&mfc_unres_lock);
2285         it->cache = NULL;
2286
2287         return NULL;
2288 }
2289
2290 static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
2291 {
2292         struct ipmr_mfc_iter *it = seq->private;
2293         struct mr_table *mrt = it->mrt;
2294
2295         if (it->cache == &mrt->mfc_unres_queue)
2296                 spin_unlock_bh(&mfc_unres_lock);
2297         else if (it->cache == &mrt->mfc_cache_array[it->ct])
2298                 read_unlock(&mrt_lock);
2299 }
2300
2301 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
2302 {
2303         int n;
2304
2305         if (v == SEQ_START_TOKEN) {
2306                 seq_puts(seq,
2307                  "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
2308         } else {
2309                 const struct mfc_cache *mfc = v;
2310                 const struct ipmr_mfc_iter *it = seq->private;
2311                 const struct mr_table *mrt = it->mrt;
2312
2313                 seq_printf(seq, "%08X %08X %-3hd",
2314                            (__force u32) mfc->mfc_mcastgrp,
2315                            (__force u32) mfc->mfc_origin,
2316                            mfc->mfc_parent);
2317
2318                 if (it->cache != &mrt->mfc_unres_queue) {
2319                         seq_printf(seq, " %8lu %8lu %8lu",
2320                                    mfc->mfc_un.res.pkt,
2321                                    mfc->mfc_un.res.bytes,
2322                                    mfc->mfc_un.res.wrong_if);
2323                         for (n = mfc->mfc_un.res.minvif;
2324                              n < mfc->mfc_un.res.maxvif; n++) {
2325                                 if (VIF_EXISTS(mrt, n) &&
2326                                     mfc->mfc_un.res.ttls[n] < 255)
2327                                         seq_printf(seq,
2328                                            " %2d:%-3d",
2329                                            n, mfc->mfc_un.res.ttls[n]);
2330                         }
2331                 } else {
2332                         /* unresolved mfc_caches don't contain
2333                          * pkt, bytes and wrong_if values
2334                          */
2335                         seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul);
2336                 }
2337                 seq_putc(seq, '\n');
2338         }
2339         return 0;
2340 }
2341
2342 static const struct seq_operations ipmr_mfc_seq_ops = {
2343         .start = ipmr_mfc_seq_start,
2344         .next  = ipmr_mfc_seq_next,
2345         .stop  = ipmr_mfc_seq_stop,
2346         .show  = ipmr_mfc_seq_show,
2347 };
2348
2349 static int ipmr_mfc_open(struct inode *inode, struct file *file)
2350 {
2351         return seq_open_net(inode, file, &ipmr_mfc_seq_ops,
2352                             sizeof(struct ipmr_mfc_iter));
2353 }
2354
2355 static const struct file_operations ipmr_mfc_fops = {
2356         .owner   = THIS_MODULE,
2357         .open    = ipmr_mfc_open,
2358         .read    = seq_read,
2359         .llseek  = seq_lseek,
2360         .release = seq_release_net,
2361 };
2362 #endif
2363
2364 #ifdef CONFIG_IP_PIMSM_V2
2365 static const struct net_protocol pim_protocol = {
2366         .handler        =       pim_rcv,
2367         .netns_ok       =       1,
2368 };
2369 #endif
2370
2371
2372 /*
2373  *      Setup for IP multicast routing
2374  */
2375 static int __net_init ipmr_net_init(struct net *net)
2376 {
2377         int err;
2378
2379         err = ipmr_rules_init(net);
2380         if (err < 0)
2381                 goto fail;
2382
2383 #ifdef CONFIG_PROC_FS
2384         err = -ENOMEM;
2385         if (!proc_net_fops_create(net, "ip_mr_vif", 0, &ipmr_vif_fops))
2386                 goto proc_vif_fail;
2387         if (!proc_net_fops_create(net, "ip_mr_cache", 0, &ipmr_mfc_fops))
2388                 goto proc_cache_fail;
2389 #endif
2390         return 0;
2391
2392 #ifdef CONFIG_PROC_FS
2393 proc_cache_fail:
2394         proc_net_remove(net, "ip_mr_vif");
2395 proc_vif_fail:
2396         ipmr_rules_exit(net);
2397 #endif
2398 fail:
2399         return err;
2400 }
2401
2402 static void __net_exit ipmr_net_exit(struct net *net)
2403 {
2404 #ifdef CONFIG_PROC_FS
2405         proc_net_remove(net, "ip_mr_cache");
2406         proc_net_remove(net, "ip_mr_vif");
2407 #endif
2408         ipmr_rules_exit(net);
2409 }
2410
2411 static struct pernet_operations ipmr_net_ops = {
2412         .init = ipmr_net_init,
2413         .exit = ipmr_net_exit,
2414 };
2415
2416 int __init ip_mr_init(void)
2417 {
2418         int err;
2419
2420         mrt_cachep = kmem_cache_create("ip_mrt_cache",
2421                                        sizeof(struct mfc_cache),
2422                                        0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
2423                                        NULL);
2424         if (!mrt_cachep)
2425                 return -ENOMEM;
2426
2427         err = register_pernet_subsys(&ipmr_net_ops);
2428         if (err)
2429                 goto reg_pernet_fail;
2430
2431         err = register_netdevice_notifier(&ip_mr_notifier);
2432         if (err)
2433                 goto reg_notif_fail;
2434 #ifdef CONFIG_IP_PIMSM_V2
2435         if (inet_add_protocol(&pim_protocol, IPPROTO_PIM) < 0) {
2436                 printk(KERN_ERR "ip_mr_init: can't add PIM protocol\n");
2437                 err = -EAGAIN;
2438                 goto add_proto_fail;
2439         }
2440 #endif
2441         rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE, NULL, ipmr_rtm_dumproute);
2442         return 0;
2443
2444 #ifdef CONFIG_IP_PIMSM_V2
2445 add_proto_fail:
2446         unregister_netdevice_notifier(&ip_mr_notifier);
2447 #endif
2448 reg_notif_fail:
2449         unregister_pernet_subsys(&ipmr_net_ops);
2450 reg_pernet_fail:
2451         kmem_cache_destroy(mrt_cachep);
2452         return err;
2453 }