2 * Linux NET3: GRE over IP protocol decoder.
4 * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <asm/uaccess.h>
18 #include <linux/skbuff.h>
19 #include <linux/netdevice.h>
21 #include <linux/tcp.h>
22 #include <linux/udp.h>
23 #include <linux/if_arp.h>
24 #include <linux/mroute.h>
25 #include <linux/init.h>
26 #include <linux/in6.h>
27 #include <linux/inetdevice.h>
28 #include <linux/igmp.h>
29 #include <linux/netfilter_ipv4.h>
30 #include <linux/etherdevice.h>
31 #include <linux/if_ether.h>
36 #include <net/protocol.h>
39 #include <net/checksum.h>
40 #include <net/dsfield.h>
41 #include <net/inet_ecn.h>
43 #include <net/net_namespace.h>
44 #include <net/netns/generic.h>
45 #include <net/rtnetlink.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
57 1. The most important issue is detecting local dead loops.
58 They would cause complete host lockup in transmit, which
59 would be "resolved" by stack overflow or, if queueing is enabled,
60 with infinite looping in net_bh.
62 We cannot track such dead loops during route installation,
63 it is infeasible task. The most general solutions would be
64 to keep skb->encapsulation counter (sort of local ttl),
65 and silently drop packet when it expires. It is the best
66 solution, but it supposes maintaining a new variable in ALL
67 skb, even if no tunneling is used.
69 Current solution: t->recursion lock breaks dead loops. It looks
70 like dev->tbusy flag, but I preferred new variable, because
71 the semantics is different. One day, when hard_start_xmit
72 will be multithreaded we will have to use skb->encapsulation.
76 2. Networking dead loops would not kill routers, but would really
77 kill network. IP hop limit plays role of "t->recursion" in this case,
78 if we copy it from packet being encapsulated to upper header.
79 It is very good solution, but it introduces two problems:
81 - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
82 do not work over tunnels.
83 - traceroute does not work. I planned to relay ICMP from tunnel,
84 so that this problem would be solved and traceroute output
85 would be even more informative. This idea appeared to be wrong:
86 only Linux complies to rfc1812 now (yes, guys, Linux is the only
87 true router now :-)), all routers (at least, in neighbourhood of mine)
88 return only 8 bytes of payload. It is the end.
90 Hence, if we want that OSPF worked or traceroute said something reasonable,
91 we should search for another solution.
93 One of them is to parse packet trying to detect inner encapsulation
94 made by our node. It is difficult or even impossible, especially,
95 taking into account fragmentation. To be short, it is not a solution at all.
97 Current solution: The solution was UNEXPECTEDLY SIMPLE.
98 We force DF flag on tunnels with preconfigured hop limit,
99 that is ALL. :-) Well, it does not remove the problem completely,
100 but exponential growth of network traffic is changed to linear
101 (branches, that exceed pmtu are pruned) and tunnel mtu
102 quickly degrades to a value <68, where looping stops.
103 Yes, it is not good if there exists a router in the loop,
104 which does not force DF, even when encapsulating packets have DF set.
105 But it is not our problem! Nobody could accuse us, we made
106 all that we could make. Even if it is your gated who injected
107 fatal route to network, even if it were you who configured
108 fatal static route: you are innocent. :-)
112 3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
113 practically identical code. It would be good to glue them
114 together, but it is not very evident, how to make them modular.
115 sit is integral part of IPv6, ipip and gre are naturally modular.
116 We could extract common parts (hash table, ioctl etc)
117 to a separate module (ip_tunnel.c).
122 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
123 static int ipgre_tunnel_init(struct net_device *dev);
124 static void ipgre_tunnel_setup(struct net_device *dev);
125 static int ipgre_tunnel_bind_dev(struct net_device *dev);
127 /* Fallback tunnel: no source, no destination, no key, no options */
/*
 * Per-netns driver state.  NOTE(review): these members appear to belong
 * to a struct (presumably "struct ipgre_net") whose opening line is
 * missing from this excerpt — verify against the full source.
 */
131 static int ipgre_net_id;
133 struct ip_tunnel *tunnels[4][HASH_SIZE];
135 struct net_device *fb_tunnel_dev;
138 /* Tunnel hash table */
148 We require exact key match i.e. if a key is present in packet
149 it will match only tunnel with the same key; if it is not present,
150 it will match only keyless tunnel.
152 All keyless packets, if not matched configured keyless tunnels
153 will match fallback tunnel.
/* 4-bit hash: fold the two low nibbles of the address/key */
156 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
/* Table priority: 3 = remote+local, 2 = remote only, 1 = local, 0 = wildcard */
158 #define tunnels_r_l tunnels[3]
159 #define tunnels_r tunnels[2]
160 #define tunnels_l tunnels[1]
161 #define tunnels_wc tunnels[0]
/* Protects the hash chains above; readers hold it across RX/lookup */
163 static DEFINE_RWLOCK(ipgre_lock);
165 /* Given src, dst and key, find appropriate for input tunnel. */
/*
 * Find the receive tunnel for (remote, local, key, gre_proto), most
 * specific class first: remote+local, remote-only, local/multicast,
 * then wildcard.  Key must match exactly in every class; candidates
 * are ranked in sel[] and the fallback device is used as last resort.
 *
 * NOTE(review): interior lines (continue/goto, 'idx' computation,
 * closing braces) are missing from this excerpt — verify against the
 * full source before modifying.
 */
167 static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
168 __be32 remote, __be32 local,
169 __be32 key, __be16 gre_proto)
171 struct net *net = dev_net(dev);
172 int link = dev->ifindex;
173 unsigned h0 = HASH(remote);
174 unsigned h1 = HASH(key);
175 struct ip_tunnel *t, *sel[4] = { NULL, NULL, NULL, NULL };
176 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
/* ETH_P_TEB payload means an Ethernet (gretap) tunnel device */
177 int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
178 ARPHRD_ETHER : ARPHRD_IPGRE;
/* 1. tunnels with both endpoints configured */
181 for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) {
182 if (local != t->parms.iph.saddr ||
183 remote != t->parms.iph.daddr ||
184 key != t->parms.i_key ||
185 !(t->dev->flags & IFF_UP))
188 if (t->dev->type != ARPHRD_IPGRE &&
189 t->dev->type != dev_type)
193 if (t->parms.link != link)
195 if (t->dev->type != dev_type)
199 if (sel[idx] == NULL)
/* 2. remote-only tunnels */
203 for (t = ign->tunnels_r[h0^h1]; t; t = t->next) {
204 if (remote != t->parms.iph.daddr ||
205 key != t->parms.i_key ||
206 !(t->dev->flags & IFF_UP))
209 if (t->dev->type != ARPHRD_IPGRE &&
210 t->dev->type != dev_type)
214 if (t->parms.link != link)
216 if (t->dev->type != dev_type)
220 if (sel[idx] == NULL)
/* 3. local-address tunnels (multicast daddr also matches here) */
224 for (t = ign->tunnels_l[h1]; t; t = t->next) {
225 if ((local != t->parms.iph.saddr &&
226 (local != t->parms.iph.daddr ||
227 !ipv4_is_multicast(local))) ||
228 key != t->parms.i_key ||
229 !(t->dev->flags & IFF_UP))
232 if (t->dev->type != ARPHRD_IPGRE &&
233 t->dev->type != dev_type)
237 if (t->parms.link != link)
239 if (t->dev->type != dev_type)
243 if (sel[idx] == NULL)
/* 4. wildcard (keyed or keyless) tunnels */
247 for (t = ign->tunnels_wc[h1]; t; t = t->next) {
248 if (t->parms.i_key != key ||
249 !(t->dev->flags & IFF_UP))
252 if (t->dev->type != ARPHRD_IPGRE &&
253 t->dev->type != dev_type)
257 if (t->parms.link != link)
259 if (t->dev->type != dev_type)
263 if (sel[idx] == NULL)
/* Pick the best-ranked candidate, then fall back to the gre0 device */
267 for (idx = 1; idx < ARRAY_SIZE(sel); idx++)
268 if (sel[idx] != NULL)
271 if (ign->fb_tunnel_dev->flags & IFF_UP)
272 return netdev_priv(ign->fb_tunnel_dev);
/*
 * Return the hash-chain head for the given parameters, keyed on the
 * output key and bucketed by specificity (see tunnels_* macros).
 * NOTE(review): the lines computing 'prio' are missing from this
 * excerpt — verify against the full source.
 */
277 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
278 struct ip_tunnel_parm *parms)
280 __be32 remote = parms->iph.daddr;
281 __be32 local = parms->iph.saddr;
282 __be32 key = parms->i_key;
283 unsigned h = HASH(key);
288 if (remote && !ipv4_is_multicast(remote)) {
293 return &ign->tunnels[prio][h];
/* Convenience wrapper: hash bucket for an existing tunnel's parms. */
296 static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
299 return __ipgre_bucket(ign, &t->parms);
/* Insert tunnel at the head of its hash chain, under the write lock. */
302 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
304 struct ip_tunnel **tp = ipgre_bucket(ign, t);
307 write_lock_bh(&ipgre_lock);
309 write_unlock_bh(&ipgre_lock);
/* Walk the tunnel's hash chain and unsplice it, under the write lock. */
312 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
314 struct ip_tunnel **tp;
316 for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
318 write_lock_bh(&ipgre_lock);
320 write_unlock_bh(&ipgre_lock);
/*
 * Exact-match configuration lookup (saddr, daddr, key, link, device
 * type).  Used to detect an already-existing tunnel before creating
 * a new device.
 */
326 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
327 struct ip_tunnel_parm *parms,
330 __be32 remote = parms->iph.daddr;
331 __be32 local = parms->iph.saddr;
332 __be32 key = parms->i_key;
333 int link = parms->link;
334 struct ip_tunnel *t, **tp;
335 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
337 for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
338 if (local == t->parms.iph.saddr &&
339 remote == t->parms.iph.daddr &&
340 key == t->parms.i_key &&
341 link == t->parms.link &&
342 type == t->dev->type)
/*
 * Find a tunnel matching parms; if none and 'create' is set, allocate
 * a new net_device (named "gre%d" when no name is supplied), bind its
 * MTU, register it and link it into the hash table.
 * NOTE(review): error-unwind paths are missing from this excerpt.
 */
348 static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
349 struct ip_tunnel_parm *parms, int create)
351 struct ip_tunnel *t, *nt;
352 struct net_device *dev;
354 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
356 t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
361 strlcpy(name, parms->name, IFNAMSIZ);
363 sprintf(name, "gre%%d");
365 dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
369 dev_net_set(dev, net);
/* Resolve the "%d" placeholder to a free unit number */
371 if (strchr(name, '%')) {
372 if (dev_alloc_name(dev, name) < 0)
376 nt = netdev_priv(dev);
378 dev->rtnl_link_ops = &ipgre_link_ops;
380 dev->mtu = ipgre_tunnel_bind_dev(dev);
382 if (register_netdevice(dev) < 0)
386 ipgre_tunnel_link(ign, nt);
/* ndo_uninit: drop the tunnel from the per-netns hash on unregister. */
394 static void ipgre_tunnel_uninit(struct net_device *dev)
396 struct net *net = dev_net(dev);
397 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
399 ipgre_tunnel_unlink(ign, netdev_priv(dev));
/*
 * ICMP error handler for GRE: rate-limited relay of unreachable /
 * TTL-exceeded errors to the matching tunnel.
 * NOTE(review): several interior lines (flag extraction, switch
 * default/break statements, out-label) are missing from this excerpt.
 */
404 static void ipgre_err(struct sk_buff *skb, u32 info)
407 /* All the routers (except for Linux) return only
408 8 bytes of packet payload. It means, that precise relaying of
409 ICMP in the real Internet is absolutely infeasible.
411 Moreover, Cisco "wise men" put GRE key to the third word
412 in GRE header. It makes impossible maintaining even soft state for keyed
413 GRE tunnels with enabled checksum. Tell them "thank you".
415 Well, I wonder, rfc1812 was written by Cisco employee,
416 what the hell these idiots break standards established
420 struct iphdr *iph = (struct iphdr *)skb->data;
421 __be16 *p = (__be16*)(skb->data+(iph->ihl<<2));
422 int grehlen = (iph->ihl<<2) + 4;
423 const int type = icmp_hdr(skb)->type;
424 const int code = icmp_hdr(skb)->code;
/* Only version 0, non-routed GRE can be handled */
429 if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
430 if (flags&(GRE_VERSION|GRE_ROUTING))
439 /* If only 8 bytes returned, keyed message will be dropped here */
440 if (skb_headlen(skb) < grehlen)
445 case ICMP_PARAMETERPROB:
448 case ICMP_DEST_UNREACH:
451 case ICMP_PORT_UNREACH:
452 /* Impossible event. */
454 case ICMP_FRAG_NEEDED:
455 /* Soft state for pmtu is maintained by IP core. */
458 /* All others are translated to HOST_UNREACH.
459 rfc2003 contains "deep thoughts" about NET_UNREACH,
460 I believe they are just ether pollution. --ANK
465 case ICMP_TIME_EXCEEDED:
466 if (code != ICMP_EXC_TTL)
471 read_lock(&ipgre_lock);
472 t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
474 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
476 if (t == NULL || t->parms.iph.daddr == 0 ||
477 ipv4_is_multicast(t->parms.iph.daddr))
480 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
/* Rate-limit error state updates per tunnel */
483 if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
487 t->err_time = jiffies;
489 read_unlock(&ipgre_lock);
/* Propagate a CE mark from the outer IP header to the inner packet. */
493 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
495 if (INET_ECN_is_ce(iph->tos)) {
496 if (skb->protocol == htons(ETH_P_IP)) {
497 IP_ECN_set_ce(ip_hdr(skb));
498 } else if (skb->protocol == htons(ETH_P_IPV6)) {
499 IP6_ECN_set_ce(ipv6_hdr(skb));
/*
 * Compute the outer TOS from the configured 'tos' and the inner
 * IPv4 TOS / IPv6 dsfield, via INET_ECN_encapsulate().
 */
505 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
508 if (skb->protocol == htons(ETH_P_IP))
509 inner = old_iph->tos;
510 else if (skb->protocol == htons(ETH_P_IPV6))
511 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
512 return INET_ECN_encapsulate(tos, inner);
/*
 * GRE receive path: validate flags/version, verify checksum and
 * sequence number if present, look up the tunnel under ipgre_lock,
 * strip the outer headers and hand the inner packet up (with
 * eth_type_trans() for gretap devices).  Unmatched packets get an
 * ICMP port-unreachable, per RFC 1702 practice.
 *
 * NOTE(review): many interior lines (drops, stats updates, netif_rx,
 * labels) are missing from this excerpt — verify against full source.
 */
515 static int ipgre_rcv(struct sk_buff *skb)
523 struct ip_tunnel *tunnel;
528 if (!pskb_may_pull(skb, 16))
535 if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
536 /* - Version must be 0.
537 - We do not support routing headers.
539 if (flags&(GRE_VERSION|GRE_ROUTING))
542 if (flags&GRE_CSUM) {
543 switch (skb->ip_summed) {
544 case CHECKSUM_COMPLETE:
545 csum = csum_fold(skb->csum);
551 csum = __skb_checksum_complete(skb);
552 skb->ip_summed = CHECKSUM_COMPLETE;
557 key = *(__be32*)(h + offset);
561 seqno = ntohl(*(__be32*)(h + offset));
566 gre_proto = *(__be16 *)(h + 2);
568 read_lock(&ipgre_lock);
569 if ((tunnel = ipgre_tunnel_lookup(skb->dev,
570 iph->saddr, iph->daddr, key,
572 struct net_device_stats *stats = &tunnel->dev->stats;
576 skb->protocol = gre_proto;
577 /* WCCP version 1 and 2 protocol decoding.
578 * - Change protocol to IP
579 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
581 if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
582 skb->protocol = htons(ETH_P_IP);
583 if ((*(h + offset) & 0xF0) != 0x40)
587 skb->mac_header = skb->network_header;
588 __pskb_pull(skb, offset);
589 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
590 skb->pkt_type = PACKET_HOST;
591 #ifdef CONFIG_NET_IPGRE_BROADCAST
592 if (ipv4_is_multicast(iph->daddr)) {
593 /* Looped back packet, drop it! */
594 if (skb->rtable->fl.iif == 0)
597 skb->pkt_type = PACKET_BROADCAST;
/* Checksum mismatch, or checksum required but absent */
601 if (((flags&GRE_CSUM) && csum) ||
602 (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
603 stats->rx_crc_errors++;
/* Strictly-increasing sequence numbers when GRE_SEQ negotiated */
607 if (tunnel->parms.i_flags&GRE_SEQ) {
608 if (!(flags&GRE_SEQ) ||
609 (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
610 stats->rx_fifo_errors++;
614 tunnel->i_seqno = seqno + 1;
619 /* Warning: All skb pointers will be invalidated! */
620 if (tunnel->dev->type == ARPHRD_ETHER) {
621 if (!pskb_may_pull(skb, ETH_HLEN)) {
622 stats->rx_length_errors++;
628 skb->protocol = eth_type_trans(skb, tunnel->dev);
629 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
633 stats->rx_bytes += len;
634 skb->dev = tunnel->dev;
635 dst_release(skb->dst);
639 skb_reset_network_header(skb);
640 ipgre_ecn_decapsulate(iph, skb);
643 read_unlock(&ipgre_lock);
646 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
649 read_unlock(&ipgre_lock);
/*
 * GRE transmit path: resolve the destination (NBMA mode reads it from
 * the route/neighbour), route the outer packet, enforce PMTU, ensure
 * headroom, then push and fill the outer IP + GRE header (key, seq,
 * checksum as configured).  tunnel->recursion breaks local dead loops
 * (see the file-header discussion).
 *
 * NOTE(review): numerous interior lines (error labels, ip_select_ident,
 * IPTUNNEL_XMIT, recursion decrement) are missing from this excerpt —
 * verify against the full source before modifying.
 */
655 static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
657 struct ip_tunnel *tunnel = netdev_priv(dev);
658 struct net_device_stats *stats = &tunnel->dev->stats;
659 struct iphdr *old_iph = ip_hdr(skb);
663 struct rtable *rt; /* Route to the other host */
664 struct net_device *tdev; /* Device to other host */
665 struct iphdr *iph; /* Our new IP header */
666 unsigned int max_headroom; /* The extra header space needed */
/* Re-entered from our own output path: drop to break the loop */
671 if (tunnel->recursion++) {
676 if (dev->type == ARPHRD_ETHER)
677 IPCB(skb)->flags = 0;
679 if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
681 tiph = (struct iphdr *)skb->data;
683 gre_hlen = tunnel->hlen;
684 tiph = &tunnel->parms.iph;
/* NBMA tunnel: no configured daddr, take it from the inner route */
687 if ((dst = tiph->daddr) == 0) {
690 if (skb->dst == NULL) {
691 stats->tx_fifo_errors++;
695 if (skb->protocol == htons(ETH_P_IP)) {
697 if ((dst = rt->rt_gateway) == 0)
701 else if (skb->protocol == htons(ETH_P_IPV6)) {
702 struct in6_addr *addr6;
704 struct neighbour *neigh = skb->dst->neighbour;
709 addr6 = (struct in6_addr *)&neigh->primary_key;
710 addr_type = ipv6_addr_type(addr6);
712 if (addr_type == IPV6_ADDR_ANY) {
713 addr6 = &ipv6_hdr(skb)->daddr;
714 addr_type = ipv6_addr_type(addr6);
/* Only v4-compatible IPv6 destinations can supply an IPv4 dst */
717 if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
720 dst = addr6->s6_addr32[3];
729 if (skb->protocol == htons(ETH_P_IP))
735 struct flowi fl = { .oif = tunnel->parms.link,
738 .saddr = tiph->saddr,
739 .tos = RT_TOS(tos) } },
740 .proto = IPPROTO_GRE };
741 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
742 stats->tx_carrier_errors++;
746 tdev = rt->u.dst.dev;
/* Path-MTU handling for the inner packet */
756 mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
758 mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;
761 skb->dst->ops->update_pmtu(skb->dst, mtu);
763 if (skb->protocol == htons(ETH_P_IP)) {
764 df |= (old_iph->frag_off&htons(IP_DF));
766 if ((old_iph->frag_off&htons(IP_DF)) &&
767 mtu < ntohs(old_iph->tot_len)) {
768 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
774 else if (skb->protocol == htons(ETH_P_IPV6)) {
775 struct rt6_info *rt6 = (struct rt6_info *)skb->dst;
777 if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) {
778 if ((tunnel->parms.iph.daddr &&
779 !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
780 rt6->rt6i_dst.plen == 128) {
781 rt6->rt6i_flags |= RTF_MODIFIED;
782 skb->dst->metrics[RTAX_MTU-1] = mtu;
786 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
787 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
/* Replay recent link failure to the sender, rate-limited */
794 if (tunnel->err_count > 0) {
795 if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
798 dst_link_failure(skb);
800 tunnel->err_count = 0;
803 max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
805 if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
806 (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
807 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
816 skb_set_owner_w(new_skb, skb->sk);
819 old_iph = ip_hdr(skb);
822 skb_reset_transport_header(skb);
823 skb_push(skb, gre_hlen);
824 skb_reset_network_header(skb);
825 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
826 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
828 dst_release(skb->dst);
829 skb->dst = &rt->u.dst;
832 * Push down and install the IPIP header.
837 iph->ihl = sizeof(struct iphdr) >> 2;
839 iph->protocol = IPPROTO_GRE;
840 iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb);
841 iph->daddr = rt->rt_dst;
842 iph->saddr = rt->rt_src;
/* TTL: configured value, else copy from inner header / route metric */
844 if ((iph->ttl = tiph->ttl) == 0) {
845 if (skb->protocol == htons(ETH_P_IP))
846 iph->ttl = old_iph->ttl;
848 else if (skb->protocol == htons(ETH_P_IPV6))
849 iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
852 iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
855 ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
856 ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
857 htons(ETH_P_TEB) : skb->protocol;
/* Optional GRE fields, laid out back-to-front from end of header */
859 if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
860 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
862 if (tunnel->parms.o_flags&GRE_SEQ) {
864 *ptr = htonl(tunnel->o_seqno);
867 if (tunnel->parms.o_flags&GRE_KEY) {
868 *ptr = tunnel->parms.o_key;
871 if (tunnel->parms.o_flags&GRE_CSUM) {
873 *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
884 dst_link_failure(skb);
/*
 * Derive MTU and needed_headroom from the route to the tunnel
 * endpoint (or the bound link device), and precompute the GRE header
 * length (addend) from the configured output flags.  Returns the MTU.
 * NOTE(review): some interior lines (ip_rt_put, addend increments,
 * return) are missing from this excerpt.
 */
893 static int ipgre_tunnel_bind_dev(struct net_device *dev)
895 struct net_device *tdev = NULL;
896 struct ip_tunnel *tunnel;
898 int hlen = LL_MAX_HEADER;
899 int mtu = ETH_DATA_LEN;
900 int addend = sizeof(struct iphdr) + 4;
902 tunnel = netdev_priv(dev);
903 iph = &tunnel->parms.iph;
905 /* Guess output device to choose reasonable mtu and needed_headroom */
908 struct flowi fl = { .oif = tunnel->parms.link,
910 { .daddr = iph->daddr,
912 .tos = RT_TOS(iph->tos) } },
913 .proto = IPPROTO_GRE };
915 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
916 tdev = rt->u.dst.dev;
920 if (dev->type != ARPHRD_ETHER)
921 dev->flags |= IFF_POINTOPOINT;
924 if (!tdev && tunnel->parms.link)
925 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link)
928 hlen = tdev->hard_header_len + tdev->needed_headroom;
931 dev->iflink = tunnel->parms.link;
933 /* Precalculate GRE options length */
934 if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
935 if (tunnel->parms.o_flags&GRE_CSUM)
937 if (tunnel->parms.o_flags&GRE_KEY)
939 if (tunnel->parms.o_flags&GRE_SEQ)
942 dev->needed_headroom = addend + hlen;
943 mtu -= dev->hard_header_len - addend;
948 tunnel->hlen = addend;
/*
 * SIOCGETTUNNEL / SIOCADDTUNNEL / SIOCCHGTUNNEL / SIOCDELTUNNEL
 * handler.  ADD/CHG/DEL require CAP_NET_ADMIN; parameters are copied
 * from/to userspace via ifr->ifr_ifru.ifru_data.
 * NOTE(review): the switch statement, error labels and several breaks
 * are missing from this excerpt — verify against the full source.
 */
954 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
957 struct ip_tunnel_parm p;
959 struct net *net = dev_net(dev);
960 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
/* GET: on the fallback device, look up by user-supplied parms */
965 if (dev == ign->fb_tunnel_dev) {
966 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
970 t = ipgre_tunnel_locate(net, &p, 0);
973 t = netdev_priv(dev);
974 memcpy(&p, &t->parms, sizeof(p));
975 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
982 if (!capable(CAP_NET_ADMIN))
986 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
/* Reject non-IPv4/non-GRE parms and routed/versioned GRE flags */
990 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
991 p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
992 ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
995 p.iph.frag_off |= htons(IP_DF);
997 if (!(p.i_flags&GRE_KEY))
999 if (!(p.o_flags&GRE_KEY))
1002 t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
1004 if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
1006 if (t->dev != dev) {
1011 unsigned nflags = 0;
1013 t = netdev_priv(dev);
1015 if (ipv4_is_multicast(p.iph.daddr))
1016 nflags = IFF_BROADCAST;
1017 else if (p.iph.daddr)
1018 nflags = IFF_POINTOPOINT;
/* Endpoint change would flip bcast/p2p mode: not allowed here */
1020 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
/* Re-hash under new endpoints/keys */
1024 ipgre_tunnel_unlink(ign, t);
1025 t->parms.iph.saddr = p.iph.saddr;
1026 t->parms.iph.daddr = p.iph.daddr;
1027 t->parms.i_key = p.i_key;
1028 t->parms.o_key = p.o_key;
1029 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1030 memcpy(dev->broadcast, &p.iph.daddr, 4);
1031 ipgre_tunnel_link(ign, t);
1032 netdev_state_change(dev);
1038 if (cmd == SIOCCHGTUNNEL) {
1039 t->parms.iph.ttl = p.iph.ttl;
1040 t->parms.iph.tos = p.iph.tos;
1041 t->parms.iph.frag_off = p.iph.frag_off;
1042 if (t->parms.link != p.link) {
1043 t->parms.link = p.link;
1044 dev->mtu = ipgre_tunnel_bind_dev(dev);
1045 netdev_state_change(dev);
1048 if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1051 err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
/* DEL: CAP_NET_ADMIN; the fallback device itself cannot be deleted */
1056 if (!capable(CAP_NET_ADMIN))
1059 if (dev == ign->fb_tunnel_dev) {
1061 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1064 if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
1067 if (t == netdev_priv(ign->fb_tunnel_dev))
1071 unregister_netdevice(dev);
/*
 * ndo_change_mtu: bound new_mtu so the encapsulated frame still fits
 * a 16-bit IP total length (0xFFF8 minus link and GRE overhead).
 * NOTE(review): the lower-bound check and assignment lines are missing
 * from this excerpt.
 */
1083 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1085 struct ip_tunnel *tunnel = netdev_priv(dev);
1087 new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1093 /* Nice toy. Unfortunately, useless in real life :-)
1094 It allows to construct virtual multiprotocol broadcast "LAN"
1095 over the Internet, provided multicast routing is tuned.
1098 I have no idea was this bicycle invented before me,
1099 so that I had to set ARPHRD_IPGRE to a random value.
1100 I have an impression, that Cisco could make something similar,
1101 but this feature is apparently missing in IOS<=11.2(8).
1103 I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1104 with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1106 ping -t 255 224.66.66.66
1108 If nobody answers, mbone does not work.
1110 ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1111 ip addr add 10.66.66.<somewhat>/24 dev Universe
1112 ifconfig Universe up
1113 ifconfig Universe add fe80::<Your_real_addr>/10
1114 ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1117 ftp fec0:6666:6666::193.233.7.65
/*
 * header_ops->create for ARPHRD_IPGRE devices: prepend the cached
 * outer IP header plus GRE flags/protocol, filling saddr/daddr from
 * the caller-supplied hardware addresses (each 4 bytes, an IPv4
 * address).
 */
1122 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1123 unsigned short type,
1124 const void *daddr, const void *saddr, unsigned len)
1126 struct ip_tunnel *t = netdev_priv(dev);
1127 struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1128 __be16 *p = (__be16*)(iph+1);
1130 memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1131 p[0] = t->parms.o_flags;
1135 * Set the source hardware address.
1139 memcpy(&iph->saddr, saddr, 4);
1142 memcpy(&iph->daddr, daddr, 4);
1145 if (iph->daddr && !ipv4_is_multicast(iph->daddr))
/* header_ops->parse: report the outer source IPv4 address (4 bytes). */
1151 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1153 struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
1154 memcpy(haddr, &iph->saddr, 4);
/* Hard-header ops for layer-3 GRE devices (create/parse above). */
1158 static const struct header_ops ipgre_header_ops = {
1159 .create = ipgre_header,
1160 .parse = ipgre_header_parse,
1163 #ifdef CONFIG_NET_IPGRE_BROADCAST
/*
 * ndo_open: for multicast tunnels, resolve the output device via the
 * routing table and join the multicast group on it; remember its
 * ifindex in t->mlink so ipgre_close() can leave the group.
 */
1164 static int ipgre_open(struct net_device *dev)
1166 struct ip_tunnel *t = netdev_priv(dev);
1168 if (ipv4_is_multicast(t->parms.iph.daddr)) {
1169 struct flowi fl = { .oif = t->parms.link,
1171 { .daddr = t->parms.iph.daddr,
1172 .saddr = t->parms.iph.saddr,
1173 .tos = RT_TOS(t->parms.iph.tos) } },
1174 .proto = IPPROTO_GRE };
1176 if (ip_route_output_key(dev_net(dev), &rt, &fl))
1177 return -EADDRNOTAVAIL;
1178 dev = rt->u.dst.dev;
1180 if (__in_dev_get_rtnl(dev) == NULL)
1181 return -EADDRNOTAVAIL;
1182 t->mlink = dev->ifindex;
1183 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
/* ndo_stop: leave the multicast group joined in ipgre_open(). */
1188 static int ipgre_close(struct net_device *dev)
1190 struct ip_tunnel *t = netdev_priv(dev);
1192 if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1193 struct in_device *in_dev;
1194 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1196 ip_mc_dec_group(in_dev, t->parms.iph.daddr);
/* netdev ops for layer-3 "gre" devices (gretap has its own table). */
1205 static const struct net_device_ops ipgre_netdev_ops = {
1206 .ndo_init = ipgre_tunnel_init,
1207 .ndo_uninit = ipgre_tunnel_uninit,
1208 #ifdef CONFIG_NET_IPGRE_BROADCAST
1209 .ndo_open = ipgre_open,
1210 .ndo_stop = ipgre_close,
1212 .ndo_start_xmit = ipgre_tunnel_xmit,
1213 .ndo_do_ioctl = ipgre_tunnel_ioctl,
1214 .ndo_change_mtu = ipgre_tunnel_change_mtu,
/*
 * Setup callback for layer-3 GRE devices: default MTU/headroom assume
 * a 4-byte base GRE header over IPv4; NOARP, netns-local.
 */
1217 static void ipgre_tunnel_setup(struct net_device *dev)
1219 dev->netdev_ops = &ipgre_netdev_ops;
1220 dev->destructor = free_netdev;
1222 dev->type = ARPHRD_IPGRE;
1223 dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1224 dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1225 dev->flags = IFF_NOARP;
1228 dev->features |= NETIF_F_NETNS_LOCAL;
/*
 * ndo_init: seed dev_addr/broadcast from the tunnel endpoints and
 * install header_ops (broadcast mode for multicast daddr when the
 * broadcast option is compiled in).
 */
1231 static int ipgre_tunnel_init(struct net_device *dev)
1233 struct ip_tunnel *tunnel;
1236 tunnel = netdev_priv(dev);
1237 iph = &tunnel->parms.iph;
1240 strcpy(tunnel->parms.name, dev->name);
1242 memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1243 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1246 #ifdef CONFIG_NET_IPGRE_BROADCAST
1247 if (ipv4_is_multicast(iph->daddr)) {
1250 dev->flags = IFF_BROADCAST;
1251 dev->header_ops = &ipgre_header_ops;
1255 dev->header_ops = &ipgre_header_ops;
/*
 * Initialize the per-netns fallback "gre0" device: keyless, wildcard
 * endpoints, linked into the wildcard hash chain.
 */
1260 static void ipgre_fb_tunnel_init(struct net_device *dev)
1262 struct ip_tunnel *tunnel = netdev_priv(dev);
1263 struct iphdr *iph = &tunnel->parms.iph;
1264 struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1267 strcpy(tunnel->parms.name, dev->name);
1270 iph->protocol = IPPROTO_GRE;
1272 tunnel->hlen = sizeof(struct iphdr) + 4;
1275 ign->tunnels_wc[0] = tunnel;
/* IPPROTO_GRE handler registered with the inet protocol table. */
1279 static struct net_protocol ipgre_protocol = {
1280 .handler = ipgre_rcv,
1281 .err_handler = ipgre_err,
/* Unregister every tunnel device in all four priority hash tables. */
1285 static void ipgre_destroy_tunnels(struct ipgre_net *ign)
1289 for (prio = 0; prio < 4; prio++) {
1291 for (h = 0; h < HASH_SIZE; h++) {
1292 struct ip_tunnel *t;
1293 while ((t = ign->tunnels[prio][h]) != NULL)
1294 unregister_netdevice(t->dev);
/*
 * Per-netns init: allocate ipgre_net state, create and register the
 * fallback "gre0" device.  NOTE(review): error labels and the
 * intermediate unwind lines are missing from this excerpt.
 */
1299 static int ipgre_init_net(struct net *net)
1302 struct ipgre_net *ign;
1305 ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL);
1309 err = net_assign_generic(net, ipgre_net_id, ign);
1313 ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1314 ipgre_tunnel_setup);
1315 if (!ign->fb_tunnel_dev) {
1319 dev_net_set(ign->fb_tunnel_dev, net);
1321 ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1322 ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1324 if ((err = register_netdev(ign->fb_tunnel_dev)))
1330 free_netdev(ign->fb_tunnel_dev);
/* Per-netns teardown: destroy all tunnels (frees devices via destructor). */
1339 static void ipgre_exit_net(struct net *net)
1341 struct ipgre_net *ign;
1343 ign = net_generic(net, ipgre_net_id);
1345 ipgre_destroy_tunnels(ign);
/* pernet hooks wired up in ipgre_init()/ipgre_fini(). */
1350 static struct pernet_operations ipgre_net_ops = {
1351 .init = ipgre_init_net,
1352 .exit = ipgre_exit_net,
/*
 * rtnl_link validate for "gre": reject GRE version/routing flags in
 * either direction (this driver only speaks version-0, unrouted GRE).
 */
1355 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1363 if (data[IFLA_GRE_IFLAGS])
1364 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1365 if (data[IFLA_GRE_OFLAGS])
1366 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1367 if (flags & (GRE_VERSION|GRE_ROUTING))
/*
 * rtnl_link validate for "gretap": check the Ethernet MAC (if given)
 * and the remote endpoint, then defer to the common GRE validation.
 */
1373 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1377 if (tb[IFLA_ADDRESS]) {
1378 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1380 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1381 return -EADDRNOTAVAIL;
1387 if (data[IFLA_GRE_REMOTE]) {
1388 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1394 return ipgre_tunnel_validate(tb, data);
/*
 * Translate IFLA_GRE_* netlink attributes into ip_tunnel_parm.
 * Unset attributes leave their zeroed defaults; PMTU discovery (DF)
 * is ON unless IFLA_GRE_PMTUDISC is explicitly 0.
 */
1397 static void ipgre_netlink_parms(struct nlattr *data[],
1398 struct ip_tunnel_parm *parms)
1400 memset(parms, 0, sizeof(*parms));
1402 parms->iph.protocol = IPPROTO_GRE;
1407 if (data[IFLA_GRE_LINK])
1408 parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1410 if (data[IFLA_GRE_IFLAGS])
1411 parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1413 if (data[IFLA_GRE_OFLAGS])
1414 parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1416 if (data[IFLA_GRE_IKEY])
1417 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1419 if (data[IFLA_GRE_OKEY])
1420 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1422 if (data[IFLA_GRE_LOCAL])
1423 parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1425 if (data[IFLA_GRE_REMOTE])
1426 parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1428 if (data[IFLA_GRE_TTL])
1429 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1431 if (data[IFLA_GRE_TOS])
1432 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1434 if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1435 parms->iph.frag_off = htons(IP_DF);
/* ndo_init for gretap devices: record the name and bind MTU/headroom. */
1438 static int ipgre_tap_init(struct net_device *dev)
1440 struct ip_tunnel *tunnel;
1442 tunnel = netdev_priv(dev);
1445 strcpy(tunnel->parms.name, dev->name);
1447 ipgre_tunnel_bind_dev(dev);
/* netdev ops for "gretap" (Ethernet-over-GRE) devices. */
1452 static const struct net_device_ops ipgre_tap_netdev_ops = {
1453 .ndo_init = ipgre_tap_init,
1454 .ndo_uninit = ipgre_tunnel_uninit,
1455 .ndo_start_xmit = ipgre_tunnel_xmit,
1456 .ndo_set_mac_address = eth_mac_addr,
1457 .ndo_validate_addr = eth_validate_addr,
1458 .ndo_change_mtu = ipgre_tunnel_change_mtu,
1461 static void ipgre_tap_setup(struct net_device *dev)
1466 dev->netdev_ops = &ipgre_netdev_ops;
1467 dev->destructor = free_netdev;
1470 dev->features |= NETIF_F_NETNS_LOCAL;
/*
 * rtnl_link newlink: parse parms, refuse duplicates, give tap devices
 * a random MAC if none supplied, bind MTU, register and hash-link.
 * NOTE(review): return/error lines are missing from this excerpt.
 */
1473 static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[],
1474 struct nlattr *data[])
1476 struct ip_tunnel *nt;
1477 struct net *net = dev_net(dev);
1478 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1482 nt = netdev_priv(dev);
1483 ipgre_netlink_parms(data, &nt->parms);
1485 if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1488 if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1489 random_ether_addr(dev->dev_addr);
1491 mtu = ipgre_tunnel_bind_dev(dev);
1495 err = register_netdevice(dev);
1500 ipgre_tunnel_link(ign, nt);
/*
 * rtnl_link changelink: re-parse parms and apply them to this device,
 * re-hashing when endpoints/keys change; the fallback device cannot be
 * reconfigured.  NOTE(review): several interior lines (duplicate-name
 * check, else-branch braces) are missing from this excerpt.
 */
1506 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1507 struct nlattr *data[])
1509 struct ip_tunnel *t, *nt;
1510 struct net *net = dev_net(dev);
1511 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1512 struct ip_tunnel_parm p;
1515 if (dev == ign->fb_tunnel_dev)
1518 nt = netdev_priv(dev);
1519 ipgre_netlink_parms(data, &p);
1521 t = ipgre_tunnel_locate(net, &p, 0);
1527 unsigned nflags = 0;
1531 if (ipv4_is_multicast(p.iph.daddr))
1532 nflags = IFF_BROADCAST;
1533 else if (p.iph.daddr)
1534 nflags = IFF_POINTOPOINT;
/* Refuse endpoint changes that flip broadcast/point-to-point mode */
1536 if ((dev->flags ^ nflags) &
1537 (IFF_POINTOPOINT | IFF_BROADCAST))
1540 ipgre_tunnel_unlink(ign, t);
1541 t->parms.iph.saddr = p.iph.saddr;
1542 t->parms.iph.daddr = p.iph.daddr;
1543 t->parms.i_key = p.i_key;
1544 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1545 memcpy(dev->broadcast, &p.iph.daddr, 4);
1546 ipgre_tunnel_link(ign, t);
1547 netdev_state_change(dev);
1550 t->parms.o_key = p.o_key;
1551 t->parms.iph.ttl = p.iph.ttl;
1552 t->parms.iph.tos = p.iph.tos;
1553 t->parms.iph.frag_off = p.iph.frag_off;
1555 if (t->parms.link != p.link) {
1556 t->parms.link = p.link;
1557 mtu = ipgre_tunnel_bind_dev(dev);
1560 netdev_state_change(dev);
/*
 * Size of the netlink fill payload: one nla per IFLA_GRE_* attribute.
 * NOTE(review): the nla_total_size() terms themselves are missing from
 * this excerpt — only the attribute-name comments remain.
 */
1566 static size_t ipgre_get_size(const struct net_device *dev)
1571 /* IFLA_GRE_IFLAGS */
1573 /* IFLA_GRE_OFLAGS */
1579 /* IFLA_GRE_LOCAL */
1581 /* IFLA_GRE_REMOTE */
1587 /* IFLA_GRE_PMTUDISC */
/*
 * Dump tunnel parameters as IFLA_GRE_* attributes.  The NLA_PUT_*
 * macros jump to an nla_put_failure label (not visible in this
 * excerpt) on overflow.
 */
1592 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1594 struct ip_tunnel *t = netdev_priv(dev);
1595 struct ip_tunnel_parm *p = &t->parms;
1597 NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
1598 NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
1599 NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
1600 NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
1601 NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
1602 NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
1603 NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
1604 NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
1605 NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
1606 NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
/* Netlink attribute policy for IFLA_GRE_* parsing. */
1614 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1615 [IFLA_GRE_LINK] = { .type = NLA_U32 },
1616 [IFLA_GRE_IFLAGS] = { .type = NLA_U16 },
1617 [IFLA_GRE_OFLAGS] = { .type = NLA_U16 },
1618 [IFLA_GRE_IKEY] = { .type = NLA_U32 },
1619 [IFLA_GRE_OKEY] = { .type = NLA_U32 },
1620 [IFLA_GRE_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1621 [IFLA_GRE_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1622 [IFLA_GRE_TTL] = { .type = NLA_U8 },
1623 [IFLA_GRE_TOS] = { .type = NLA_U8 },
1624 [IFLA_GRE_PMTUDISC] = { .type = NLA_U8 },
/*
 * rtnl_link ops for "gre" devices.  NOTE(review): the .kind line is
 * missing from this excerpt.
 */
1627 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1629 .maxtype = IFLA_GRE_MAX,
1630 .policy = ipgre_policy,
1631 .priv_size = sizeof(struct ip_tunnel),
1632 .setup = ipgre_tunnel_setup,
1633 .validate = ipgre_tunnel_validate,
1634 .newlink = ipgre_newlink,
1635 .changelink = ipgre_changelink,
1636 .get_size = ipgre_get_size,
1637 .fill_info = ipgre_fill_info,
/*
 * rtnl_link ops for "gretap" devices.  NOTE(review): the .kind line is
 * missing from this excerpt.
 */
1640 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1642 .maxtype = IFLA_GRE_MAX,
1643 .policy = ipgre_policy,
1644 .priv_size = sizeof(struct ip_tunnel),
1645 .setup = ipgre_tap_setup,
1646 .validate = ipgre_tap_validate,
1647 .newlink = ipgre_newlink,
1648 .changelink = ipgre_changelink,
1649 .get_size = ipgre_get_size,
1650 .fill_info = ipgre_fill_info,
1654 * And now the modules code and kernel interface.
/*
 * Module init: register the IPPROTO_GRE handler, then the pernet
 * device and both rtnl_link kinds; unwind in reverse on failure.
 */
1657 static int __init ipgre_init(void)
1661 printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1663 if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
1664 printk(KERN_INFO "ipgre init: can't add protocol\n");
1668 err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
1670 goto gen_device_failed;
1672 err = rtnl_link_register(&ipgre_link_ops);
1674 goto rtnl_link_failed;
1676 err = rtnl_link_register(&ipgre_tap_ops);
1678 goto tap_ops_failed;
/* error unwind, reverse order of registration */
1684 rtnl_link_unregister(&ipgre_link_ops);
1686 unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1688 inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
/* Module exit: tear down strictly in the reverse order of ipgre_init(). */
1692 static void __exit ipgre_fini(void)
1694 rtnl_link_unregister(&ipgre_tap_ops);
1695 rtnl_link_unregister(&ipgre_link_ops);
1696 unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1697 if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1698 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1701 module_init(ipgre_init);
1702 module_exit(ipgre_fini);
1703 MODULE_LICENSE("GPL");
1704 MODULE_ALIAS_RTNL_LINK("gre");
1705 MODULE_ALIAS_RTNL_LINK("gretap");