2 * Linux NET3: GRE over IP protocol decoder.
4 * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/slab.h>
18 #include <asm/uaccess.h>
19 #include <linux/skbuff.h>
20 #include <linux/netdevice.h>
22 #include <linux/tcp.h>
23 #include <linux/udp.h>
24 #include <linux/if_arp.h>
25 #include <linux/mroute.h>
26 #include <linux/init.h>
27 #include <linux/in6.h>
28 #include <linux/inetdevice.h>
29 #include <linux/igmp.h>
30 #include <linux/netfilter_ipv4.h>
31 #include <linux/etherdevice.h>
32 #include <linux/if_ether.h>
37 #include <net/protocol.h>
40 #include <net/checksum.h>
41 #include <net/dsfield.h>
42 #include <net/inet_ecn.h>
44 #include <net/net_namespace.h>
45 #include <net/netns/generic.h>
46 #include <net/rtnetlink.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
59 1. The most important issue is detecting local dead loops.
60 They would cause complete host lockup in transmit, which
61 would be "resolved" by stack overflow or, if queueing is enabled,
62 with infinite looping in net_bh.
64 We cannot track such dead loops during route installation,
65 it is an infeasible task. The most general solutions would be
66 to keep skb->encapsulation counter (sort of local ttl),
67 and silently drop packet when it expires. It is the best
68 solution, but it supposes maintaining a new variable in ALL
69 skb, even if no tunneling is used.
71 Current solution: HARD_TX_LOCK lock breaks dead loops.
75 2. Networking dead loops would not kill routers, but would really
76 kill network. IP hop limit plays role of "t->recursion" in this case,
77 if we copy it from packet being encapsulated to upper header.
78 It is very good solution, but it introduces two problems:
80 - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
81 do not work over tunnels.
82 - traceroute does not work. I planned to relay ICMP from tunnel,
83 so that this problem would be solved and traceroute output
84 would be even more informative. This idea appeared to be wrong:
85 only Linux complies to rfc1812 now (yes, guys, Linux is the only
86 true router now :-)), all routers (at least, in neighbourhood of mine)
87 return only 8 bytes of payload. It is the end.
89 Hence, if we want that OSPF worked or traceroute said something reasonable,
90 we should search for another solution.
92 One of them is to parse packet trying to detect inner encapsulation
93 made by our node. It is difficult or even impossible, especially,
94 taking into account fragmentation. To be short, it is not a solution at all.
96 Current solution: The solution was UNEXPECTEDLY SIMPLE.
97 We force DF flag on tunnels with preconfigured hop limit,
98 that is ALL. :-) Well, it does not remove the problem completely,
99 but exponential growth of network traffic is changed to linear
100 (branches, that exceed pmtu are pruned) and tunnel mtu
101 quickly degrades to a value <68, where looping stops.
102 Yes, it is not good if there exists a router in the loop,
103 which does not force DF, even when encapsulating packets have DF set.
104 But it is not our problem! Nobody could accuse us, we made
105 all that we could make. Even if it is your gated who injected
106 fatal route to network, even if it were you who configured
107 fatal static route: you are innocent. :-)
111 3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
112 practically identical code. It would be good to glue them
113 together, but it is not very evident, how to make them modular.
114 sit is integral part of IPv6, ipip and gre are naturally modular.
115 We could extract common parts (hash table, ioctl etc)
116 to a separate module (ip_tunnel.c).
121 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
122 static int ipgre_tunnel_init(struct net_device *dev);
123 static void ipgre_tunnel_setup(struct net_device *dev);
124 static int ipgre_tunnel_bind_dev(struct net_device *dev);
126 /* Fallback tunnel: no source, no destination, no key, no options */
130 static int ipgre_net_id __read_mostly;
132 struct ip_tunnel *tunnels[4][HASH_SIZE];
134 struct net_device *fb_tunnel_dev;
137 /* Tunnel hash table */
147 We require exact key match i.e. if a key is present in packet
148 it will match only tunnel with the same key; if it is not present,
149 it will match only keyless tunnel.
151 All keyless packets, if not matched to configured keyless tunnels,
152 will match fallback tunnel.
155 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
157 #define tunnels_r_l tunnels[3]
158 #define tunnels_r tunnels[2]
159 #define tunnels_l tunnels[1]
160 #define tunnels_wc tunnels[0]
162 * Locking : hash tables are protected by RCU and a spinlock
164 static DEFINE_SPINLOCK(ipgre_lock);
166 #define for_each_ip_tunnel_rcu(start) \
167 for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
169 /* Given src, dst and key, find appropriate for input tunnel. */
171 static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
172 __be32 remote, __be32 local,
173 __be32 key, __be16 gre_proto)
175 struct net *net = dev_net(dev);
176 int link = dev->ifindex;
177 unsigned h0 = HASH(remote);
178 unsigned h1 = HASH(key);
179 struct ip_tunnel *t, *cand = NULL;
180 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
181 int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
182 ARPHRD_ETHER : ARPHRD_IPGRE;
183 int score, cand_score = 4;
185 for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
186 if (local != t->parms.iph.saddr ||
187 remote != t->parms.iph.daddr ||
188 key != t->parms.i_key ||
189 !(t->dev->flags & IFF_UP))
192 if (t->dev->type != ARPHRD_IPGRE &&
193 t->dev->type != dev_type)
197 if (t->parms.link != link)
199 if (t->dev->type != dev_type)
204 if (score < cand_score) {
210 for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
211 if (remote != t->parms.iph.daddr ||
212 key != t->parms.i_key ||
213 !(t->dev->flags & IFF_UP))
216 if (t->dev->type != ARPHRD_IPGRE &&
217 t->dev->type != dev_type)
221 if (t->parms.link != link)
223 if (t->dev->type != dev_type)
228 if (score < cand_score) {
234 for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
235 if ((local != t->parms.iph.saddr &&
236 (local != t->parms.iph.daddr ||
237 !ipv4_is_multicast(local))) ||
238 key != t->parms.i_key ||
239 !(t->dev->flags & IFF_UP))
242 if (t->dev->type != ARPHRD_IPGRE &&
243 t->dev->type != dev_type)
247 if (t->parms.link != link)
249 if (t->dev->type != dev_type)
254 if (score < cand_score) {
260 for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
261 if (t->parms.i_key != key ||
262 !(t->dev->flags & IFF_UP))
265 if (t->dev->type != ARPHRD_IPGRE &&
266 t->dev->type != dev_type)
270 if (t->parms.link != link)
272 if (t->dev->type != dev_type)
277 if (score < cand_score) {
286 dev = ign->fb_tunnel_dev;
287 if (dev->flags & IFF_UP)
288 return netdev_priv(dev);
293 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
294 struct ip_tunnel_parm *parms)
296 __be32 remote = parms->iph.daddr;
297 __be32 local = parms->iph.saddr;
298 __be32 key = parms->i_key;
299 unsigned h = HASH(key);
304 if (remote && !ipv4_is_multicast(remote)) {
309 return &ign->tunnels[prio][h];
312 static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
315 return __ipgre_bucket(ign, &t->parms);
318 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
320 struct ip_tunnel **tp = ipgre_bucket(ign, t);
322 spin_lock_bh(&ipgre_lock);
324 rcu_assign_pointer(*tp, t);
325 spin_unlock_bh(&ipgre_lock);
328 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
330 struct ip_tunnel **tp;
332 for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
334 spin_lock_bh(&ipgre_lock);
336 spin_unlock_bh(&ipgre_lock);
342 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
343 struct ip_tunnel_parm *parms,
346 __be32 remote = parms->iph.daddr;
347 __be32 local = parms->iph.saddr;
348 __be32 key = parms->i_key;
349 int link = parms->link;
350 struct ip_tunnel *t, **tp;
351 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
353 for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
354 if (local == t->parms.iph.saddr &&
355 remote == t->parms.iph.daddr &&
356 key == t->parms.i_key &&
357 link == t->parms.link &&
358 type == t->dev->type)
364 static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
365 struct ip_tunnel_parm *parms, int create)
367 struct ip_tunnel *t, *nt;
368 struct net_device *dev;
370 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
372 t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
377 strlcpy(name, parms->name, IFNAMSIZ);
379 sprintf(name, "gre%%d");
381 dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
385 dev_net_set(dev, net);
387 if (strchr(name, '%')) {
388 if (dev_alloc_name(dev, name) < 0)
392 nt = netdev_priv(dev);
394 dev->rtnl_link_ops = &ipgre_link_ops;
396 dev->mtu = ipgre_tunnel_bind_dev(dev);
398 if (register_netdevice(dev) < 0)
402 ipgre_tunnel_link(ign, nt);
410 static void ipgre_tunnel_uninit(struct net_device *dev)
412 struct net *net = dev_net(dev);
413 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
415 ipgre_tunnel_unlink(ign, netdev_priv(dev));
420 static void ipgre_err(struct sk_buff *skb, u32 info)
423 /* All the routers (except for Linux) return only
424 8 bytes of packet payload. It means, that precise relaying of
425 ICMP in the real Internet is absolutely infeasible.
427 Moreover, Cisco "wise men" put GRE key to the third word
428 in GRE header. It makes impossible maintaining even soft state for keyed
429 GRE tunnels with enabled checksum. Tell them "thank you".
431 Well, I wonder, rfc1812 was written by Cisco employee,
432 what the hell these idiots break standards established
436 struct iphdr *iph = (struct iphdr *)skb->data;
437 __be16 *p = (__be16*)(skb->data+(iph->ihl<<2));
438 int grehlen = (iph->ihl<<2) + 4;
439 const int type = icmp_hdr(skb)->type;
440 const int code = icmp_hdr(skb)->code;
445 if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
446 if (flags&(GRE_VERSION|GRE_ROUTING))
455 /* If only 8 bytes returned, keyed message will be dropped here */
456 if (skb_headlen(skb) < grehlen)
461 case ICMP_PARAMETERPROB:
464 case ICMP_DEST_UNREACH:
467 case ICMP_PORT_UNREACH:
468 /* Impossible event. */
470 case ICMP_FRAG_NEEDED:
471 /* Soft state for pmtu is maintained by IP core. */
474 /* All others are translated to HOST_UNREACH.
475 rfc2003 contains "deep thoughts" about NET_UNREACH,
476 I believe they are just ether pollution. --ANK
481 case ICMP_TIME_EXCEEDED:
482 if (code != ICMP_EXC_TTL)
488 t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
490 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
492 if (t == NULL || t->parms.iph.daddr == 0 ||
493 ipv4_is_multicast(t->parms.iph.daddr))
496 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
499 if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
503 t->err_time = jiffies;
508 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
510 if (INET_ECN_is_ce(iph->tos)) {
511 if (skb->protocol == htons(ETH_P_IP)) {
512 IP_ECN_set_ce(ip_hdr(skb));
513 } else if (skb->protocol == htons(ETH_P_IPV6)) {
514 IP6_ECN_set_ce(ipv6_hdr(skb));
520 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
523 if (skb->protocol == htons(ETH_P_IP))
524 inner = old_iph->tos;
525 else if (skb->protocol == htons(ETH_P_IPV6))
526 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
527 return INET_ECN_encapsulate(tos, inner);
530 static int ipgre_rcv(struct sk_buff *skb)
538 struct ip_tunnel *tunnel;
542 if (!pskb_may_pull(skb, 16))
549 if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
550 /* - Version must be 0.
551 - We do not support routing headers.
553 if (flags&(GRE_VERSION|GRE_ROUTING))
556 if (flags&GRE_CSUM) {
557 switch (skb->ip_summed) {
558 case CHECKSUM_COMPLETE:
559 csum = csum_fold(skb->csum);
565 csum = __skb_checksum_complete(skb);
566 skb->ip_summed = CHECKSUM_COMPLETE;
571 key = *(__be32*)(h + offset);
575 seqno = ntohl(*(__be32*)(h + offset));
580 gre_proto = *(__be16 *)(h + 2);
583 if ((tunnel = ipgre_tunnel_lookup(skb->dev,
584 iph->saddr, iph->daddr, key,
586 struct net_device_stats *stats = &tunnel->dev->stats;
590 skb->protocol = gre_proto;
591 /* WCCP version 1 and 2 protocol decoding.
592 * - Change protocol to IP
593 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
595 if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
596 skb->protocol = htons(ETH_P_IP);
597 if ((*(h + offset) & 0xF0) != 0x40)
601 skb->mac_header = skb->network_header;
602 __pskb_pull(skb, offset);
603 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
604 skb->pkt_type = PACKET_HOST;
605 #ifdef CONFIG_NET_IPGRE_BROADCAST
606 if (ipv4_is_multicast(iph->daddr)) {
607 /* Looped back packet, drop it! */
608 if (skb_rtable(skb)->fl.iif == 0)
611 skb->pkt_type = PACKET_BROADCAST;
615 if (((flags&GRE_CSUM) && csum) ||
616 (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
617 stats->rx_crc_errors++;
621 if (tunnel->parms.i_flags&GRE_SEQ) {
622 if (!(flags&GRE_SEQ) ||
623 (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
624 stats->rx_fifo_errors++;
628 tunnel->i_seqno = seqno + 1;
631 /* Warning: All skb pointers will be invalidated! */
632 if (tunnel->dev->type == ARPHRD_ETHER) {
633 if (!pskb_may_pull(skb, ETH_HLEN)) {
634 stats->rx_length_errors++;
640 skb->protocol = eth_type_trans(skb, tunnel->dev);
641 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
644 skb_tunnel_rx(skb, tunnel->dev);
646 skb_reset_network_header(skb);
647 ipgre_ecn_decapsulate(iph, skb);
653 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
662 static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
664 struct ip_tunnel *tunnel = netdev_priv(dev);
665 struct net_device_stats *stats = &dev->stats;
666 struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
667 struct iphdr *old_iph = ip_hdr(skb);
671 struct rtable *rt; /* Route to the other host */
672 struct net_device *tdev; /* Device to other host */
673 struct iphdr *iph; /* Our new IP header */
674 unsigned int max_headroom; /* The extra header space needed */
679 if (dev->type == ARPHRD_ETHER)
680 IPCB(skb)->flags = 0;
682 if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
684 tiph = (struct iphdr *)skb->data;
686 gre_hlen = tunnel->hlen;
687 tiph = &tunnel->parms.iph;
690 if ((dst = tiph->daddr) == 0) {
693 if (skb_dst(skb) == NULL) {
694 stats->tx_fifo_errors++;
698 if (skb->protocol == htons(ETH_P_IP)) {
699 rt = skb_rtable(skb);
700 if ((dst = rt->rt_gateway) == 0)
704 else if (skb->protocol == htons(ETH_P_IPV6)) {
705 struct in6_addr *addr6;
707 struct neighbour *neigh = skb_dst(skb)->neighbour;
712 addr6 = (struct in6_addr *)&neigh->primary_key;
713 addr_type = ipv6_addr_type(addr6);
715 if (addr_type == IPV6_ADDR_ANY) {
716 addr6 = &ipv6_hdr(skb)->daddr;
717 addr_type = ipv6_addr_type(addr6);
720 if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
723 dst = addr6->s6_addr32[3];
733 if (skb->protocol == htons(ETH_P_IP))
735 else if (skb->protocol == htons(ETH_P_IPV6))
736 tos = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
740 struct flowi fl = { .oif = tunnel->parms.link,
743 .saddr = tiph->saddr,
744 .tos = RT_TOS(tos) } },
745 .proto = IPPROTO_GRE };
746 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
747 stats->tx_carrier_errors++;
761 mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
763 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
766 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
768 if (skb->protocol == htons(ETH_P_IP)) {
769 df |= (old_iph->frag_off&htons(IP_DF));
771 if ((old_iph->frag_off&htons(IP_DF)) &&
772 mtu < ntohs(old_iph->tot_len)) {
773 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
779 else if (skb->protocol == htons(ETH_P_IPV6)) {
780 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
782 if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
783 if ((tunnel->parms.iph.daddr &&
784 !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
785 rt6->rt6i_dst.plen == 128) {
786 rt6->rt6i_flags |= RTF_MODIFIED;
787 skb_dst(skb)->metrics[RTAX_MTU-1] = mtu;
791 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
792 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
799 if (tunnel->err_count > 0) {
800 if (time_before(jiffies,
801 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
804 dst_link_failure(skb);
806 tunnel->err_count = 0;
809 max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;
811 if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
812 (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
813 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
814 if (max_headroom > dev->needed_headroom)
815 dev->needed_headroom = max_headroom;
823 skb_set_owner_w(new_skb, skb->sk);
826 old_iph = ip_hdr(skb);
829 skb_reset_transport_header(skb);
830 skb_push(skb, gre_hlen);
831 skb_reset_network_header(skb);
832 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
833 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
836 skb_dst_set(skb, &rt->dst);
839 * Push down and install the IPIP header.
844 iph->ihl = sizeof(struct iphdr) >> 2;
846 iph->protocol = IPPROTO_GRE;
847 iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb);
848 iph->daddr = rt->rt_dst;
849 iph->saddr = rt->rt_src;
851 if ((iph->ttl = tiph->ttl) == 0) {
852 if (skb->protocol == htons(ETH_P_IP))
853 iph->ttl = old_iph->ttl;
855 else if (skb->protocol == htons(ETH_P_IPV6))
856 iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
859 iph->ttl = dst_metric(&rt->dst, RTAX_HOPLIMIT);
862 ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
863 ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
864 htons(ETH_P_TEB) : skb->protocol;
866 if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
867 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
869 if (tunnel->parms.o_flags&GRE_SEQ) {
871 *ptr = htonl(tunnel->o_seqno);
874 if (tunnel->parms.o_flags&GRE_KEY) {
875 *ptr = tunnel->parms.o_key;
878 if (tunnel->parms.o_flags&GRE_CSUM) {
880 *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
890 dst_link_failure(skb);
898 static int ipgre_tunnel_bind_dev(struct net_device *dev)
900 struct net_device *tdev = NULL;
901 struct ip_tunnel *tunnel;
903 int hlen = LL_MAX_HEADER;
904 int mtu = ETH_DATA_LEN;
905 int addend = sizeof(struct iphdr) + 4;
907 tunnel = netdev_priv(dev);
908 iph = &tunnel->parms.iph;
910 /* Guess output device to choose reasonable mtu and needed_headroom */
913 struct flowi fl = { .oif = tunnel->parms.link,
915 { .daddr = iph->daddr,
917 .tos = RT_TOS(iph->tos) } },
918 .proto = IPPROTO_GRE };
920 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
925 if (dev->type != ARPHRD_ETHER)
926 dev->flags |= IFF_POINTOPOINT;
929 if (!tdev && tunnel->parms.link)
930 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
933 hlen = tdev->hard_header_len + tdev->needed_headroom;
936 dev->iflink = tunnel->parms.link;
938 /* Precalculate GRE options length */
939 if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
940 if (tunnel->parms.o_flags&GRE_CSUM)
942 if (tunnel->parms.o_flags&GRE_KEY)
944 if (tunnel->parms.o_flags&GRE_SEQ)
947 dev->needed_headroom = addend + hlen;
948 mtu -= dev->hard_header_len + addend;
953 tunnel->hlen = addend;
959 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
962 struct ip_tunnel_parm p;
964 struct net *net = dev_net(dev);
965 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
970 if (dev == ign->fb_tunnel_dev) {
971 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
975 t = ipgre_tunnel_locate(net, &p, 0);
978 t = netdev_priv(dev);
979 memcpy(&p, &t->parms, sizeof(p));
980 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
987 if (!capable(CAP_NET_ADMIN))
991 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
995 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
996 p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
997 ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
1000 p.iph.frag_off |= htons(IP_DF);
1002 if (!(p.i_flags&GRE_KEY))
1004 if (!(p.o_flags&GRE_KEY))
1007 t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
1009 if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
1011 if (t->dev != dev) {
1016 unsigned nflags = 0;
1018 t = netdev_priv(dev);
1020 if (ipv4_is_multicast(p.iph.daddr))
1021 nflags = IFF_BROADCAST;
1022 else if (p.iph.daddr)
1023 nflags = IFF_POINTOPOINT;
1025 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
1029 ipgre_tunnel_unlink(ign, t);
1030 t->parms.iph.saddr = p.iph.saddr;
1031 t->parms.iph.daddr = p.iph.daddr;
1032 t->parms.i_key = p.i_key;
1033 t->parms.o_key = p.o_key;
1034 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1035 memcpy(dev->broadcast, &p.iph.daddr, 4);
1036 ipgre_tunnel_link(ign, t);
1037 netdev_state_change(dev);
1043 if (cmd == SIOCCHGTUNNEL) {
1044 t->parms.iph.ttl = p.iph.ttl;
1045 t->parms.iph.tos = p.iph.tos;
1046 t->parms.iph.frag_off = p.iph.frag_off;
1047 if (t->parms.link != p.link) {
1048 t->parms.link = p.link;
1049 dev->mtu = ipgre_tunnel_bind_dev(dev);
1050 netdev_state_change(dev);
1053 if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1056 err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
1061 if (!capable(CAP_NET_ADMIN))
1064 if (dev == ign->fb_tunnel_dev) {
1066 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1069 if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
1072 if (t == netdev_priv(ign->fb_tunnel_dev))
1076 unregister_netdevice(dev);
1088 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1090 struct ip_tunnel *tunnel = netdev_priv(dev);
1092 new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1098 /* Nice toy. Unfortunately, useless in real life :-)
1099 It allows to construct virtual multiprotocol broadcast "LAN"
1100 over the Internet, provided multicast routing is tuned.
1103 I have no idea was this bicycle invented before me,
1104 so that I had to set ARPHRD_IPGRE to a random value.
1105 I have an impression, that Cisco could make something similar,
1106 but this feature is apparently missing in IOS<=11.2(8).
1108 I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1109 with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1111 ping -t 255 224.66.66.66
1113 If nobody answers, mbone does not work.
1115 ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1116 ip addr add 10.66.66.<somewhat>/24 dev Universe
1117 ifconfig Universe up
1118 ifconfig Universe add fe80::<Your_real_addr>/10
1119 ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1122 ftp fec0:6666:6666::193.233.7.65
1127 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1128 unsigned short type,
1129 const void *daddr, const void *saddr, unsigned len)
1131 struct ip_tunnel *t = netdev_priv(dev);
1132 struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1133 __be16 *p = (__be16*)(iph+1);
1135 memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1136 p[0] = t->parms.o_flags;
1140 * Set the source hardware address.
1144 memcpy(&iph->saddr, saddr, 4);
1146 memcpy(&iph->daddr, daddr, 4);
1153 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1155 struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
1156 memcpy(haddr, &iph->saddr, 4);
1160 static const struct header_ops ipgre_header_ops = {
1161 .create = ipgre_header,
1162 .parse = ipgre_header_parse,
1165 #ifdef CONFIG_NET_IPGRE_BROADCAST
1166 static int ipgre_open(struct net_device *dev)
1168 struct ip_tunnel *t = netdev_priv(dev);
1170 if (ipv4_is_multicast(t->parms.iph.daddr)) {
1171 struct flowi fl = { .oif = t->parms.link,
1173 { .daddr = t->parms.iph.daddr,
1174 .saddr = t->parms.iph.saddr,
1175 .tos = RT_TOS(t->parms.iph.tos) } },
1176 .proto = IPPROTO_GRE };
1178 if (ip_route_output_key(dev_net(dev), &rt, &fl))
1179 return -EADDRNOTAVAIL;
1182 if (__in_dev_get_rtnl(dev) == NULL)
1183 return -EADDRNOTAVAIL;
1184 t->mlink = dev->ifindex;
1185 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1190 static int ipgre_close(struct net_device *dev)
1192 struct ip_tunnel *t = netdev_priv(dev);
1194 if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1195 struct in_device *in_dev;
1196 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1198 ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1207 static const struct net_device_ops ipgre_netdev_ops = {
1208 .ndo_init = ipgre_tunnel_init,
1209 .ndo_uninit = ipgre_tunnel_uninit,
1210 #ifdef CONFIG_NET_IPGRE_BROADCAST
1211 .ndo_open = ipgre_open,
1212 .ndo_stop = ipgre_close,
1214 .ndo_start_xmit = ipgre_tunnel_xmit,
1215 .ndo_do_ioctl = ipgre_tunnel_ioctl,
1216 .ndo_change_mtu = ipgre_tunnel_change_mtu,
1219 static void ipgre_tunnel_setup(struct net_device *dev)
1221 dev->netdev_ops = &ipgre_netdev_ops;
1222 dev->destructor = free_netdev;
1224 dev->type = ARPHRD_IPGRE;
1225 dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1226 dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1227 dev->flags = IFF_NOARP;
1230 dev->features |= NETIF_F_NETNS_LOCAL;
1231 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
1234 static int ipgre_tunnel_init(struct net_device *dev)
1236 struct ip_tunnel *tunnel;
1239 tunnel = netdev_priv(dev);
1240 iph = &tunnel->parms.iph;
1243 strcpy(tunnel->parms.name, dev->name);
1245 memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1246 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1249 #ifdef CONFIG_NET_IPGRE_BROADCAST
1250 if (ipv4_is_multicast(iph->daddr)) {
1253 dev->flags = IFF_BROADCAST;
1254 dev->header_ops = &ipgre_header_ops;
1258 dev->header_ops = &ipgre_header_ops;
1263 static void ipgre_fb_tunnel_init(struct net_device *dev)
1265 struct ip_tunnel *tunnel = netdev_priv(dev);
1266 struct iphdr *iph = &tunnel->parms.iph;
1267 struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1270 strcpy(tunnel->parms.name, dev->name);
1273 iph->protocol = IPPROTO_GRE;
1275 tunnel->hlen = sizeof(struct iphdr) + 4;
1278 ign->tunnels_wc[0] = tunnel;
1282 static const struct gre_protocol ipgre_protocol = {
1283 .handler = ipgre_rcv,
1284 .err_handler = ipgre_err,
1287 static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1291 for (prio = 0; prio < 4; prio++) {
1293 for (h = 0; h < HASH_SIZE; h++) {
1294 struct ip_tunnel *t = ign->tunnels[prio][h];
1297 unregister_netdevice_queue(t->dev, head);
1304 static int __net_init ipgre_init_net(struct net *net)
1306 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1309 ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1310 ipgre_tunnel_setup);
1311 if (!ign->fb_tunnel_dev) {
1315 dev_net_set(ign->fb_tunnel_dev, net);
1317 ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1318 ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1320 if ((err = register_netdev(ign->fb_tunnel_dev)))
1326 free_netdev(ign->fb_tunnel_dev);
1331 static void __net_exit ipgre_exit_net(struct net *net)
1333 struct ipgre_net *ign;
1336 ign = net_generic(net, ipgre_net_id);
1338 ipgre_destroy_tunnels(ign, &list);
1339 unregister_netdevice_many(&list);
1343 static struct pernet_operations ipgre_net_ops = {
1344 .init = ipgre_init_net,
1345 .exit = ipgre_exit_net,
1346 .id = &ipgre_net_id,
1347 .size = sizeof(struct ipgre_net),
1350 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1358 if (data[IFLA_GRE_IFLAGS])
1359 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1360 if (data[IFLA_GRE_OFLAGS])
1361 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1362 if (flags & (GRE_VERSION|GRE_ROUTING))
1368 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1372 if (tb[IFLA_ADDRESS]) {
1373 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1375 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1376 return -EADDRNOTAVAIL;
1382 if (data[IFLA_GRE_REMOTE]) {
1383 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1389 return ipgre_tunnel_validate(tb, data);
1392 static void ipgre_netlink_parms(struct nlattr *data[],
1393 struct ip_tunnel_parm *parms)
1395 memset(parms, 0, sizeof(*parms));
1397 parms->iph.protocol = IPPROTO_GRE;
1402 if (data[IFLA_GRE_LINK])
1403 parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1405 if (data[IFLA_GRE_IFLAGS])
1406 parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1408 if (data[IFLA_GRE_OFLAGS])
1409 parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1411 if (data[IFLA_GRE_IKEY])
1412 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1414 if (data[IFLA_GRE_OKEY])
1415 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1417 if (data[IFLA_GRE_LOCAL])
1418 parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1420 if (data[IFLA_GRE_REMOTE])
1421 parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1423 if (data[IFLA_GRE_TTL])
1424 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1426 if (data[IFLA_GRE_TOS])
1427 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1429 if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1430 parms->iph.frag_off = htons(IP_DF);
1433 static int ipgre_tap_init(struct net_device *dev)
1435 struct ip_tunnel *tunnel;
1437 tunnel = netdev_priv(dev);
1440 strcpy(tunnel->parms.name, dev->name);
1442 ipgre_tunnel_bind_dev(dev);
1447 static const struct net_device_ops ipgre_tap_netdev_ops = {
1448 .ndo_init = ipgre_tap_init,
1449 .ndo_uninit = ipgre_tunnel_uninit,
1450 .ndo_start_xmit = ipgre_tunnel_xmit,
1451 .ndo_set_mac_address = eth_mac_addr,
1452 .ndo_validate_addr = eth_validate_addr,
1453 .ndo_change_mtu = ipgre_tunnel_change_mtu,
1456 static void ipgre_tap_setup(struct net_device *dev)
1461 dev->netdev_ops = &ipgre_tap_netdev_ops;
1462 dev->destructor = free_netdev;
1465 dev->features |= NETIF_F_NETNS_LOCAL;
1468 static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
1469 struct nlattr *data[])
1471 struct ip_tunnel *nt;
1472 struct net *net = dev_net(dev);
1473 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1477 nt = netdev_priv(dev);
1478 ipgre_netlink_parms(data, &nt->parms);
1480 if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1483 if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1484 random_ether_addr(dev->dev_addr);
1486 mtu = ipgre_tunnel_bind_dev(dev);
1490 err = register_netdevice(dev);
1495 ipgre_tunnel_link(ign, nt);
1501 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1502 struct nlattr *data[])
1504 struct ip_tunnel *t, *nt;
1505 struct net *net = dev_net(dev);
1506 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1507 struct ip_tunnel_parm p;
1510 if (dev == ign->fb_tunnel_dev)
1513 nt = netdev_priv(dev);
1514 ipgre_netlink_parms(data, &p);
1516 t = ipgre_tunnel_locate(net, &p, 0);
1524 if (dev->type != ARPHRD_ETHER) {
1525 unsigned nflags = 0;
1527 if (ipv4_is_multicast(p.iph.daddr))
1528 nflags = IFF_BROADCAST;
1529 else if (p.iph.daddr)
1530 nflags = IFF_POINTOPOINT;
1532 if ((dev->flags ^ nflags) &
1533 (IFF_POINTOPOINT | IFF_BROADCAST))
1537 ipgre_tunnel_unlink(ign, t);
1538 t->parms.iph.saddr = p.iph.saddr;
1539 t->parms.iph.daddr = p.iph.daddr;
1540 t->parms.i_key = p.i_key;
1541 if (dev->type != ARPHRD_ETHER) {
1542 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1543 memcpy(dev->broadcast, &p.iph.daddr, 4);
1545 ipgre_tunnel_link(ign, t);
1546 netdev_state_change(dev);
1549 t->parms.o_key = p.o_key;
1550 t->parms.iph.ttl = p.iph.ttl;
1551 t->parms.iph.tos = p.iph.tos;
1552 t->parms.iph.frag_off = p.iph.frag_off;
1554 if (t->parms.link != p.link) {
1555 t->parms.link = p.link;
1556 mtu = ipgre_tunnel_bind_dev(dev);
1559 netdev_state_change(dev);
1565 static size_t ipgre_get_size(const struct net_device *dev)
1570 /* IFLA_GRE_IFLAGS */
1572 /* IFLA_GRE_OFLAGS */
1578 /* IFLA_GRE_LOCAL */
1580 /* IFLA_GRE_REMOTE */
1586 /* IFLA_GRE_PMTUDISC */
1591 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1593 struct ip_tunnel *t = netdev_priv(dev);
1594 struct ip_tunnel_parm *p = &t->parms;
1596 NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
1597 NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
1598 NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
1599 NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
1600 NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
1601 NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
1602 NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
1603 NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
1604 NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
1605 NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
1613 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1614 [IFLA_GRE_LINK] = { .type = NLA_U32 },
1615 [IFLA_GRE_IFLAGS] = { .type = NLA_U16 },
1616 [IFLA_GRE_OFLAGS] = { .type = NLA_U16 },
1617 [IFLA_GRE_IKEY] = { .type = NLA_U32 },
1618 [IFLA_GRE_OKEY] = { .type = NLA_U32 },
1619 [IFLA_GRE_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1620 [IFLA_GRE_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1621 [IFLA_GRE_TTL] = { .type = NLA_U8 },
1622 [IFLA_GRE_TOS] = { .type = NLA_U8 },
1623 [IFLA_GRE_PMTUDISC] = { .type = NLA_U8 },
1626 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1628 .maxtype = IFLA_GRE_MAX,
1629 .policy = ipgre_policy,
1630 .priv_size = sizeof(struct ip_tunnel),
1631 .setup = ipgre_tunnel_setup,
1632 .validate = ipgre_tunnel_validate,
1633 .newlink = ipgre_newlink,
1634 .changelink = ipgre_changelink,
1635 .get_size = ipgre_get_size,
1636 .fill_info = ipgre_fill_info,
1639 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1641 .maxtype = IFLA_GRE_MAX,
1642 .policy = ipgre_policy,
1643 .priv_size = sizeof(struct ip_tunnel),
1644 .setup = ipgre_tap_setup,
1645 .validate = ipgre_tap_validate,
1646 .newlink = ipgre_newlink,
1647 .changelink = ipgre_changelink,
1648 .get_size = ipgre_get_size,
1649 .fill_info = ipgre_fill_info,
1653 * And now the modules code and kernel interface.
1656 static int __init ipgre_init(void)
1660 printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1662 err = register_pernet_device(&ipgre_net_ops);
1666 err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1668 printk(KERN_INFO "ipgre init: can't add protocol\n");
1669 goto add_proto_failed;
1672 err = rtnl_link_register(&ipgre_link_ops);
1674 goto rtnl_link_failed;
1676 err = rtnl_link_register(&ipgre_tap_ops);
1678 goto tap_ops_failed;
1684 rtnl_link_unregister(&ipgre_link_ops);
1686 gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1688 unregister_pernet_device(&ipgre_net_ops);
1692 static void __exit ipgre_fini(void)
1694 rtnl_link_unregister(&ipgre_tap_ops);
1695 rtnl_link_unregister(&ipgre_link_ops);
1696 if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
1697 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1698 unregister_pernet_device(&ipgre_net_ops);
1701 module_init(ipgre_init);
1702 module_exit(ipgre_fini);
1703 MODULE_LICENSE("GPL");
1704 MODULE_ALIAS_RTNL_LINK("gre");
1705 MODULE_ALIAS_RTNL_LINK("gretap");