]> git.karo-electronics.de Git - karo-tx-linux.git/blob - net/ipv4/ipip.c
ipv4: Handle PMTU in all ICMP error handlers.
[karo-tx-linux.git] / net / ipv4 / ipip.c
1 /*
2  *      Linux NET3:     IP/IP protocol decoder.
3  *
4  *      Authors:
5  *              Sam Lantinga (slouken@cs.ucdavis.edu)  02/01/95
6  *
7  *      Fixes:
8  *              Alan Cox        :       Merged and made usable non-modular (it's so tiny it's silly as
9  *                                      a module taking up 2 pages).
10  *              Alan Cox        :       Fixed bug with 1.3.18 and IPIP not working (now needs to set skb->h.iph)
11  *                                      to keep ip_forward happy.
12  *              Alan Cox        :       More fixes for 1.3.21, and firewall fix. Maybe this will work soon 8).
13  *              Kai Schulte     :       Fixed #defines for IP_FIREWALL->FIREWALL
14  *              David Woodhouse :       Perform some basic ICMP handling.
15  *                                      IPIP Routing without decapsulation.
16  *              Carlos Picoto   :       GRE over IP support
17  *              Alexey Kuznetsov:       Reworked. Really, now it is truncated version of ipv4/ip_gre.c.
18  *                                      I do not want to merge them together.
19  *
20  *      This program is free software; you can redistribute it and/or
21  *      modify it under the terms of the GNU General Public License
22  *      as published by the Free Software Foundation; either version
23  *      2 of the License, or (at your option) any later version.
24  *
25  */
26
27 /* tunnel.c: an IP tunnel driver
28
29         The purpose of this driver is to provide an IP tunnel through
30         which you can tunnel network traffic transparently across subnets.
31
32         This was written by looking at Nick Holloway's dummy driver
33         Thanks for the great code!
34
35                 -Sam Lantinga   (slouken@cs.ucdavis.edu)  02/01/95
36
37         Minor tweaks:
38                 Cleaned up the code a little and added some pre-1.3.0 tweaks.
39                 dev->hard_header/hard_header_len changed to use no headers.
40                 Comments/bracketing tweaked.
41                 Made the tunnels use dev->name not tunnel: when error reporting.
42                 Added tx_dropped stat
43
44                 -Alan Cox       (alan@lxorguk.ukuu.org.uk) 21 March 95
45
46         Reworked:
47                 Changed to tunnel to destination gateway in addition to the
48                         tunnel's pointopoint address
49                 Almost completely rewritten
50                 Note:  There is currently no firewall or ICMP handling done.
51
52                 -Sam Lantinga   (slouken@cs.ucdavis.edu) 02/13/96
53
54 */
55
56 /* Things I wish I had known when writing the tunnel driver:
57
58         When the tunnel_xmit() function is called, the skb contains the
59         packet to be sent (plus a great deal of extra info), and dev
60         contains the tunnel device that _we_ are.
61
62         When we are passed a packet, we are expected to fill in the
63         source address with our source IP address.
64
65         What is the proper way to allocate, copy and free a buffer?
66         After you allocate it, it is a "0 length" chunk of memory
67         starting at zero.  If you want to add headers to the buffer
68         later, you'll have to call "skb_reserve(skb, amount)" with
69         the amount of memory you want reserved.  Then, you call
70         "skb_put(skb, amount)" with the amount of space you want in
71         the buffer.  skb_put() returns a pointer to the top (#0) of
72         that buffer.  skb->len is set to the amount of space you have
73         "allocated" with skb_put().  You can then write up to skb->len
74         bytes to that buffer.  If you need more, you can call skb_put()
75         again with the additional amount of space you need.  You can
76         find out how much more space you can allocate by calling
77         "skb_tailroom(skb)".
78         Now, to add header space, call "skb_push(skb, header_len)".
79         This creates space at the beginning of the buffer and returns
80         a pointer to this new space.  If later you need to strip a
81         header from a buffer, call "skb_pull(skb, header_len)".
82         skb_headroom() will return how much space is left at the top
83         of the buffer (before the main data).  Remember, this headroom
84         space must be reserved before the skb_put() function is called.
85         */
86
87 /*
88    This version of net/ipv4/ipip.c is a clone of net/ipv4/ip_gre.c
89
90    For comments look at net/ipv4/ip_gre.c --ANK
91  */
92
93
94 #include <linux/capability.h>
95 #include <linux/module.h>
96 #include <linux/types.h>
97 #include <linux/kernel.h>
98 #include <linux/slab.h>
99 #include <asm/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <linux/in.h>
103 #include <linux/tcp.h>
104 #include <linux/udp.h>
105 #include <linux/if_arp.h>
106 #include <linux/mroute.h>
107 #include <linux/init.h>
108 #include <linux/netfilter_ipv4.h>
109 #include <linux/if_ether.h>
110
111 #include <net/sock.h>
112 #include <net/ip.h>
113 #include <net/icmp.h>
114 #include <net/ipip.h>
115 #include <net/inet_ecn.h>
116 #include <net/xfrm.h>
117 #include <net/net_namespace.h>
118 #include <net/netns/generic.h>
119
/* Number of buckets in each tunnel hash table; must stay a power of two. */
#define HASH_SIZE  16

/*
 * Fold an IPv4 address (__be32) into a bucket index in [0, HASH_SIZE).
 *
 * The argument is fully parenthesized so that compound expressions such as
 * HASH(a | b) hash the whole value instead of being torn apart by operator
 * precedence.  Note that @addr is still evaluated twice, so it must not
 * have side effects.
 */
#define HASH(addr) \
	((((__force u32)(addr)) ^ (((__force u32)(addr)) >> 4)) & (HASH_SIZE - 1))
122
123 static int ipip_net_id __read_mostly;
124 struct ipip_net {
125         struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
126         struct ip_tunnel __rcu *tunnels_r[HASH_SIZE];
127         struct ip_tunnel __rcu *tunnels_l[HASH_SIZE];
128         struct ip_tunnel __rcu *tunnels_wc[1];
129         struct ip_tunnel __rcu **tunnels[4];
130
131         struct net_device *fb_tunnel_dev;
132 };
133
/* Forward declarations of helpers that are defined later in this file. */
static void ipip_dev_free(struct net_device *dev);
static void ipip_tunnel_setup(struct net_device *dev);
static int ipip_tunnel_init(struct net_device *dev);
137
/*
 * Locking: the hash tables are protected by RCU and RTNL.
 */

/*
 * Walk the RCU-protected tunnel chain that begins at @start.
 * The loop cursor is a variable named 't', which the user of this macro
 * has to declare as 'struct ip_tunnel *'.
 */
#define for_each_ip_tunnel_rcu(start) \
	for (t = rcu_dereference(start); t != NULL; t = rcu_dereference(t->next))
144
145 /* often modified stats are per cpu, other are shared (netdev->stats) */
146 struct pcpu_tstats {
147         u64     rx_packets;
148         u64     rx_bytes;
149         u64     tx_packets;
150         u64     tx_bytes;
151         struct u64_stats_sync   syncp;
152 };
153
154 static struct rtnl_link_stats64 *ipip_get_stats64(struct net_device *dev,
155                                                   struct rtnl_link_stats64 *tot)
156 {
157         int i;
158
159         for_each_possible_cpu(i) {
160                 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
161                 u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
162                 unsigned int start;
163
164                 do {
165                         start = u64_stats_fetch_begin_bh(&tstats->syncp);
166                         rx_packets = tstats->rx_packets;
167                         tx_packets = tstats->tx_packets;
168                         rx_bytes = tstats->rx_bytes;
169                         tx_bytes = tstats->tx_bytes;
170                 } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
171
172                 tot->rx_packets += rx_packets;
173                 tot->tx_packets += tx_packets;
174                 tot->rx_bytes   += rx_bytes;
175                 tot->tx_bytes   += tx_bytes;
176         }
177
178         tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
179         tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
180         tot->tx_dropped = dev->stats.tx_dropped;
181         tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
182         tot->tx_errors = dev->stats.tx_errors;
183         tot->collisions = dev->stats.collisions;
184
185         return tot;
186 }
187
188 static struct ip_tunnel *ipip_tunnel_lookup(struct net *net,
189                 __be32 remote, __be32 local)
190 {
191         unsigned int h0 = HASH(remote);
192         unsigned int h1 = HASH(local);
193         struct ip_tunnel *t;
194         struct ipip_net *ipn = net_generic(net, ipip_net_id);
195
196         for_each_ip_tunnel_rcu(ipn->tunnels_r_l[h0 ^ h1])
197                 if (local == t->parms.iph.saddr &&
198                     remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
199                         return t;
200
201         for_each_ip_tunnel_rcu(ipn->tunnels_r[h0])
202                 if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
203                         return t;
204
205         for_each_ip_tunnel_rcu(ipn->tunnels_l[h1])
206                 if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP))
207                         return t;
208
209         t = rcu_dereference(ipn->tunnels_wc[0]);
210         if (t && (t->dev->flags&IFF_UP))
211                 return t;
212         return NULL;
213 }
214
215 static struct ip_tunnel __rcu **__ipip_bucket(struct ipip_net *ipn,
216                 struct ip_tunnel_parm *parms)
217 {
218         __be32 remote = parms->iph.daddr;
219         __be32 local = parms->iph.saddr;
220         unsigned int h = 0;
221         int prio = 0;
222
223         if (remote) {
224                 prio |= 2;
225                 h ^= HASH(remote);
226         }
227         if (local) {
228                 prio |= 1;
229                 h ^= HASH(local);
230         }
231         return &ipn->tunnels[prio][h];
232 }
233
234 static inline struct ip_tunnel __rcu **ipip_bucket(struct ipip_net *ipn,
235                 struct ip_tunnel *t)
236 {
237         return __ipip_bucket(ipn, &t->parms);
238 }
239
240 static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t)
241 {
242         struct ip_tunnel __rcu **tp;
243         struct ip_tunnel *iter;
244
245         for (tp = ipip_bucket(ipn, t);
246              (iter = rtnl_dereference(*tp)) != NULL;
247              tp = &iter->next) {
248                 if (t == iter) {
249                         rcu_assign_pointer(*tp, t->next);
250                         break;
251                 }
252         }
253 }
254
255 static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t)
256 {
257         struct ip_tunnel __rcu **tp = ipip_bucket(ipn, t);
258
259         rcu_assign_pointer(t->next, rtnl_dereference(*tp));
260         rcu_assign_pointer(*tp, t);
261 }
262
263 static struct ip_tunnel *ipip_tunnel_locate(struct net *net,
264                 struct ip_tunnel_parm *parms, int create)
265 {
266         __be32 remote = parms->iph.daddr;
267         __be32 local = parms->iph.saddr;
268         struct ip_tunnel *t, *nt;
269         struct ip_tunnel __rcu **tp;
270         struct net_device *dev;
271         char name[IFNAMSIZ];
272         struct ipip_net *ipn = net_generic(net, ipip_net_id);
273
274         for (tp = __ipip_bucket(ipn, parms);
275                  (t = rtnl_dereference(*tp)) != NULL;
276                  tp = &t->next) {
277                 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
278                         return t;
279         }
280         if (!create)
281                 return NULL;
282
283         if (parms->name[0])
284                 strlcpy(name, parms->name, IFNAMSIZ);
285         else
286                 strcpy(name, "tunl%d");
287
288         dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup);
289         if (dev == NULL)
290                 return NULL;
291
292         dev_net_set(dev, net);
293
294         nt = netdev_priv(dev);
295         nt->parms = *parms;
296
297         if (ipip_tunnel_init(dev) < 0)
298                 goto failed_free;
299
300         if (register_netdevice(dev) < 0)
301                 goto failed_free;
302
303         strcpy(nt->parms.name, dev->name);
304
305         dev_hold(dev);
306         ipip_tunnel_link(ipn, nt);
307         return nt;
308
309 failed_free:
310         ipip_dev_free(dev);
311         return NULL;
312 }
313
314 /* called with RTNL */
315 static void ipip_tunnel_uninit(struct net_device *dev)
316 {
317         struct net *net = dev_net(dev);
318         struct ipip_net *ipn = net_generic(net, ipip_net_id);
319
320         if (dev == ipn->fb_tunnel_dev)
321                 RCU_INIT_POINTER(ipn->tunnels_wc[0], NULL);
322         else
323                 ipip_tunnel_unlink(ipn, netdev_priv(dev));
324         dev_put(dev);
325 }
326
327 static int ipip_err(struct sk_buff *skb, u32 info)
328 {
329
330 /* All the routers (except for Linux) return only
331    8 bytes of packet payload. It means, that precise relaying of
332    ICMP in the real Internet is absolutely infeasible.
333  */
334         const struct iphdr *iph = (const struct iphdr *)skb->data;
335         const int type = icmp_hdr(skb)->type;
336         const int code = icmp_hdr(skb)->code;
337         struct ip_tunnel *t;
338         int err;
339
340         switch (type) {
341         default:
342         case ICMP_PARAMETERPROB:
343                 return 0;
344
345         case ICMP_DEST_UNREACH:
346                 switch (code) {
347                 case ICMP_SR_FAILED:
348                 case ICMP_PORT_UNREACH:
349                         /* Impossible event. */
350                         return 0;
351                 default:
352                         /* All others are translated to HOST_UNREACH.
353                            rfc2003 contains "deep thoughts" about NET_UNREACH,
354                            I believe they are just ether pollution. --ANK
355                          */
356                         break;
357                 }
358                 break;
359         case ICMP_TIME_EXCEEDED:
360                 if (code != ICMP_EXC_TTL)
361                         return 0;
362                 break;
363         }
364
365         err = -ENOENT;
366
367         rcu_read_lock();
368         t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
369         if (t == NULL)
370                 goto out;
371
372         if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
373                 ipv4_update_pmtu(skb, dev_net(skb->dev), info,
374                                  t->dev->ifindex, 0, IPPROTO_IPIP, 0);
375                 err = 0;
376                 goto out;
377         }
378
379         if (t->parms.iph.daddr == 0)
380                 goto out;
381
382         err = 0;
383         if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
384                 goto out;
385
386         if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
387                 t->err_count++;
388         else
389                 t->err_count = 1;
390         t->err_time = jiffies;
391 out:
392         rcu_read_unlock();
393         return err;
394 }
395
396 static inline void ipip_ecn_decapsulate(const struct iphdr *outer_iph,
397                                         struct sk_buff *skb)
398 {
399         struct iphdr *inner_iph = ip_hdr(skb);
400
401         if (INET_ECN_is_ce(outer_iph->tos))
402                 IP_ECN_set_ce(inner_iph);
403 }
404
405 static int ipip_rcv(struct sk_buff *skb)
406 {
407         struct ip_tunnel *tunnel;
408         const struct iphdr *iph = ip_hdr(skb);
409
410         rcu_read_lock();
411         tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
412         if (tunnel != NULL) {
413                 struct pcpu_tstats *tstats;
414
415                 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
416                         rcu_read_unlock();
417                         kfree_skb(skb);
418                         return 0;
419                 }
420
421                 secpath_reset(skb);
422
423                 skb->mac_header = skb->network_header;
424                 skb_reset_network_header(skb);
425                 skb->protocol = htons(ETH_P_IP);
426                 skb->pkt_type = PACKET_HOST;
427
428                 tstats = this_cpu_ptr(tunnel->dev->tstats);
429                 u64_stats_update_begin(&tstats->syncp);
430                 tstats->rx_packets++;
431                 tstats->rx_bytes += skb->len;
432                 u64_stats_update_end(&tstats->syncp);
433
434                 __skb_tunnel_rx(skb, tunnel->dev);
435
436                 ipip_ecn_decapsulate(iph, skb);
437
438                 netif_rx(skb);
439
440                 rcu_read_unlock();
441                 return 0;
442         }
443         rcu_read_unlock();
444
445         return -1;
446 }
447
448 /*
449  *      This function assumes it is being called from dev_queue_xmit()
450  *      and that skb is filled properly by that function.
451  */
452
453 static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
454 {
455         struct ip_tunnel *tunnel = netdev_priv(dev);
456         struct pcpu_tstats *tstats;
457         const struct iphdr  *tiph = &tunnel->parms.iph;
458         u8     tos = tunnel->parms.iph.tos;
459         __be16 df = tiph->frag_off;
460         struct rtable *rt;                      /* Route to the other host */
461         struct net_device *tdev;                /* Device to other host */
462         const struct iphdr  *old_iph = ip_hdr(skb);
463         struct iphdr  *iph;                     /* Our new IP header */
464         unsigned int max_headroom;              /* The extra header space needed */
465         __be32 dst = tiph->daddr;
466         struct flowi4 fl4;
467         int    mtu;
468
469         if (skb->protocol != htons(ETH_P_IP))
470                 goto tx_error;
471
472         if (tos & 1)
473                 tos = old_iph->tos;
474
475         if (!dst) {
476                 /* NBMA tunnel */
477                 if ((rt = skb_rtable(skb)) == NULL) {
478                         dev->stats.tx_fifo_errors++;
479                         goto tx_error;
480                 }
481                 dst = rt->rt_gateway;
482         }
483
484         rt = ip_route_output_ports(dev_net(dev), &fl4, NULL,
485                                    dst, tiph->saddr,
486                                    0, 0,
487                                    IPPROTO_IPIP, RT_TOS(tos),
488                                    tunnel->parms.link);
489         if (IS_ERR(rt)) {
490                 dev->stats.tx_carrier_errors++;
491                 goto tx_error_icmp;
492         }
493         tdev = rt->dst.dev;
494
495         if (tdev == dev) {
496                 ip_rt_put(rt);
497                 dev->stats.collisions++;
498                 goto tx_error;
499         }
500
501         df |= old_iph->frag_off & htons(IP_DF);
502
503         if (df) {
504                 mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
505
506                 if (mtu < 68) {
507                         dev->stats.collisions++;
508                         ip_rt_put(rt);
509                         goto tx_error;
510                 }
511
512                 if (skb_dst(skb))
513                         skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
514
515                 if ((old_iph->frag_off & htons(IP_DF)) &&
516                     mtu < ntohs(old_iph->tot_len)) {
517                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
518                                   htonl(mtu));
519                         ip_rt_put(rt);
520                         goto tx_error;
521                 }
522         }
523
524         if (tunnel->err_count > 0) {
525                 if (time_before(jiffies,
526                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
527                         tunnel->err_count--;
528                         dst_link_failure(skb);
529                 } else
530                         tunnel->err_count = 0;
531         }
532
533         /*
534          * Okay, now see if we can stuff it in the buffer as-is.
535          */
536         max_headroom = (LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr));
537
538         if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
539             (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
540                 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
541                 if (!new_skb) {
542                         ip_rt_put(rt);
543                         dev->stats.tx_dropped++;
544                         dev_kfree_skb(skb);
545                         return NETDEV_TX_OK;
546                 }
547                 if (skb->sk)
548                         skb_set_owner_w(new_skb, skb->sk);
549                 dev_kfree_skb(skb);
550                 skb = new_skb;
551                 old_iph = ip_hdr(skb);
552         }
553
554         skb->transport_header = skb->network_header;
555         skb_push(skb, sizeof(struct iphdr));
556         skb_reset_network_header(skb);
557         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
558         IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
559                               IPSKB_REROUTED);
560         skb_dst_drop(skb);
561         skb_dst_set(skb, &rt->dst);
562
563         /*
564          *      Push down and install the IPIP header.
565          */
566
567         iph                     =       ip_hdr(skb);
568         iph->version            =       4;
569         iph->ihl                =       sizeof(struct iphdr)>>2;
570         iph->frag_off           =       df;
571         iph->protocol           =       IPPROTO_IPIP;
572         iph->tos                =       INET_ECN_encapsulate(tos, old_iph->tos);
573         iph->daddr              =       fl4.daddr;
574         iph->saddr              =       fl4.saddr;
575
576         if ((iph->ttl = tiph->ttl) == 0)
577                 iph->ttl        =       old_iph->ttl;
578
579         nf_reset(skb);
580         tstats = this_cpu_ptr(dev->tstats);
581         __IPTUNNEL_XMIT(tstats, &dev->stats);
582         return NETDEV_TX_OK;
583
584 tx_error_icmp:
585         dst_link_failure(skb);
586 tx_error:
587         dev->stats.tx_errors++;
588         dev_kfree_skb(skb);
589         return NETDEV_TX_OK;
590 }
591
592 static void ipip_tunnel_bind_dev(struct net_device *dev)
593 {
594         struct net_device *tdev = NULL;
595         struct ip_tunnel *tunnel;
596         const struct iphdr *iph;
597
598         tunnel = netdev_priv(dev);
599         iph = &tunnel->parms.iph;
600
601         if (iph->daddr) {
602                 struct rtable *rt;
603                 struct flowi4 fl4;
604
605                 rt = ip_route_output_ports(dev_net(dev), &fl4, NULL,
606                                            iph->daddr, iph->saddr,
607                                            0, 0,
608                                            IPPROTO_IPIP,
609                                            RT_TOS(iph->tos),
610                                            tunnel->parms.link);
611                 if (!IS_ERR(rt)) {
612                         tdev = rt->dst.dev;
613                         ip_rt_put(rt);
614                 }
615                 dev->flags |= IFF_POINTOPOINT;
616         }
617
618         if (!tdev && tunnel->parms.link)
619                 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
620
621         if (tdev) {
622                 dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr);
623                 dev->mtu = tdev->mtu - sizeof(struct iphdr);
624         }
625         dev->iflink = tunnel->parms.link;
626 }
627
628 static int
629 ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
630 {
631         int err = 0;
632         struct ip_tunnel_parm p;
633         struct ip_tunnel *t;
634         struct net *net = dev_net(dev);
635         struct ipip_net *ipn = net_generic(net, ipip_net_id);
636
637         switch (cmd) {
638         case SIOCGETTUNNEL:
639                 t = NULL;
640                 if (dev == ipn->fb_tunnel_dev) {
641                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
642                                 err = -EFAULT;
643                                 break;
644                         }
645                         t = ipip_tunnel_locate(net, &p, 0);
646                 }
647                 if (t == NULL)
648                         t = netdev_priv(dev);
649                 memcpy(&p, &t->parms, sizeof(p));
650                 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
651                         err = -EFAULT;
652                 break;
653
654         case SIOCADDTUNNEL:
655         case SIOCCHGTUNNEL:
656                 err = -EPERM;
657                 if (!capable(CAP_NET_ADMIN))
658                         goto done;
659
660                 err = -EFAULT;
661                 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
662                         goto done;
663
664                 err = -EINVAL;
665                 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
666                     p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)))
667                         goto done;
668                 if (p.iph.ttl)
669                         p.iph.frag_off |= htons(IP_DF);
670
671                 t = ipip_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
672
673                 if (dev != ipn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
674                         if (t != NULL) {
675                                 if (t->dev != dev) {
676                                         err = -EEXIST;
677                                         break;
678                                 }
679                         } else {
680                                 if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) ||
681                                     (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) {
682                                         err = -EINVAL;
683                                         break;
684                                 }
685                                 t = netdev_priv(dev);
686                                 ipip_tunnel_unlink(ipn, t);
687                                 synchronize_net();
688                                 t->parms.iph.saddr = p.iph.saddr;
689                                 t->parms.iph.daddr = p.iph.daddr;
690                                 memcpy(dev->dev_addr, &p.iph.saddr, 4);
691                                 memcpy(dev->broadcast, &p.iph.daddr, 4);
692                                 ipip_tunnel_link(ipn, t);
693                                 netdev_state_change(dev);
694                         }
695                 }
696
697                 if (t) {
698                         err = 0;
699                         if (cmd == SIOCCHGTUNNEL) {
700                                 t->parms.iph.ttl = p.iph.ttl;
701                                 t->parms.iph.tos = p.iph.tos;
702                                 t->parms.iph.frag_off = p.iph.frag_off;
703                                 if (t->parms.link != p.link) {
704                                         t->parms.link = p.link;
705                                         ipip_tunnel_bind_dev(dev);
706                                         netdev_state_change(dev);
707                                 }
708                         }
709                         if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
710                                 err = -EFAULT;
711                 } else
712                         err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
713                 break;
714
715         case SIOCDELTUNNEL:
716                 err = -EPERM;
717                 if (!capable(CAP_NET_ADMIN))
718                         goto done;
719
720                 if (dev == ipn->fb_tunnel_dev) {
721                         err = -EFAULT;
722                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
723                                 goto done;
724                         err = -ENOENT;
725                         if ((t = ipip_tunnel_locate(net, &p, 0)) == NULL)
726                                 goto done;
727                         err = -EPERM;
728                         if (t->dev == ipn->fb_tunnel_dev)
729                                 goto done;
730                         dev = t->dev;
731                 }
732                 unregister_netdevice(dev);
733                 err = 0;
734                 break;
735
736         default:
737                 err = -EINVAL;
738         }
739
740 done:
741         return err;
742 }
743
744 static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
745 {
746         if (new_mtu < 68 || new_mtu > 0xFFF8 - sizeof(struct iphdr))
747                 return -EINVAL;
748         dev->mtu = new_mtu;
749         return 0;
750 }
751
752 static const struct net_device_ops ipip_netdev_ops = {
753         .ndo_uninit     = ipip_tunnel_uninit,
754         .ndo_start_xmit = ipip_tunnel_xmit,
755         .ndo_do_ioctl   = ipip_tunnel_ioctl,
756         .ndo_change_mtu = ipip_tunnel_change_mtu,
757         .ndo_get_stats64 = ipip_get_stats64,
758 };
759
760 static void ipip_dev_free(struct net_device *dev)
761 {
762         free_percpu(dev->tstats);
763         free_netdev(dev);
764 }
765
766 static void ipip_tunnel_setup(struct net_device *dev)
767 {
768         dev->netdev_ops         = &ipip_netdev_ops;
769         dev->destructor         = ipip_dev_free;
770
771         dev->type               = ARPHRD_TUNNEL;
772         dev->hard_header_len    = LL_MAX_HEADER + sizeof(struct iphdr);
773         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr);
774         dev->flags              = IFF_NOARP;
775         dev->iflink             = 0;
776         dev->addr_len           = 4;
777         dev->features           |= NETIF_F_NETNS_LOCAL;
778         dev->features           |= NETIF_F_LLTX;
779         dev->priv_flags         &= ~IFF_XMIT_DST_RELEASE;
780 }
781
782 static int ipip_tunnel_init(struct net_device *dev)
783 {
784         struct ip_tunnel *tunnel = netdev_priv(dev);
785
786         tunnel->dev = dev;
787
788         memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
789         memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
790
791         ipip_tunnel_bind_dev(dev);
792
793         dev->tstats = alloc_percpu(struct pcpu_tstats);
794         if (!dev->tstats)
795                 return -ENOMEM;
796
797         return 0;
798 }
799
800 static int __net_init ipip_fb_tunnel_init(struct net_device *dev)
801 {
802         struct ip_tunnel *tunnel = netdev_priv(dev);
803         struct iphdr *iph = &tunnel->parms.iph;
804         struct ipip_net *ipn = net_generic(dev_net(dev), ipip_net_id);
805
806         tunnel->dev = dev;
807         strcpy(tunnel->parms.name, dev->name);
808
809         iph->version            = 4;
810         iph->protocol           = IPPROTO_IPIP;
811         iph->ihl                = 5;
812
813         dev->tstats = alloc_percpu(struct pcpu_tstats);
814         if (!dev->tstats)
815                 return -ENOMEM;
816
817         dev_hold(dev);
818         rcu_assign_pointer(ipn->tunnels_wc[0], tunnel);
819         return 0;
820 }
821
822 static struct xfrm_tunnel ipip_handler __read_mostly = {
823         .handler        =       ipip_rcv,
824         .err_handler    =       ipip_err,
825         .priority       =       1,
826 };
827
828 static const char banner[] __initconst =
829         KERN_INFO "IPv4 over IPv4 tunneling driver\n";
830
831 static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head)
832 {
833         int prio;
834
835         for (prio = 1; prio < 4; prio++) {
836                 int h;
837                 for (h = 0; h < HASH_SIZE; h++) {
838                         struct ip_tunnel *t;
839
840                         t = rtnl_dereference(ipn->tunnels[prio][h]);
841                         while (t != NULL) {
842                                 unregister_netdevice_queue(t->dev, head);
843                                 t = rtnl_dereference(t->next);
844                         }
845                 }
846         }
847 }
848
849 static int __net_init ipip_init_net(struct net *net)
850 {
851         struct ipip_net *ipn = net_generic(net, ipip_net_id);
852         struct ip_tunnel *t;
853         int err;
854
855         ipn->tunnels[0] = ipn->tunnels_wc;
856         ipn->tunnels[1] = ipn->tunnels_l;
857         ipn->tunnels[2] = ipn->tunnels_r;
858         ipn->tunnels[3] = ipn->tunnels_r_l;
859
860         ipn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel),
861                                            "tunl0",
862                                            ipip_tunnel_setup);
863         if (!ipn->fb_tunnel_dev) {
864                 err = -ENOMEM;
865                 goto err_alloc_dev;
866         }
867         dev_net_set(ipn->fb_tunnel_dev, net);
868
869         err = ipip_fb_tunnel_init(ipn->fb_tunnel_dev);
870         if (err)
871                 goto err_reg_dev;
872
873         if ((err = register_netdev(ipn->fb_tunnel_dev)))
874                 goto err_reg_dev;
875
876         t = netdev_priv(ipn->fb_tunnel_dev);
877
878         strcpy(t->parms.name, ipn->fb_tunnel_dev->name);
879         return 0;
880
881 err_reg_dev:
882         ipip_dev_free(ipn->fb_tunnel_dev);
883 err_alloc_dev:
884         /* nothing */
885         return err;
886 }
887
888 static void __net_exit ipip_exit_net(struct net *net)
889 {
890         struct ipip_net *ipn = net_generic(net, ipip_net_id);
891         LIST_HEAD(list);
892
893         rtnl_lock();
894         ipip_destroy_tunnels(ipn, &list);
895         unregister_netdevice_queue(ipn->fb_tunnel_dev, &list);
896         unregister_netdevice_many(&list);
897         rtnl_unlock();
898 }
899
900 static struct pernet_operations ipip_net_ops = {
901         .init = ipip_init_net,
902         .exit = ipip_exit_net,
903         .id   = &ipip_net_id,
904         .size = sizeof(struct ipip_net),
905 };
906
907 static int __init ipip_init(void)
908 {
909         int err;
910
911         printk(banner);
912
913         err = register_pernet_device(&ipip_net_ops);
914         if (err < 0)
915                 return err;
916         err = xfrm4_tunnel_register(&ipip_handler, AF_INET);
917         if (err < 0) {
918                 unregister_pernet_device(&ipip_net_ops);
919                 pr_info("%s: can't register tunnel\n", __func__);
920         }
921         return err;
922 }
923
924 static void __exit ipip_fini(void)
925 {
926         if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET))
927                 pr_info("%s: can't deregister tunnel\n", __func__);
928
929         unregister_pernet_device(&ipip_net_ops);
930 }
931
932 module_init(ipip_init);
933 module_exit(ipip_fini);
934 MODULE_LICENSE("GPL");
935 MODULE_ALIAS_NETDEV("tunl0");