/*
 * Copyright (c) 2013 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
#include <net/dst_metadata.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

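/* Hash a (key, remote address) pair into one of the IP_TNL_HASH_SIZE
 * buckets of the per-netns tunnel table.  Keyless tunnels hash with
 * key == 0, wildcard-destination tunnels with remote == 0.
 */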
static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
        return hash_32((__force u32)key ^ (__force u32)remote,
                       IP_TNL_HASH_BITS);
}

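/* A tunnel configured with TUNNEL_KEY matches only packets that carry
 * the same key; a keyless tunnel matches only keyless packets.
 */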
static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
                                __be16 flags, __be32 key)
{
        if (p->i_flags & TUNNEL_KEY) {
                if (flags & TUNNEL_KEY)
                        return key == p->i_key;
                else
                        /* key expected, none present */
                        return false;
        } else
                return !(flags & TUNNEL_KEY);
}

/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against a configured keyless
   tunnel, will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for the incoming
   packet.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
                                   int link, __be16 flags,
                                   __be32 remote, __be32 local,
                                   __be32 key)
{
        unsigned int hash;
        struct ip_tunnel *t, *cand = NULL;
        struct hlist_head *head;

        hash = ip_tunnel_hash(key, remote);
        head = &itn->tunnels[hash];

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (local != t->parms.iph.saddr ||
                    remote != t->parms.iph.daddr ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else
                        cand = t;
        }

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (remote != t->parms.iph.daddr ||
                    t->parms.iph.saddr != 0 ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

        hash = ip_tunnel_hash(key, 0);
        head = &itn->tunnels[hash];

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
                    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
                        continue;

                if (!(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

        if (flags & TUNNEL_NO_KEY)
                goto skip_key_lookup;

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (t->parms.i_key != key ||
                    t->parms.iph.saddr != 0 ||
                    t->parms.iph.daddr != 0 ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

skip_key_lookup:
        if (cand)
                return cand;

        t = rcu_dereference(itn->collect_md_tun);
        if (t)
                return t;

        if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
                return netdev_priv(itn->fb_tunnel_dev);

        return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);

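/* Pick the hash bucket a tunnel with the given parameters lives in.
 * Multicast and wildcard destinations hash as remote == 0, and VTI
 * tunnels without TUNNEL_KEY ignore i_key, mirroring the lookup rules
 * in ip_tunnel_lookup().
 */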
static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
                                    struct ip_tunnel_parm *parms)
{
        unsigned int h;
        __be32 remote;
        __be32 i_key = parms->i_key;

        if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
                remote = parms->iph.daddr;
        else
                remote = 0;

        if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
                i_key = 0;

        h = ip_tunnel_hash(i_key, remote);
        return &itn->tunnels[h];
}

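/* Insert a tunnel into the per-netns hash table.  A collect_md tunnel
 * is also recorded in itn->collect_md_tun so it can be found without a
 * hash lookup; only one such tunnel may exist per netns.
 */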
static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
        struct hlist_head *head = ip_bucket(itn, &t->parms);

        if (t->collect_md)
                rcu_assign_pointer(itn->collect_md_tun, t);
        hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
        if (t->collect_md)
                rcu_assign_pointer(itn->collect_md_tun, NULL);
        hlist_del_init_rcu(&t->hash_node);
}

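/* Unlike ip_tunnel_lookup(), which classifies incoming packets,
 * ip_tunnel_find() looks for an exact configuration match and is used
 * on the control path (ioctl/netlink) to detect duplicates.
 */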
static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
                                        struct ip_tunnel_parm *parms,
                                        int type)
{
        __be32 remote = parms->iph.daddr;
        __be32 local = parms->iph.saddr;
        __be32 key = parms->i_key;
        __be16 flags = parms->i_flags;
        int link = parms->link;
        struct ip_tunnel *t = NULL;
        struct hlist_head *head = ip_bucket(itn, parms);

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (local == t->parms.iph.saddr &&
                    remote == t->parms.iph.daddr &&
                    link == t->parms.link &&
                    type == t->dev->type &&
                    ip_tunnel_key_match(&t->parms, flags, key))
                        break;
        }
        return t;
}

static struct net_device *__ip_tunnel_create(struct net *net,
                                             const struct rtnl_link_ops *ops,
                                             struct ip_tunnel_parm *parms)
{
        int err;
        struct ip_tunnel *tunnel;
        struct net_device *dev;
        char name[IFNAMSIZ];

        if (parms->name[0])
                strlcpy(name, parms->name, IFNAMSIZ);
        else {
                if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
                        err = -E2BIG;
                        goto failed;
                }
                strlcpy(name, ops->kind, IFNAMSIZ);
                strncat(name, "%d", 2);
        }

        ASSERT_RTNL();
        dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
        if (!dev) {
                err = -ENOMEM;
                goto failed;
        }
        dev_net_set(dev, net);

        dev->rtnl_link_ops = ops;

        tunnel = netdev_priv(dev);
        tunnel->parms = *parms;
        tunnel->net = net;

        err = register_netdevice(dev);
        if (err)
                goto failed_free;

        return dev;

failed_free:
        free_netdev(dev);
failed:
        return ERR_PTR(err);
}

static inline void init_tunnel_flow(struct flowi4 *fl4,
                                    int proto,
                                    __be32 daddr, __be32 saddr,
                                    __be32 key, __u8 tos, int oif,
                                    __u32 mark)
{
        memset(fl4, 0, sizeof(*fl4));
        fl4->flowi4_oif = oif;
        fl4->daddr = daddr;
        fl4->saddr = saddr;
        fl4->flowi4_tos = tos;
        fl4->flowi4_proto = proto;
        fl4->fl4_gre_key = key;
        fl4->flowi4_mark = mark;
}

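/* Route to the tunnel endpoint to guess the underlying device, then
 * derive needed_headroom and a tentative MTU from it.  Returns the MTU
 * the tunnel device should use, clamped to at least 68, the IPv4
 * minimum.
 */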
static int ip_tunnel_bind_dev(struct net_device *dev)
{
        struct net_device *tdev = NULL;
        struct ip_tunnel *tunnel = netdev_priv(dev);
        const struct iphdr *iph;
        int hlen = LL_MAX_HEADER;
        int mtu = ETH_DATA_LEN;
        int t_hlen = tunnel->hlen + sizeof(struct iphdr);

        iph = &tunnel->parms.iph;

        /* Guess output device to choose reasonable mtu and needed_headroom */
        if (iph->daddr) {
                struct flowi4 fl4;
                struct rtable *rt;

                init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
                                 iph->saddr, tunnel->parms.o_key,
                                 RT_TOS(iph->tos), tunnel->parms.link,
                                 tunnel->fwmark);
                rt = ip_route_output_key(tunnel->net, &fl4);

                if (!IS_ERR(rt)) {
                        tdev = rt->dst.dev;
                        ip_rt_put(rt);
                }
                if (dev->type != ARPHRD_ETHER)
                        dev->flags |= IFF_POINTOPOINT;

                dst_cache_reset(&tunnel->dst_cache);
        }

        if (!tdev && tunnel->parms.link)
                tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

        if (tdev) {
                hlen = tdev->hard_header_len + tdev->needed_headroom;
                mtu = tdev->mtu;
        }

        dev->needed_headroom = t_hlen + hlen;
        mtu -= (dev->hard_header_len + t_hlen);

        if (mtu < 68)
                mtu = 68;

        return mtu;
}

static struct ip_tunnel *ip_tunnel_create(struct net *net,
                                          struct ip_tunnel_net *itn,
                                          struct ip_tunnel_parm *parms)
{
        struct ip_tunnel *nt;
        struct net_device *dev;
        int t_hlen;

        BUG_ON(!itn->fb_tunnel_dev);
        dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
        if (IS_ERR(dev))
                return ERR_CAST(dev);

        dev->mtu = ip_tunnel_bind_dev(dev);

        nt = netdev_priv(dev);
        t_hlen = nt->hlen + sizeof(struct iphdr);
        dev->min_mtu = ETH_MIN_MTU;
        dev->max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
        ip_tunnel_add(itn, nt);
        return nt;
}

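/* Receive path shared by IP tunnel drivers.  Validates the checksum
 * and sequence-number flags against the tunnel configuration, undoes
 * ECN encapsulation, updates per-CPU stats and hands the packet to the
 * tunnel's GRO cell.  Consumes the skb in all cases.
 */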
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
                  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
                  bool log_ecn_error)
{
        struct pcpu_sw_netstats *tstats;
        const struct iphdr *iph = ip_hdr(skb);
        int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
        if (ipv4_is_multicast(iph->daddr)) {
                tunnel->dev->stats.multicast++;
                skb->pkt_type = PACKET_BROADCAST;
        }
#endif

        if ((!(tpi->flags & TUNNEL_CSUM) && (tunnel->parms.i_flags & TUNNEL_CSUM)) ||
            ((tpi->flags & TUNNEL_CSUM) && !(tunnel->parms.i_flags & TUNNEL_CSUM))) {
                tunnel->dev->stats.rx_crc_errors++;
                tunnel->dev->stats.rx_errors++;
                goto drop;
        }

        if (tunnel->parms.i_flags & TUNNEL_SEQ) {
                if (!(tpi->flags & TUNNEL_SEQ) ||
                    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
                        tunnel->dev->stats.rx_fifo_errors++;
                        tunnel->dev->stats.rx_errors++;
                        goto drop;
                }
                tunnel->i_seqno = ntohl(tpi->seq) + 1;
        }

        skb_reset_network_header(skb);

        err = IP_ECN_decapsulate(iph, skb);
        if (unlikely(err)) {
                if (log_ecn_error)
                        net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
                                             &iph->saddr, iph->tos);
                if (err > 1) {
                        ++tunnel->dev->stats.rx_frame_errors;
                        ++tunnel->dev->stats.rx_errors;
                        goto drop;
                }
        }

        tstats = this_cpu_ptr(tunnel->dev->tstats);
        u64_stats_update_begin(&tstats->syncp);
        tstats->rx_packets++;
        tstats->rx_bytes += skb->len;
        u64_stats_update_end(&tstats->syncp);

        skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

        if (tunnel->dev->type == ARPHRD_ETHER) {
                skb->protocol = eth_type_trans(skb, tunnel->dev);
                skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
        } else {
                skb->dev = tunnel->dev;
        }

        if (tun_dst)
                skb_dst_set(skb, (struct dst_entry *)tun_dst);

        gro_cells_receive(&tunnel->gro_cells, skb);
        return 0;

drop:
        kfree_skb(skb);
        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);

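/* Encapsulation handlers (e.g. FOU/GUE) plug into a fixed-size table
 * indexed by encap type.  The cmpxchg() keeps registration lock-free:
 * a slot is claimed only if it is still NULL.  A caller typically
 * registers its ops at module init, roughly:
 *
 *	static const struct ip_tunnel_encap_ops my_encap_ops = {
 *		.encap_hlen	= my_encap_hlen,
 *		.build_header	= my_build_header,
 *	};
 *	ip_tunnel_encap_add_ops(&my_encap_ops, TUNNEL_ENCAP_MY_TYPE);
 *
 * (illustrative sketch only; the my_* names and TUNNEL_ENCAP_MY_TYPE
 * are placeholders, not real symbols)
 */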
int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
                            unsigned int num)
{
        if (num >= MAX_IPTUN_ENCAP_OPS)
                return -ERANGE;

        return !cmpxchg((const struct ip_tunnel_encap_ops **)
                        &iptun_encaps[num],
                        NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
                            unsigned int num)
{
        int ret;

        if (num >= MAX_IPTUN_ENCAP_OPS)
                return -ERANGE;

        ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
                       &iptun_encaps[num],
                       ops, NULL) == ops) ? 0 : -1;

        synchronize_net();

        return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);

int ip_tunnel_encap_setup(struct ip_tunnel *t,
                          struct ip_tunnel_encap *ipencap)
{
        int hlen;

        memset(&t->encap, 0, sizeof(t->encap));

        hlen = ip_encap_hlen(ipencap);
        if (hlen < 0)
                return hlen;

        t->encap.type = ipencap->type;
        t->encap.sport = ipencap->sport;
        t->encap.dport = ipencap->dport;
        t->encap.flags = ipencap->flags;

        t->encap_hlen = hlen;
        t->hlen = t->encap_hlen + t->tun_hlen;

        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);

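/* Check the inner packet against the path MTU of the tunnel route and
 * propagate the result to the inner route.  Returns -E2BIG (after
 * sending the appropriate "fragmentation needed" ICMP or ICMPv6 error)
 * when a non-GSO packet does not fit, 0 otherwise.
 */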
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
                           struct rtable *rt, __be16 df,
                           const struct iphdr *inner_iph)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
        int mtu;

        if (df)
                mtu = dst_mtu(&rt->dst) - dev->hard_header_len
                                        - sizeof(struct iphdr) - tunnel->hlen;
        else
                mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

        if (skb_dst(skb))
                skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

        if (skb->protocol == htons(ETH_P_IP)) {
                if (!skb_is_gso(skb) &&
                    (inner_iph->frag_off & htons(IP_DF)) &&
                    mtu < pkt_size) {
                        memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
                        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
                        return -E2BIG;
                }
        }
#if IS_ENABLED(CONFIG_IPV6)
        else if (skb->protocol == htons(ETH_P_IPV6)) {
                struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

                if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
                    mtu >= IPV6_MIN_MTU) {
                        if ((tunnel->parms.iph.daddr &&
                             !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
                            rt6->rt6i_dst.plen == 128) {
                                rt6->rt6i_flags |= RTF_MODIFIED;
                                dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
                        }
                }

                if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
                    mtu < pkt_size) {
                        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                        return -E2BIG;
                }
        }
#endif
        return 0;
}

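/* Transmit path for metadata-based (collect_md) tunnels: destination,
 * key, TOS and TTL come from the per-packet tunnel metadata rather
 * than from the device configuration.
 */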
void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        u32 headroom = sizeof(struct iphdr);
        struct ip_tunnel_info *tun_info;
        const struct ip_tunnel_key *key;
        const struct iphdr *inner_iph;
        struct rtable *rt;
        struct flowi4 fl4;
        __be16 df = 0;
        u8 tos, ttl;

        tun_info = skb_tunnel_info(skb);
        if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
                     ip_tunnel_info_af(tun_info) != AF_INET))
                goto tx_error;
        key = &tun_info->key;
        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
        inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
        tos = key->tos;
        if (tos == 1) {
                if (skb->protocol == htons(ETH_P_IP))
                        tos = inner_iph->tos;
                else if (skb->protocol == htons(ETH_P_IPV6))
                        tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
        }
        init_tunnel_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0,
                         RT_TOS(tos), tunnel->parms.link, tunnel->fwmark);
        if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
                goto tx_error;
        rt = ip_route_output_key(tunnel->net, &fl4);
        if (IS_ERR(rt)) {
                dev->stats.tx_carrier_errors++;
                goto tx_error;
        }
        if (rt->dst.dev == dev) {
                ip_rt_put(rt);
                dev->stats.collisions++;
                goto tx_error;
        }
        tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
        ttl = key->ttl;
        if (ttl == 0) {
                if (skb->protocol == htons(ETH_P_IP))
                        ttl = inner_iph->ttl;
                else if (skb->protocol == htons(ETH_P_IPV6))
                        ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
                else
                        ttl = ip4_dst_hoplimit(&rt->dst);
        }
        if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
                df = htons(IP_DF);
        else if (skb->protocol == htons(ETH_P_IP))
                df = inner_iph->frag_off & htons(IP_DF);
        headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
        if (headroom > dev->needed_headroom)
                dev->needed_headroom = headroom;

        if (skb_cow_head(skb, dev->needed_headroom)) {
                ip_rt_put(rt);
                goto tx_dropped;
        }
        iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, key->tos,
                      key->ttl, df, !net_eq(tunnel->net, dev_net(dev)));
        return;
tx_error:
        dev->stats.tx_errors++;
        goto kfree;
tx_dropped:
        dev->stats.tx_dropped++;
kfree:
        kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);

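/* Transmit path for classical, configured tunnels.  tnl_params is the
 * template outer IP header from the tunnel configuration; an all-zero
 * daddr means an NBMA tunnel whose endpoint is recovered per packet
 * from the inner route or neighbour entry.
 */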
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
                    const struct iphdr *tnl_params, u8 protocol)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        const struct iphdr *inner_iph;
        struct flowi4 fl4;
        u8 tos, ttl;
        __be16 df;
        struct rtable *rt;              /* Route to the other host */
        unsigned int max_headroom;      /* The extra header space needed */
        __be32 dst;
        bool connected;

        inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
        connected = (tunnel->parms.iph.daddr != 0);

        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

        dst = tnl_params->daddr;
        if (dst == 0) {
                /* NBMA tunnel */

                if (!skb_dst(skb)) {
                        dev->stats.tx_fifo_errors++;
                        goto tx_error;
                }

                if (skb->protocol == htons(ETH_P_IP)) {
                        rt = skb_rtable(skb);
                        dst = rt_nexthop(rt, inner_iph->daddr);
                }
#if IS_ENABLED(CONFIG_IPV6)
                else if (skb->protocol == htons(ETH_P_IPV6)) {
                        const struct in6_addr *addr6;
                        struct neighbour *neigh;
                        bool do_tx_error_icmp;
                        int addr_type;

                        neigh = dst_neigh_lookup(skb_dst(skb),
                                                 &ipv6_hdr(skb)->daddr);
                        if (!neigh)
                                goto tx_error;

                        addr6 = (const struct in6_addr *)&neigh->primary_key;
                        addr_type = ipv6_addr_type(addr6);

                        if (addr_type == IPV6_ADDR_ANY) {
                                addr6 = &ipv6_hdr(skb)->daddr;
                                addr_type = ipv6_addr_type(addr6);
                        }

                        if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
                                do_tx_error_icmp = true;
                        else {
                                do_tx_error_icmp = false;
                                dst = addr6->s6_addr32[3];
                        }
                        neigh_release(neigh);
                        if (do_tx_error_icmp)
                                goto tx_error_icmp;
                }
#endif
                else
                        goto tx_error;

                connected = false;
        }

        tos = tnl_params->tos;
        if (tos & 0x1) {
                tos &= ~0x1;
                if (skb->protocol == htons(ETH_P_IP)) {
                        tos = inner_iph->tos;
                        connected = false;
                } else if (skb->protocol == htons(ETH_P_IPV6)) {
                        tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
                        connected = false;
                }
        }

        init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
                         tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
                         tunnel->fwmark);

        if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
                goto tx_error;

        rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
                         NULL;

        if (!rt) {
                rt = ip_route_output_key(tunnel->net, &fl4);

                if (IS_ERR(rt)) {
                        dev->stats.tx_carrier_errors++;
                        goto tx_error;
                }
                if (connected)
                        dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
                                          fl4.saddr);
        }

        if (rt->dst.dev == dev) {
                ip_rt_put(rt);
                dev->stats.collisions++;
                goto tx_error;
        }

        if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
                ip_rt_put(rt);
                goto tx_error;
        }

        if (tunnel->err_count > 0) {
                if (time_before(jiffies,
                                tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
                        tunnel->err_count--;

                        dst_link_failure(skb);
                } else
                        tunnel->err_count = 0;
        }

        tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
        ttl = tnl_params->ttl;
        if (ttl == 0) {
                if (skb->protocol == htons(ETH_P_IP))
                        ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
                else if (skb->protocol == htons(ETH_P_IPV6))
                        ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
                else
                        ttl = ip4_dst_hoplimit(&rt->dst);
        }

        df = tnl_params->frag_off;
        if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
                df |= (inner_iph->frag_off & htons(IP_DF));

        max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
                        + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
        if (max_headroom > dev->needed_headroom)
                dev->needed_headroom = max_headroom;

        if (skb_cow_head(skb, dev->needed_headroom)) {
                ip_rt_put(rt);
                dev->stats.tx_dropped++;
                kfree_skb(skb);
                return;
        }

        iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
                      df, !net_eq(tunnel->net, dev_net(dev)));
        return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
        dst_link_failure(skb);
#endif
tx_error:
        dev->stats.tx_errors++;
        kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);

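/* Apply a new parameter set to an existing tunnel.  The tunnel is
 * re-hashed because saddr/daddr/i_key participate in bucket selection,
 * and the cached output route is invalidated.
 */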
static void ip_tunnel_update(struct ip_tunnel_net *itn,
                             struct ip_tunnel *t,
                             struct net_device *dev,
                             struct ip_tunnel_parm *p,
                             bool set_mtu,
                             __u32 fwmark)
{
        ip_tunnel_del(itn, t);
        t->parms.iph.saddr = p->iph.saddr;
        t->parms.iph.daddr = p->iph.daddr;
        t->parms.i_key = p->i_key;
        t->parms.o_key = p->o_key;
        if (dev->type != ARPHRD_ETHER) {
                memcpy(dev->dev_addr, &p->iph.saddr, 4);
                memcpy(dev->broadcast, &p->iph.daddr, 4);
        }
        ip_tunnel_add(itn, t);

        t->parms.iph.ttl = p->iph.ttl;
        t->parms.iph.tos = p->iph.tos;
        t->parms.iph.frag_off = p->iph.frag_off;

        if (t->parms.link != p->link || t->fwmark != fwmark) {
                int mtu;

                t->parms.link = p->link;
                t->fwmark = fwmark;
                mtu = ip_tunnel_bind_dev(dev);
                if (set_mtu)
                        dev->mtu = mtu;
        }
        dst_cache_reset(&t->dst_cache);
        netdev_state_change(dev);
}

int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
        int err = 0;
        struct ip_tunnel *t = netdev_priv(dev);
        struct net *net = t->net;
        struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

        BUG_ON(!itn->fb_tunnel_dev);
        switch (cmd) {
        case SIOCGETTUNNEL:
                if (dev == itn->fb_tunnel_dev) {
                        t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
                        if (!t)
                                t = netdev_priv(dev);
                }
                memcpy(p, &t->parms, sizeof(*p));
                break;

        case SIOCADDTUNNEL:
        case SIOCCHGTUNNEL:
                err = -EPERM;
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        goto done;
                if (p->iph.ttl)
                        p->iph.frag_off |= htons(IP_DF);
                if (!(p->i_flags & VTI_ISVTI)) {
                        if (!(p->i_flags & TUNNEL_KEY))
                                p->i_key = 0;
                        if (!(p->o_flags & TUNNEL_KEY))
                                p->o_key = 0;
                }

                t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

                if (cmd == SIOCADDTUNNEL) {
                        if (!t) {
                                t = ip_tunnel_create(net, itn, p);
                                err = PTR_ERR_OR_ZERO(t);
                                break;
                        }

                        err = -EEXIST;
                        break;
                }
                if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
                        if (t) {
                                if (t->dev != dev) {
                                        err = -EEXIST;
                                        break;
                                }
                        } else {
                                unsigned int nflags = 0;

                                if (ipv4_is_multicast(p->iph.daddr))
                                        nflags = IFF_BROADCAST;
                                else if (p->iph.daddr)
                                        nflags = IFF_POINTOPOINT;

                                if ((dev->flags ^ nflags) &
                                    (IFF_POINTOPOINT | IFF_BROADCAST)) {
                                        err = -EINVAL;
                                        break;
                                }

                                t = netdev_priv(dev);
                        }
                }

                if (t) {
                        err = 0;
                        ip_tunnel_update(itn, t, dev, p, true, 0);
                } else {
                        err = -ENOENT;
                }
                break;

        case SIOCDELTUNNEL:
                err = -EPERM;
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        goto done;

                if (dev == itn->fb_tunnel_dev) {
                        err = -ENOENT;
                        t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
                        if (!t)
                                goto done;
                        err = -EPERM;
                        if (t == netdev_priv(itn->fb_tunnel_dev))
                                goto done;
                        dev = t->dev;
                }
                unregister_netdevice(dev);
                err = 0;
                break;

        default:
                err = -EINVAL;
        }

done:
        return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);

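/* 0xFFF8 is the largest 8-byte-aligned value that still fits in the
 * 16-bit IPv4 total-length field; what remains after the link and
 * tunnel headers bounds the tunnel MTU.
 */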
int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        int t_hlen = tunnel->hlen + sizeof(struct iphdr);
        int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;

        if (new_mtu < ETH_MIN_MTU)
                return -EINVAL;

        if (new_mtu > max_mtu) {
                if (strict)
                        return -EINVAL;

                new_mtu = max_mtu;
        }

        dev->mtu = new_mtu;
        return 0;
}
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
        return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

static void ip_tunnel_dev_free(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        gro_cells_destroy(&tunnel->gro_cells);
        dst_cache_destroy(&tunnel->dst_cache);
        free_percpu(dev->tstats);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct ip_tunnel_net *itn;

        itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

        if (itn->fb_tunnel_dev != dev) {
                ip_tunnel_del(itn, netdev_priv(dev));
                unregister_netdevice_queue(dev, head);
        }
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

int ip_tunnel_get_iflink(const struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        return tunnel->parms.link;
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);

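/* Per-netns initialisation for a tunnel type: set up the hash table
 * and, unless ops is NULL, create the fallback device (e.g. "gre0")
 * that catches otherwise unmatched traffic.
 */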
int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
                       struct rtnl_link_ops *ops, char *devname)
{
        struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
        struct ip_tunnel_parm parms;
        unsigned int i;

        for (i = 0; i < IP_TNL_HASH_SIZE; i++)
                INIT_HLIST_HEAD(&itn->tunnels[i]);

        if (!ops) {
                itn->fb_tunnel_dev = NULL;
                return 0;
        }

        memset(&parms, 0, sizeof(parms));
        if (devname)
                strlcpy(parms.name, devname, IFNAMSIZ);

        rtnl_lock();
        itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
        /* FB netdevice is special: we have one, and only one per netns.
         * Allowing it to be moved to another netns is clearly unsafe.
         */
        if (!IS_ERR(itn->fb_tunnel_dev)) {
                itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
                itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
                ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
        }
        rtnl_unlock();

        return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);

static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
                              struct rtnl_link_ops *ops)
{
        struct net *net = dev_net(itn->fb_tunnel_dev);
        struct net_device *dev, *aux;
        int h;

        for_each_netdev_safe(net, dev, aux)
                if (dev->rtnl_link_ops == ops)
                        unregister_netdevice_queue(dev, head);

        for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
                struct ip_tunnel *t;
                struct hlist_node *n;
                struct hlist_head *thead = &itn->tunnels[h];

                hlist_for_each_entry_safe(t, n, thead, hash_node)
                        /* If dev is in the same netns, it has already
                         * been added to the list by the previous loop.
                         */
                        if (!net_eq(dev_net(t->dev), net))
                                unregister_netdevice_queue(t->dev, head);
        }
}

void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
{
        LIST_HEAD(list);

        rtnl_lock();
        ip_tunnel_destroy(itn, &list, ops);
        unregister_netdevice_many(&list);
        rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);

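/* rtnl_link_ops newlink helper: reject duplicate configurations,
 * register the device and pick an initial MTU unless one was given
 * via IFLA_MTU.
 */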
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
                      struct ip_tunnel_parm *p, __u32 fwmark)
{
        struct ip_tunnel *nt;
        struct net *net = dev_net(dev);
        struct ip_tunnel_net *itn;
        int mtu;
        int err;

        nt = netdev_priv(dev);
        itn = net_generic(net, nt->ip_tnl_net_id);

        if (nt->collect_md) {
                if (rtnl_dereference(itn->collect_md_tun))
                        return -EEXIST;
        } else {
                if (ip_tunnel_find(itn, p, dev->type))
                        return -EEXIST;
        }

        nt->net = net;
        nt->parms = *p;
        nt->fwmark = fwmark;
        err = register_netdevice(dev);
        if (err)
                goto out;

        if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
                eth_hw_addr_random(dev);

        mtu = ip_tunnel_bind_dev(dev);
        if (!tb[IFLA_MTU])
                dev->mtu = mtu;

        ip_tunnel_add(itn, nt);
out:
        return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
                         struct ip_tunnel_parm *p, __u32 fwmark)
{
        struct ip_tunnel *t;
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct net *net = tunnel->net;
        struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

        if (dev == itn->fb_tunnel_dev)
                return -EINVAL;

        t = ip_tunnel_find(itn, p, dev->type);

        if (t) {
                if (t->dev != dev)
                        return -EEXIST;
        } else {
                t = tunnel;

                if (dev->type != ARPHRD_ETHER) {
                        unsigned int nflags = 0;

                        if (ipv4_is_multicast(p->iph.daddr))
                                nflags = IFF_BROADCAST;
                        else if (p->iph.daddr)
                                nflags = IFF_POINTOPOINT;

                        if ((dev->flags ^ nflags) &
                            (IFF_POINTOPOINT | IFF_BROADCAST))
                                return -EINVAL;
                }
        }

        ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

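/* Common ndo_init for IP tunnel devices: allocate per-CPU stats, the
 * dst cache and the GRO cells, and seed the outer IP header template.
 */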
int ip_tunnel_init(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct iphdr *iph = &tunnel->parms.iph;
        int err;

        dev->needs_free_netdev = true;
        dev->priv_destructor = ip_tunnel_dev_free;
        dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
        if (!dev->tstats)
                return -ENOMEM;

        err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
        if (err) {
                free_percpu(dev->tstats);
                return err;
        }

        err = gro_cells_init(&tunnel->gro_cells, dev);
        if (err) {
                dst_cache_destroy(&tunnel->dst_cache);
                free_percpu(dev->tstats);
                return err;
        }

        tunnel->dev = dev;
        tunnel->net = dev_net(dev);
        strcpy(tunnel->parms.name, dev->name);
        iph->version            = 4;
        iph->ihl                = 5;

        if (tunnel->collect_md) {
                dev->features |= NETIF_F_NETNS_LOCAL;
                netif_keep_dst(dev);
        }
        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct net *net = tunnel->net;
        struct ip_tunnel_net *itn;

        itn = net_generic(net, tunnel->ip_tnl_net_id);
        /* fb_tunnel_dev will be unregistered in the net-exit call. */
        if (itn->fb_tunnel_dev != dev)
                ip_tunnel_del(itn, netdev_priv(dev));

        dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do the least required initialization; the rest is done in the
 * ip_tunnel_init() call.
 */
void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_LICENSE("GPL");