/*
 *      IPv6 output functions
 *      Linux INET6 implementation
 *
 *      Authors:
 *      Pedro Roque             <roque@di.fc.ul.pt>
 *
 *      Based on linux/net/ipv4/ip_output.c
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *      Changes:
 *      A.N.Kuznetsov   :       arithmetic in fragmentation.
 *                              extension headers are implemented.
 *                              route changes now work.
 *                              ip6_forward does not confuse sniffers.
 *                              etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *      Imran Patel     :       frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *                      :       add ip6_append_data and related functions
 *                              for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>

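/* Final transmit step: resolve the nexthop neighbour and hand the skb to
 * the device layer, handling multicast loopback and scope checks plus
 * lightweight-tunnel output redirection along the way.
 */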
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct net_device *dev = dst->dev;
        struct neighbour *neigh;
        struct in6_addr *nexthop;
        int ret;

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
                struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

                if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
                    ((mroute6_socket(net, skb) &&
                     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
                     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
                                         &ipv6_hdr(skb)->saddr))) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

                        /* Do not check for IFF_ALLMULTI; multicast routing
                           is not supported in any case.
                         */
                        if (newskb)
                                NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                                        net, sk, newskb, NULL, newskb->dev,
                                        dev_loopback_xmit);

                        if (ipv6_hdr(skb)->hop_limit == 0) {
                                IP6_INC_STATS(net, idev,
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return 0;
                        }
                }

                IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

                if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
                    IPV6_ADDR_SCOPE_NODELOCAL &&
                    !(dev->flags & IFF_LOOPBACK)) {
                        kfree_skb(skb);
                        return 0;
                }
        }

        if (lwtunnel_xmit_redirect(dst->lwtstate)) {
                int res = lwtunnel_xmit(skb);

                if (res < 0 || res == LWTUNNEL_XMIT_DONE)
                        return res;
        }

        rcu_read_lock_bh();
        nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
        neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
        if (unlikely(!neigh))
                neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
        if (!IS_ERR(neigh)) {
                sock_confirm_neigh(skb, neigh);
                ret = neigh_output(neigh, skb);
                rcu_read_unlock_bh();
                return ret;
        }
        rcu_read_unlock_bh();

        IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EINVAL;
}

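/* Run the cgroup egress BPF program, then fragment if the packet exceeds
 * the path MTU and cannot be sent as-is (GSO packets are left intact).
 */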
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        int ret;

        ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
        if (ret) {
                kfree_skb(skb);
                return ret;
        }

        if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
            dst_allfrag(skb_dst(skb)) ||
            (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
                return ip6_fragment(net, sk, skb, ip6_finish_output2);
        else
                return ip6_finish_output2(net, sk, skb);
}

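/* Output entry point after routing: drop if IPv6 is disabled on the
 * egress device, otherwise traverse NF_INET_POST_ROUTING (skipped for
 * rerouted skbs) into ip6_finish_output().
 */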
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct net_device *dev = skb_dst(skb)->dev;
        struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

        if (unlikely(idev->cnf.disable_ipv6)) {
                IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
                return 0;
        }

        return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                            net, sk, skb, NULL, dev,
                            ip6_finish_output,
                            !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note: the socket lock is not held for SYNACK packets, but the skb might
 * be modified by calls to skb_set_owner_w() and ipv6_local_error(),
 * which use proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
             __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
        struct net *net = sock_net(sk);
        const struct ipv6_pinfo *np = inet6_sk(sk);
        struct in6_addr *first_hop = &fl6->daddr;
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr;
        u8  proto = fl6->flowi6_proto;
        int seg_len = skb->len;
        int hlimit = -1;
        u32 mtu;

        if (opt) {
                unsigned int head_room;

                /* First: exthdrs may take lots of space (~8K for now);
                   MAX_HEADER is not enough.
                 */
                head_room = opt->opt_nflen + opt->opt_flen;
                seg_len += head_room;
                head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

                if (skb_headroom(skb) < head_room) {
                        struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
                        if (!skb2) {
                                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return -ENOBUFS;
                        }
                        consume_skb(skb);
                        skb = skb2;
                        /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
                         * so it is safe to call in our context (socket lock not held)
                         */
                        skb_set_owner_w(skb, (struct sock *)sk);
                }
                if (opt->opt_flen)
                        ipv6_push_frag_opts(skb, opt, &proto);
                if (opt->opt_nflen)
                        ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
                                             &fl6->saddr);
        }

        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        hdr = ipv6_hdr(skb);

        /*
         *      Fill in the IPv6 header
         */
        if (np)
                hlimit = np->hop_limit;
        if (hlimit < 0)
                hlimit = ip6_dst_hoplimit(dst);

        ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
                                                     np->autoflowlabel, fl6));

        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
        hdr->hop_limit = hlimit;

        hdr->saddr = fl6->saddr;
        hdr->daddr = *first_hop;

        skb->protocol = htons(ETH_P_IPV6);
        skb->priority = sk->sk_priority;
        skb->mark = mark;

        mtu = dst_mtu(dst);
        if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
                IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_OUT, skb->len);

                /* if egress device is enslaved to an L3 master device pass the
                 * skb to its handler for processing
                 */
                skb = l3mdev_ip6_out((struct sock *)sk, skb);
                if (unlikely(!skb))
                        return 0;

                /* hooks should never assume socket lock is held.
                 * we promote our socket to non const
                 */
                return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
                               net, (struct sock *)sk, skb, NULL, dst->dev,
                               dst_output);
        }

        skb->dev = dst->dev;
        /* ipv6_local_error() does not require socket lock,
         * we promote our socket to non const
         */
        ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

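/* Deliver a Router Alert packet to each raw socket registered for the
 * matching RA selector, cloning the skb for all but the last listener.
 * Returns 1 if a socket took the packet, 0 otherwise.
 */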
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
        struct ip6_ra_chain *ra;
        struct sock *last = NULL;

        read_lock(&ip6_ra_lock);
        for (ra = ip6_ra_chain; ra; ra = ra->next) {
                struct sock *sk = ra->sk;
                if (sk && ra->sel == sel &&
                    (!sk->sk_bound_dev_if ||
                     sk->sk_bound_dev_if == skb->dev->ifindex)) {
                        if (last) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        rawv6_rcv(last, skb2);
                        }
                        last = sk;
                }
        }

        if (last) {
                rawv6_rcv(last, skb);
                read_unlock(&ip6_ra_lock);
                return 1;
        }
        read_unlock(&ip6_ra_lock);
        return 0;
}

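/* Classify a packet arriving for a proxied address: returns 1 when a
 * unicast NDISC message should go to the input path, -1 when a
 * link-local destination must be dropped, and 0 to allow forwarding.
 */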
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        u8 nexthdr = hdr->nexthdr;
        __be16 frag_off;
        int offset;

        if (ipv6_ext_hdr(nexthdr)) {
                offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
                if (offset < 0)
                        return 0;
        } else
                offset = sizeof(struct ipv6hdr);

        if (nexthdr == IPPROTO_ICMPV6) {
                struct icmp6hdr *icmp6;

                if (!pskb_may_pull(skb, (skb_network_header(skb) +
                                         offset + 1 - skb->data)))
                        return 0;

                icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

                switch (icmp6->icmp6_type) {
                case NDISC_ROUTER_SOLICITATION:
                case NDISC_ROUTER_ADVERTISEMENT:
                case NDISC_NEIGHBOUR_SOLICITATION:
                case NDISC_NEIGHBOUR_ADVERTISEMENT:
                case NDISC_REDIRECT:
                        /* Pass unicast neighbour discovery messages that
                         * are destined to the proxied address on to the
                         * input function.
                         */
                        return 1;
                default:
                        break;
                }
        }

        /*
         * The proxying router can't forward traffic sent to a link-local
         * address, so signal the sender and discard the packet. This
         * behavior is clarified by the MIPv6 specification.
         */
        if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
                dst_link_failure(skb);
                return -1;
        }

        return 0;
}

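/* Final step of the forwarding path: hand the packet to dst_output(). */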
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
                                     struct sk_buff *skb)
{
        return dst_output(net, sk, skb);
}

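/* MTU to use when forwarding: a locked RTAX_MTU metric when present,
 * otherwise the egress device's IPv6 MTU, defaulting to IPV6_MIN_MTU.
 */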
static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
{
        unsigned int mtu;
        struct inet6_dev *idev;

        if (dst_metric_locked(dst, RTAX_MTU)) {
                mtu = dst_metric_raw(dst, RTAX_MTU);
                if (mtu)
                        return mtu;
        }

        mtu = IPV6_MIN_MTU;
        rcu_read_lock();
        idev = __in6_dev_get(dst->dev);
        if (idev)
                mtu = idev->cnf.mtu6;
        rcu_read_unlock();

        return mtu;
}

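/* Returns true when the packet cannot be sent within @mtu: conntrack
 * defrag's frag_max_size overrides ignore_df, and GSO packets pass if
 * they can still be segmented to fit.
 */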
static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
        if (skb->len <= mtu)
                return false;

        /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
        if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
                return true;

        if (skb->ignore_df)
                return false;

        if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
                return false;

        return true;
}

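/* Forward a packet on behalf of another node: validate it (forwarding
 * enabled, hop limit, XFRM policy, source address sanity), handle the
 * Router Alert and proxy-NDP special cases, emit redirects and Packet
 * Too Big errors where required, then decrement the hop limit and pass
 * the packet through the NF_INET_FORWARD hook.
 */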
int ip6_forward(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        struct inet6_skb_parm *opt = IP6CB(skb);
        struct net *net = dev_net(dst->dev);
        u32 mtu;

        if (net->ipv6.devconf_all->forwarding == 0)
                goto error;

        if (skb->pkt_type != PACKET_HOST)
                goto drop;

        if (unlikely(skb->sk))
                goto drop;

        if (skb_warn_if_lro(skb))
                goto drop;

        if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        skb_forward_csum(skb);

        /*
         *      We do no processing on RA packets, pushing them to user
         *      level as is, without any guarantee that the application
         *      will be able to interpret them. The reason is that we
         *      cannot make anything clever here.
         *
         *      We are not the end node, so if the packet contains
         *      AH/ESP we cannot do anything with it.
         *      Defragmentation would also be a mistake; RA packets
         *      cannot be fragmented, because there is no guarantee
         *      that different fragments will go along one path. --ANK
         */
        if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
                if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
                        return 0;
        }

        /*
         *      check and decrement hop limit
         */
        if (hdr->hop_limit <= 1) {
                /* Force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_INHDRERRORS);

                kfree_skb(skb);
                return -ETIMEDOUT;
        }

        /* XXX: idev->cnf.proxy_ndp? */
        if (net->ipv6.devconf_all->proxy_ndp &&
            pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
                int proxied = ip6_forward_proxy_check(skb);
                if (proxied > 0)
                        return ip6_input(skb);
                else if (proxied < 0) {
                        __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                        IPSTATS_MIB_INDISCARDS);
                        goto drop;
                }
        }

        if (!xfrm6_route_forward(skb)) {
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_INDISCARDS);
                goto drop;
        }
        dst = skb_dst(skb);

        /* IPv6 specs say nothing about it, but it is clear that we cannot
           send redirects to source routed frames.
           We don't send redirects to frames decapsulated from IPsec.
         */
        if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
                struct in6_addr *target = NULL;
                struct inet_peer *peer;
                struct rt6_info *rt;

                /*
                 *      incoming and outgoing devices are the same
                 *      send a redirect.
                 */

                rt = (struct rt6_info *) dst;
                if (rt->rt6i_flags & RTF_GATEWAY)
                        target = &rt->rt6i_gateway;
                else
                        target = &hdr->daddr;

                peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

                /* Limit redirects both by destination (here)
                   and by source (inside ndisc_send_redirect)
                 */
                if (inet_peer_xrlim_allow(peer, 1*HZ))
                        ndisc_send_redirect(skb, target);
                if (peer)
                        inet_putpeer(peer);
        } else {
                int addrtype = ipv6_addr_type(&hdr->saddr);

                /* This check is security critical. */
                if (addrtype == IPV6_ADDR_ANY ||
                    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
                        goto error;
                if (addrtype & IPV6_ADDR_LINKLOCAL) {
                        icmpv6_send(skb, ICMPV6_DEST_UNREACH,
                                    ICMPV6_NOT_NEIGHBOUR, 0);
                        goto error;
                }
        }

        mtu = ip6_dst_mtu_forward(dst);
        if (mtu < IPV6_MIN_MTU)
                mtu = IPV6_MIN_MTU;

        if (ip6_pkt_too_big(skb, mtu)) {
                /* Again, force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_INTOOBIGERRORS);
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (skb_cow(skb, dst->dev->hard_header_len)) {
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }

        hdr = ipv6_hdr(skb);

        /* Decrementing the hop limit is delayed until after the skb COW */

        hdr->hop_limit--;

        __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
        __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
        return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
                       net, NULL, skb, skb->dev, dst->dev,
                       ip6_forward_finish);

error:
        __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
        kfree_skb(skb);
        return -EINVAL;
}

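/* Propagate per-packet metadata (type, priority, protocol, dst, device,
 * mark, tc index, netfilter and security state) from @from to a fragment.
 */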
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        skb_dst_drop(to);
        skb_dst_set(to, dst_clone(skb_dst(from)));
        to->dev = from->dev;
        to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
        nf_copy(to, from);
        skb_copy_secmark(to, from);
}

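/* Fragment an oversized packet. The fast path reuses an existing
 * frag_list whose geometry already matches; otherwise the slow path
 * copies the payload into freshly allocated skbs, one per fragment.
 * Each fragment is passed to @output as it is built.
 */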
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
                 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
        struct sk_buff *frag;
        struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
        struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
                                inet6_sk(skb->sk) : NULL;
        struct ipv6hdr *tmp_hdr;
        struct frag_hdr *fh;
        unsigned int mtu, hlen, left, len;
        int hroom, troom;
        __be32 frag_id;
        int ptr, offset = 0, err = 0;
        u8 *prevhdr, nexthdr = 0;

        err = ip6_find_1stfragopt(skb, &prevhdr);
        if (err < 0)
                goto fail;
        hlen = err;
        nexthdr = *prevhdr;

        mtu = ip6_skb_dst_mtu(skb);

        /* We must not fragment if the socket is set to force MTU discovery
         * or if the skb is not generated by a local socket.
         */
        if (unlikely(!skb->ignore_df && skb->len > mtu))
                goto fail_toobig;

        if (IP6CB(skb)->frag_max_size) {
                if (IP6CB(skb)->frag_max_size > mtu)
                        goto fail_toobig;

                /* don't send fragments larger than what we received */
                mtu = IP6CB(skb)->frag_max_size;
                if (mtu < IPV6_MIN_MTU)
                        mtu = IPV6_MIN_MTU;
        }

        if (np && np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        if (mtu < hlen + sizeof(struct frag_hdr) + 8)
                goto fail_toobig;
        mtu -= hlen + sizeof(struct frag_hdr);

        frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
                                    &ipv6_hdr(skb)->saddr);

        if (skb->ip_summed == CHECKSUM_PARTIAL &&
            (err = skb_checksum_help(skb)))
                goto fail;

        hroom = LL_RESERVED_SPACE(rt->dst.dev);
        if (skb_has_frag_list(skb)) {
                unsigned int first_len = skb_pagelen(skb);
                struct sk_buff *frag2;

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    skb_cloned(skb) ||
                    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
                        goto slow_path;

                skb_walk_frags(skb, frag) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
                                goto slow_path_clean;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path_clean;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                        }
                        skb->truesize -= frag->truesize;
                }

                err = 0;
                offset = 0;
                /* BUILD HEADER */

                *prevhdr = NEXTHDR_FRAGMENT;
                tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
                if (!tmp_hdr) {
                        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                      IPSTATS_MIB_FRAGFAILS);
                        err = -ENOMEM;
                        goto fail;
                }
                frag = skb_shinfo(skb)->frag_list;
                skb_frag_list_init(skb);

                __skb_pull(skb, hlen);
                fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
                __skb_push(skb, hlen);
                skb_reset_network_header(skb);
                memcpy(skb_network_header(skb), tmp_hdr, hlen);

                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->frag_off = htons(IP6_MF);
                fh->identification = frag_id;

                first_len = skb_pagelen(skb);
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                ipv6_hdr(skb)->payload_len = htons(first_len -
                                                   sizeof(struct ipv6hdr));

                dst_hold(&rt->dst);

                for (;;) {
                        /* Prepare the header of the next frame
                         * before the previous one goes down. */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                skb_reset_transport_header(frag);
                                fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
                                __skb_push(frag, hlen);
                                skb_reset_network_header(frag);
                                memcpy(skb_network_header(frag), tmp_hdr,
                                       hlen);
                                offset += skb->len - hlen - sizeof(struct frag_hdr);
                                fh->nexthdr = nexthdr;
                                fh->reserved = 0;
                                fh->frag_off = htons(offset);
                                if (frag->next)
                                        fh->frag_off |= htons(IP6_MF);
                                fh->identification = frag_id;
                                ipv6_hdr(frag)->payload_len =
                                                htons(frag->len -
                                                      sizeof(struct ipv6hdr));
                                ip6_copy_metadata(frag, skb);
                        }

                        err = output(net, sk, skb);
                        if (!err)
                                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                              IPSTATS_MIB_FRAGCREATES);

                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                kfree(tmp_hdr);

                if (err == 0) {
                        IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                      IPSTATS_MIB_FRAGOKS);
                        ip6_rt_put(rt);
                        return 0;
                }

                kfree_skb_list(frag);

                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                              IPSTATS_MIB_FRAGFAILS);
                ip6_rt_put(rt);
                return err;

slow_path_clean:
                skb_walk_frags(skb, frag2) {
                        if (frag2 == frag)
                                break;
                        frag2->sk = NULL;
                        frag2->destructor = NULL;
                        skb->truesize += frag2->truesize;
                }
        }

slow_path:
        left = skb->len - hlen;         /* Space per frame */
        ptr = hlen;                     /* Where to start from */

        /*
         *      Fragment the datagram.
         */

        troom = rt->dst.dev->needed_tailroom;

        /*
         *      Keep copying data until we run out.
         */
        while (left > 0)        {
                u8 *fragnexthdr_offset;

                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end,
                   then align the next start on an eight-byte boundary */
                if (len < left) {
                        len &= ~7;
                }

                /* Allocate buffer */
                frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
                                 hroom + troom, GFP_ATOMIC);
                if (!frag) {
                        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                      IPSTATS_MIB_FRAGFAILS);
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                ip6_copy_metadata(frag, skb);
                skb_reserve(frag, hroom);
                skb_put(frag, len + hlen + sizeof(struct frag_hdr));
                skb_reset_network_header(frag);
                fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
                frag->transport_header = (frag->network_header + hlen +
                                          sizeof(struct frag_hdr));

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */
                if (skb->sk)
                        skb_set_owner_w(frag, skb->sk);

                /*
                 *      Copy the packet header into the new buffer.
                 */
                skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

                fragnexthdr_offset = skb_network_header(frag);
                fragnexthdr_offset += prevhdr - skb_network_header(skb);
                *fragnexthdr_offset = NEXTHDR_FRAGMENT;

                /*
                 *      Build fragment header.
                 */
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->identification = frag_id;

                /*
                 *      Copy a block of the IP datagram.
                 */
                BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
                                     len));
                left -= len;

                fh->frag_off = htons(offset);
                if (left > 0)
                        fh->frag_off |= htons(IP6_MF);
                ipv6_hdr(frag)->payload_len = htons(frag->len -
                                                    sizeof(struct ipv6hdr));

                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */
                err = output(net, sk, frag);
                if (err)
                        goto fail;

                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_FRAGCREATES);
        }
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGOKS);
        consume_skb(skb);
        return err;

fail_toobig:
        if (skb->sk && dst_allfrag(skb_dst(skb)))
                sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

        skb->dev = skb_dst(skb)->dev;
        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
        err = -EMSGSIZE;

fail:
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return err;
}

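/* Returns non-zero when the cached route no longer covers @fl_addr:
 * neither the /128 host-route key nor the saved peer address matches.
 */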
static inline int ip6_rt_check(const struct rt6key *rt_key,
                               const struct in6_addr *fl_addr,
                               const struct in6_addr *addr_cache)
{
        return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
                (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

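/* Validate a socket's cached dst against the flow: release it and return
 * NULL when the family, destination/source route keys, or outgoing
 * interface no longer match, forcing the caller to do a fresh lookup.
 */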
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
                                          struct dst_entry *dst,
                                          const struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct rt6_info *rt;

        if (!dst)
                goto out;

        if (dst->ops->family != AF_INET6) {
                dst_release(dst);
                return NULL;
        }

        rt = (struct rt6_info *)dst;
        /* Yes, checking route validity in the not-connected
         * case is not very simple. Take into account that
         * we do not support routing by source, TOS, or
         * MSG_DONTROUTE            --ANK (980726)
         *
         * 1. ip6_rt_check(): If the route was a host route,
         *    check that the cached destination is current.
         *    If it is a network route, we still may
         *    check its validity using the saved pointer
         *    to the last used address: daddr_cache.
         *    We do not want to save the whole address now
         *    (because the main consumer of this service
         *    is TCP, which does not have this problem),
         *    so the last trick works only on connected
         *    sockets.
         * 2. oif also should be the same.
         */
        if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
            ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
           (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
              (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
                dst_release(dst);
                dst = NULL;
        }

out:
        return dst;
}

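/* Core of the output route lookup: resolve a source address when the
 * flow leaves it unspecified, perform the route lookup, and (with
 * optimistic DAD) fall back to the default router's dst while the
 * chosen source address is still optimistic and the nexthop neighbour
 * is not yet valid.
 */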
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
                               struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        struct neighbour *n;
        struct rt6_info *rt;
#endif
        int err;
        int flags = 0;

        /* The correct way to handle this would be to do
         * ip6_route_get_saddr, and then ip6_route_output; however,
         * the route-specific preferred source forces the
         * ip6_route_output call _before_ ip6_route_get_saddr.
         *
         * In source specific routing (no src=any default route),
         * ip6_route_output will fail given src=any saddr, though, so
         * that's why we try it again later.
         */
        if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
                struct rt6_info *rt;
                bool had_dst = *dst != NULL;

                if (!had_dst)
                        *dst = ip6_route_output(net, sk, fl6);
                rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
                err = ip6_route_get_saddr(net, rt, &fl6->daddr,
                                          sk ? inet6_sk(sk)->srcprefs : 0,
                                          &fl6->saddr);
                if (err)
                        goto out_err_release;

                /* If we had an erroneous initial result, pretend it
                 * never existed and let the SA-enabled version take
                 * over.
                 */
                if (!had_dst && (*dst)->error) {
                        dst_release(*dst);
                        *dst = NULL;
                }

                if (fl6->flowi6_oif)
                        flags |= RT6_LOOKUP_F_IFACE;
        }

        if (!*dst)
                *dst = ip6_route_output_flags(net, sk, fl6, flags);

        err = (*dst)->error;
        if (err)
                goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        /*
         * If the dst entry we've looked up has a neighbour entry
         * that is in the INCOMPLETE state and the source address
         * from the flow is marked as OPTIMISTIC, we release the
         * found dst entry and replace it with the dst entry of
         * the nexthop router.
         */
        rt = (struct rt6_info *) *dst;
        rcu_read_lock_bh();
        n = __ipv6_neigh_lookup_noref(rt->dst.dev,
                                      rt6_nexthop(rt, &fl6->daddr));
        err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
        rcu_read_unlock_bh();

        if (err) {
                struct inet6_ifaddr *ifp;
                struct flowi6 fl_gw6;
                int redirect;

                ifp = ipv6_get_ifaddr(net, &fl6->saddr,
                                      (*dst)->dev, 1);

                redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
                if (ifp)
                        in6_ifa_put(ifp);

                if (redirect) {
                        /*
                         * We need to get the dst entry for the
                         * default router instead
                         */
                        dst_release(*dst);
                        memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
                        memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
                        *dst = ip6_route_output(net, sk, &fl_gw6);
                        err = (*dst)->error;
                        if (err)
                                goto out_err_release;
                }
        }
#endif
        if (ipv6_addr_v4mapped(&fl6->saddr) &&
            !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
                err = -EAFNOSUPPORT;
                goto out_err_release;
        }

        return 0;

out_err_release:
        dst_release(*dst);
        *dst = NULL;

        if (err == -ENETUNREACH)
                IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
        return err;
}

/**
 *      ip6_dst_lookup - perform route lookup on flow
 *      @net: network namespace to use for the lookup
 *      @sk: socket which provides route info
 *      @dst: pointer to dst_entry * for result
 *      @fl6: flow to lookup
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
                   struct flowi6 *fl6)
{
        *dst = NULL;
        return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *      @sk: socket which provides route info
 *      @fl6: flow to lookup
 *      @final_dst: final destination address for ipsec lookup
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns a valid dst pointer on success, or a pointer encoded
 *      error code.
 */
struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
                                      const struct in6_addr *final_dst)
{
        struct dst_entry *dst = NULL;
        int err;

        err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
        if (err)
                return ERR_PTR(err);
        if (final_dst)
                fl6->daddr = *final_dst;

        return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *      @sk: socket which provides the dst cache and route info
 *      @fl6: flow to lookup
 *      @final_dst: final destination address for ipsec lookup
 *
 *      This function performs a route lookup on the given flow with the
 *      possibility of using the cached route in the socket if it is valid.
 *      It will take the socket dst lock when operating on the dst cache.
 *      As a result, this function can only be used in process context.
 *
 *      It returns a valid dst pointer on success, or a pointer encoded
 *      error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
                                         const struct in6_addr *final_dst)
{
        struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

        dst = ip6_sk_dst_check(sk, dst, fl6);
        if (!dst)
                dst = ip6_dst_lookup_flow(sk, fl6, final_dst);

        return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

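/* Append data as a single UFO (UDP fragmentation offload) skb: allocate
 * the packet head once, mark it for GSO with a gso_size rounded down to
 * a multiple of 8, and let the device split it into fragments later.
 */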
static inline int ip6_ufo_append_data(struct sock *sk,
                        struct sk_buff_head *queue,
                        int getfrag(void *from, char *to, int offset, int len,
                        int odd, struct sk_buff *skb),
                        void *from, int length, int hh_len, int fragheaderlen,
                        int exthdrlen, int transhdrlen, int mtu,
                        unsigned int flags, const struct flowi6 *fl6)

{
        struct sk_buff *skb;
        int err;

        /* The network device supports UDP large send offload, so create
         * one single skb packet containing the complete UDP datagram.
         */
        skb = skb_peek_tail(queue);
        if (!skb) {
                skb = sock_alloc_send_skb(sk,
                        hh_len + fragheaderlen + transhdrlen + 20,
                        (flags & MSG_DONTWAIT), &err);
                if (!skb)
                        return err;

                /* reserve space for Hardware header */
                skb_reserve(skb, hh_len);

                /* create space for UDP/IP header */
                skb_put(skb, fragheaderlen + transhdrlen);

                /* initialize network header pointer */
                skb_set_network_header(skb, exthdrlen);

                /* initialize protocol header pointer */
                skb->transport_header = skb->network_header + fragheaderlen;

                skb->protocol = htons(ETH_P_IPV6);
                skb->csum = 0;

                if (flags & MSG_CONFIRM)
                        skb_set_dst_pending_confirm(skb, 1);

                __skb_queue_tail(queue, skb);
        } else if (skb_is_gso(skb)) {
                goto append;
        }

        skb->ip_summed = CHECKSUM_PARTIAL;
        /* Specify the length of each IPv6 datagram fragment.
         * It has to be a multiple of 8.
         */
        skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
                                     sizeof(struct frag_hdr)) & ~7;
        skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
        skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
                                                         &fl6->daddr,
                                                         &fl6->saddr);

append:
        return skb_append_datato_frags(sk, skb, getfrag, from,
                                       (length - transhdrlen));
}

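/* Duplicate an option/routing header for the cork; hdrlen counts
 * 8-octet units beyond the first, hence the (hdrlen + 1) * 8 size.
 */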
static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
                                               gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
                                                gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

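/* Recompute mtu/maxfraglen for an appended fragment: the first fragment
 * reserves the dst header space, later ones may treat it as data space.
 * Routes flagged DST_XFRM_TUNNEL keep their values unchanged.
 */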
static void ip6_append_data_mtu(unsigned int *mtu,
                                int *maxfraglen,
                                unsigned int fragheaderlen,
                                struct sk_buff *skb,
                                struct rt6_info *rt,
                                unsigned int orig_mtu)
{
        if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
                if (!skb) {
                        /* first fragment, reserve header_len */
                        *mtu = orig_mtu - rt->dst.header_len;

                } else {
                        /*
                         * this fragment is not first, the headers
                         * space is regarded as data space.
                         */
                        *mtu = orig_mtu;
                }
                *maxfraglen = ((*mtu - fragheaderlen) & ~7)
                              + fragheaderlen - sizeof(struct frag_hdr);
        }
}

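/* Prepare the cork for a corked send: duplicate the tx options, pin the
 * route and flow, and derive the fragment size from the path MTU (or
 * the socket's frag_size, if that is smaller).
 */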
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
                          struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
                          struct rt6_info *rt, struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        unsigned int mtu;
        struct ipv6_txoptions *opt = ipc6->opt;

        /*
         * setup for corking
         */
        if (opt) {
                if (WARN_ON(v6_cork->opt))
                        return -EINVAL;

                v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
                if (unlikely(!v6_cork->opt))
                        return -ENOBUFS;

                v6_cork->opt->tot_len = opt->tot_len;
                v6_cork->opt->opt_flen = opt->opt_flen;
                v6_cork->opt->opt_nflen = opt->opt_nflen;

                v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
                                                    sk->sk_allocation);
                if (opt->dst0opt && !v6_cork->opt->dst0opt)
                        return -ENOBUFS;

                v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
                                                    sk->sk_allocation);
                if (opt->dst1opt && !v6_cork->opt->dst1opt)
                        return -ENOBUFS;

                v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
                                                   sk->sk_allocation);
                if (opt->hopopt && !v6_cork->opt->hopopt)
                        return -ENOBUFS;

                v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
                                                    sk->sk_allocation);
                if (opt->srcrt && !v6_cork->opt->srcrt)
                        return -ENOBUFS;

                /* need source address above miyazawa */
        }
        dst_hold(&rt->dst);
        cork->base.dst = &rt->dst;
        cork->fl.u.ip6 = *fl6;
        v6_cork->hop_limit = ipc6->hlimit;
        v6_cork->tclass = ipc6->tclass;
        if (rt->dst.flags & DST_XFRM_TUNNEL)
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                      rt->dst.dev->mtu : dst_mtu(&rt->dst);
        else
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
        if (np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        cork->base.fragsize = mtu;
        if (dst_allfrag(rt->dst.path))
                cork->base.flags |= IPCORK_ALLFRAG;
        cork->base.length = 0;

        return 0;
}

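/* Core append routine: grow the queued skb(s) with user data, sizing
 * each one so the resulting packets respect the corked MTU, and
 * switching to UFO when the device can offload UDP fragmentation.
 */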
static int __ip6_append_data(struct sock *sk,
                             struct flowi6 *fl6,
                             struct sk_buff_head *queue,
                             struct inet_cork *cork,
                             struct inet6_cork *v6_cork,
                             struct page_frag *pfrag,
                             int getfrag(void *from, char *to, int offset,
                                         int len, int odd, struct sk_buff *skb),
                             void *from, int length, int transhdrlen,
                             unsigned int flags, struct ipcm6_cookie *ipc6,
                             const struct sockcm_cookie *sockc)
{
        struct sk_buff *skb, *skb_prev = NULL;
        unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
        int exthdrlen = 0;
        int dst_exthdrlen = 0;
        int hh_len;
        int copy;
        int err;
        int offset = 0;
        __u8 tx_flags = 0;
        u32 tskey = 0;
        struct rt6_info *rt = (struct rt6_info *)cork->dst;
        struct ipv6_txoptions *opt = v6_cork->opt;
        int csummode = CHECKSUM_NONE;
        unsigned int maxnonfragsize, headersize;

        skb = skb_peek_tail(queue);
        if (!skb) {
                exthdrlen = opt ? opt->opt_flen : 0;
                dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
        }

        mtu = cork->fragsize;
        orig_mtu = mtu;

        hh_len = LL_RESERVED_SPACE(rt->dst.dev);

        fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
                        (opt ? opt->opt_nflen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
                     sizeof(struct frag_hdr);

        headersize = sizeof(struct ipv6hdr) +
                     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
                     (dst_allfrag(&rt->dst) ?
                      sizeof(struct frag_hdr) : 0) +
                     rt->rt6i_nfheader_len;

        if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
            (sk->sk_protocol == IPPROTO_UDP ||
             sk->sk_protocol == IPPROTO_RAW)) {
                ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
                                sizeof(struct ipv6hdr));
                goto emsgsize;
        }

        if (ip6_sk_ignore_df(sk))
                maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
        else
                maxnonfragsize = mtu;

        if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
                ipv6_local_error(sk, EMSGSIZE, fl6,
                                 mtu - headersize +
                                 sizeof(struct ipv6hdr));
                return -EMSGSIZE;
        }

        /* CHECKSUM_PARTIAL only with no extension headers and when
         * we are not going to fragment
         */
        if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
            headersize == sizeof(struct ipv6hdr) &&
            length <= mtu - headersize &&
            !(flags & MSG_MORE) &&
            rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
                csummode = CHECKSUM_PARTIAL;

        if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
                sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
                if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
                    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
                        tskey = sk->sk_tskey++;
        }

        /*
         * Let's try using as much space as possible.
         * Use MTU if total length of the message fits into the MTU.
         * Otherwise, we need to reserve fragment header and
         * fragment alignment (= 8-15 octets, in total).
         *
         * Note that we may need to "move" the data from the tail
         * of the buffer to the new fragment when we split
         * the message.
         *
         * FIXME: It may be fragmented into multiple chunks
         *        at once if non-fragmentable extension headers
         *        are too large.
         * --yoshfuji
         */
1391
1392         cork->length += length;
1393         if ((((length + fragheaderlen) > mtu) ||
1394              (skb && skb_is_gso(skb))) &&
1395             (sk->sk_protocol == IPPROTO_UDP) &&
1396             (rt->dst.dev->features & NETIF_F_UFO) && !dst_xfrm(&rt->dst) &&
1397             (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) {
1398                 err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
1399                                           hh_len, fragheaderlen, exthdrlen,
1400                                           transhdrlen, mtu, flags, fl6);
1401                 if (err)
1402                         goto error;
1403                 return 0;
1404         }
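        /*
         * In the UFO case above, segmenting the oversized UDP datagram
         * is left to the device (or the software GSO fallback), so none
         * of the manual fragment bookkeeping below runs for it.
         */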
1405
1406         if (!skb)
1407                 goto alloc_new_skb;
1408
1409         while (length > 0) {
1410                 /* Check if the remaining data fits into the current packet. */
1411                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1412                 if (copy < length)
1413                         copy = maxfraglen - skb->len;
1414
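                /*
                 * Illustration: with mtu 1500 and maxfraglen 1488, an
                 * skb already holding 1400 bytes may take 100 more
                 * while the whole message still fits in one packet,
                 * but only 88 more once fragmentation is unavoidable,
                 * so the skb ends exactly on an 8-byte fragment
                 * boundary.
                 */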
1415                 if (copy <= 0) {
1416                         char *data;
1417                         unsigned int datalen;
1418                         unsigned int fraglen;
1419                         unsigned int fraggap;
1420                         unsigned int alloclen;
1421 alloc_new_skb:
1422                         /* There's no room in the current skb */
1423                         if (skb)
1424                                 fraggap = skb->len - maxfraglen;
1425                         else
1426                                 fraggap = 0;
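                        /*
                         * fraggap is the tail of the previous skb that
                         * overhangs maxfraglen; those bytes are moved
                         * into the new fragment below so that every
                         * fragment but the last keeps a payload length
                         * that is a multiple of 8.
                         */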
1427                         /* update mtu and maxfraglen if necessary */
1428                         if (!skb || !skb_prev)
1429                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1430                                                     fragheaderlen, skb, rt,
1431                                                     orig_mtu);
1432
1433                         skb_prev = skb;
1434
1435                         /*
1436                          * If remaining data exceeds the mtu,
1437                          * we know we need more fragment(s).
1438                          */
1439                         datalen = length + fraggap;
1440
1441                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1442                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1443                         if ((flags & MSG_MORE) &&
1444                             !(rt->dst.dev->features&NETIF_F_SG))
1445                                 alloclen = mtu;
1446                         else
1447                                 alloclen = datalen + fragheaderlen;
1448
1449                         alloclen += dst_exthdrlen;
1450
1451                         if (datalen != length + fraggap) {
1452                                 /*
1453                                  * this is not the last fragment; the trailer
1454                                  * space is regarded as data space.
1455                                  */
1456                                 datalen += rt->dst.trailer_len;
1457                         }
1458
1459                         alloclen += rt->dst.trailer_len;
1460                         fraglen = datalen + fragheaderlen;
1461
1462                         /*
1463                          * We just reserve space for fragment header.
1464                          * Note: this may be overallocation if the message
1465                          * (without MSG_MORE) fits into the MTU.
1466                          */
1467                         alloclen += sizeof(struct frag_hdr);
1468
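                        /*
                         * Example allocation (no options, no trailer,
                         * mtu 1500): for a middle fragment, datalen is
                         * capped at maxfraglen - fragheaderlen = 1448,
                         * fraglen is 1488, and alloclen comes to 1496:
                         * fraglen plus the 8 bytes reserved here for
                         * the fragment header that ip6_fragment() will
                         * push later.
                         */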
1469                         copy = datalen - transhdrlen - fraggap;
1470                         if (copy < 0) {
1471                                 err = -EINVAL;
1472                                 goto error;
1473                         }
1474                         if (transhdrlen) {
1475                                 skb = sock_alloc_send_skb(sk,
1476                                                 alloclen + hh_len,
1477                                                 (flags & MSG_DONTWAIT), &err);
1478                         } else {
1479                                 skb = NULL;
1480                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1481                                     2 * sk->sk_sndbuf)
1482                                         skb = sock_wmalloc(sk,
1483                                                            alloclen + hh_len, 1,
1484                                                            sk->sk_allocation);
1485                                 if (unlikely(!skb))
1486                                         err = -ENOBUFS;
1487                         }
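                        /*
                         * Only the skb carrying the transport header
                         * may block in sock_alloc_send_skb() and is
                         * bounded by the send buffer; follow-on
                         * fragments are charged via sock_wmalloc() and
                         * may overshoot up to 2 * sk->sk_sndbuf, so a
                         * message already under construction can
                         * complete instead of being abandoned halfway.
                         */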
1488                         if (!skb)
1489                                 goto error;
1490                         /*
1491                          *      Fill in the control structures
1492                          */
1493                         skb->protocol = htons(ETH_P_IPV6);
1494                         skb->ip_summed = csummode;
1495                         skb->csum = 0;
1496                         /* reserve for fragmentation and IPsec header */
1497                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1498                                     dst_exthdrlen);
1499
1500                         /* Only the initial fragment is time stamped */
1501                         skb_shinfo(skb)->tx_flags = tx_flags;
1502                         tx_flags = 0;
1503                         skb_shinfo(skb)->tskey = tskey;
1504                         tskey = 0;
1505
1506                         /*
1507                          *      Find where to start putting bytes
1508                          */
1509                         data = skb_put(skb, fraglen);
1510                         skb_set_network_header(skb, exthdrlen);
1511                         data += fragheaderlen;
1512                         skb->transport_header = (skb->network_header +
1513                                                  fragheaderlen);
1514                         if (fraggap) {
1515                                 skb->csum = skb_copy_and_csum_bits(
1516                                         skb_prev, maxfraglen,
1517                                         data + transhdrlen, fraggap, 0);
1518                                 skb_prev->csum = csum_sub(skb_prev->csum,
1519                                                           skb->csum);
1520                                 data += fraggap;
1521                                 pskb_trim_unique(skb_prev, maxfraglen);
1522                         }
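                        /*
                         * The overhanging bytes travel with their
                         * checksum: skb_copy_and_csum_bits() copies
                         * fraggap bytes out of skb_prev and returns
                         * their sum, csum_sub() removes that sum from
                         * skb_prev->csum, and the trim drops the moved
                         * tail, leaving both software checksums
                         * consistent.
                         */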
1523                         if (copy > 0 &&
1524                             getfrag(from, data + transhdrlen, offset,
1525                                     copy, fraggap, skb) < 0) {
1526                                 err = -EFAULT;
1527                                 kfree_skb(skb);
1528                                 goto error;
1529                         }
1530
1531                         offset += copy;
1532                         length -= datalen - fraggap;
1533                         transhdrlen = 0;
1534                         exthdrlen = 0;
1535                         dst_exthdrlen = 0;
1536
1537                         if ((flags & MSG_CONFIRM) && !skb_prev)
1538                                 skb_set_dst_pending_confirm(skb, 1);
1539
1540                         /*
1541                          * Put the packet on the pending queue
1542                          */
1543                         __skb_queue_tail(queue, skb);
1544                         continue;
1545                 }
1546
1547                 if (copy > length)
1548                         copy = length;
1549
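                /*
                 * Two ways to append into the current skb: without
                 * scatter-gather the bytes are copied into the linear
                 * area; NETIF_F_SG devices instead take the data in
                 * page fragments from the socket's page frag,
                 * coalescing with the previous frag when the new bytes
                 * land right behind it.
                 */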
1550                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1551                         unsigned int off;
1552
1553                         off = skb->len;
1554                         if (getfrag(from, skb_put(skb, copy),
1555                                                 offset, copy, off, skb) < 0) {
1556                                 __skb_trim(skb, off);
1557                                 err = -EFAULT;
1558                                 goto error;
1559                         }
1560                 } else {
1561                         int i = skb_shinfo(skb)->nr_frags;
1562
1563                         err = -ENOMEM;
1564                         if (!sk_page_frag_refill(sk, pfrag))
1565                                 goto error;
1566
1567                         if (!skb_can_coalesce(skb, i, pfrag->page,
1568                                               pfrag->offset)) {
1569                                 err = -EMSGSIZE;
1570                                 if (i == MAX_SKB_FRAGS)
1571                                         goto error;
1572
1573                                 __skb_fill_page_desc(skb, i, pfrag->page,
1574                                                      pfrag->offset, 0);
1575                                 skb_shinfo(skb)->nr_frags = ++i;
1576                                 get_page(pfrag->page);
1577                         }
1578                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1579                         if (getfrag(from,
1580                                     page_address(pfrag->page) + pfrag->offset,
1581                                     offset, copy, skb->len, skb) < 0)
1582                                 goto error_efault;
1583
1584                         pfrag->offset += copy;
1585                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1586                         skb->len += copy;
1587                         skb->data_len += copy;
1588                         skb->truesize += copy;
1589                         atomic_add(copy, &sk->sk_wmem_alloc);
1590                 }
1591                 offset += copy;
1592                 length -= copy;
1593         }
1594
1595         return 0;
1596
1597 error_efault:
1598         err = -EFAULT;
1599 error:
1600         cork->length -= length;
1601         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1602         return err;
1603 }
1604
1605 int ip6_append_data(struct sock *sk,
1606                     int getfrag(void *from, char *to, int offset, int len,
1607                                 int odd, struct sk_buff *skb),
1608                     void *from, int length, int transhdrlen,
1609                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1610                     struct rt6_info *rt, unsigned int flags,
1611                     const struct sockcm_cookie *sockc)
1612 {
1613         struct inet_sock *inet = inet_sk(sk);
1614         struct ipv6_pinfo *np = inet6_sk(sk);
1615         int exthdrlen;
1616         int err;
1617
1618         if (flags&MSG_PROBE)
1619                 return 0;
1620         if (skb_queue_empty(&sk->sk_write_queue)) {
1621                 /*
1622                  * setup for corking
1623                  */
1624                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1625                                      ipc6, rt, fl6);
1626                 if (err)
1627                         return err;
1628
1629                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1630                 length += exthdrlen;
1631                 transhdrlen += exthdrlen;
1632         } else {
1633                 fl6 = &inet->cork.fl.u.ip6;
1634                 transhdrlen = 0;
1635         }
1636
1637         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1638                                  &np->cork, sk_page_frag(sk), getfrag,
1639                                  from, length, transhdrlen, flags, ipc6, sockc);
1640 }
1641 EXPORT_SYMBOL_GPL(ip6_append_data);
1642
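/*
 * Typical calling pattern, sketched from a datagram sender of this era
 * (udpv6_sendmsg-style; names such as ulen and corkreq are illustrative,
 * and real callers add their own transport-header handling on push):
 *
 *	err = ip6_append_data(sk, ip_generic_getfrag, msg, ulen,
 *			      sizeof(struct udphdr), &ipc6, &fl6, rt,
 *			      msg->msg_flags, &sockc);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!corkreq)
 *		err = ip6_push_pending_frames(sk);
 */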
1643 static void ip6_cork_release(struct inet_cork_full *cork,
1644                              struct inet6_cork *v6_cork)
1645 {
1646         if (v6_cork->opt) {
1647                 kfree(v6_cork->opt->dst0opt);
1648                 kfree(v6_cork->opt->dst1opt);
1649                 kfree(v6_cork->opt->hopopt);
1650                 kfree(v6_cork->opt->srcrt);
1651                 kfree(v6_cork->opt);
1652                 v6_cork->opt = NULL;
1653         }
1654
1655         if (cork->base.dst) {
1656                 dst_release(cork->base.dst);
1657                 cork->base.dst = NULL;
1658                 cork->base.flags &= ~IPCORK_ALLFRAG;
1659         }
1660         memset(&cork->fl, 0, sizeof(cork->fl));
1661 }
1662
1663 struct sk_buff *__ip6_make_skb(struct sock *sk,
1664                                struct sk_buff_head *queue,
1665                                struct inet_cork_full *cork,
1666                                struct inet6_cork *v6_cork)
1667 {
1668         struct sk_buff *skb, *tmp_skb;
1669         struct sk_buff **tail_skb;
1670         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1671         struct ipv6_pinfo *np = inet6_sk(sk);
1672         struct net *net = sock_net(sk);
1673         struct ipv6hdr *hdr;
1674         struct ipv6_txoptions *opt = v6_cork->opt;
1675         struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1676         struct flowi6 *fl6 = &cork->fl.u.ip6;
1677         unsigned char proto = fl6->flowi6_proto;
1678
1679         skb = __skb_dequeue(queue);
1680         if (!skb)
1681                 goto out;
1682         tail_skb = &(skb_shinfo(skb)->frag_list);
1683
1684         /* move skb->data back from the extension headers to the IP header */
1685         if (skb->data < skb_network_header(skb))
1686                 __skb_pull(skb, skb_network_offset(skb));
1687         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1688                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1689                 *tail_skb = tmp_skb;
1690                 tail_skb = &(tmp_skb->next);
1691                 skb->len += tmp_skb->len;
1692                 skb->data_len += tmp_skb->len;
1693                 skb->truesize += tmp_skb->truesize;
1694                 tmp_skb->destructor = NULL;
1695                 tmp_skb->sk = NULL;
1696         }
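        /*
         * The queued skbs are chained, not copied: each one after the
         * head is pulled past its per-fragment headers and linked onto
         * the head's frag_list, while len/data_len/truesize of the head
         * grow to cover the whole message.  ip6_fragment() later walks
         * this chain to emit the individual fragments.
         */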
1697
1698         /* Allow local fragmentation. */
1699         skb->ignore_df = ip6_sk_ignore_df(sk);
1700
1701         *final_dst = fl6->daddr;
1702         __skb_pull(skb, skb_network_header_len(skb));
1703         if (opt && opt->opt_flen)
1704                 ipv6_push_frag_opts(skb, opt, &proto);
1705         if (opt && opt->opt_nflen)
1706                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1707
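        /*
         * Note: with a routing header among the non-fragmentable
         * options, ipv6_push_nfrag_opts() rewrites final_dst to the
         * first intermediate hop, so the daddr written below may
         * legitimately differ from the flow's final destination.
         */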
1708         skb_push(skb, sizeof(struct ipv6hdr));
1709         skb_reset_network_header(skb);
1710         hdr = ipv6_hdr(skb);
1711
1712         ip6_flow_hdr(hdr, v6_cork->tclass,
1713                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1714                                         np->autoflowlabel, fl6));
1715         hdr->hop_limit = v6_cork->hop_limit;
1716         hdr->nexthdr = proto;
1717         hdr->saddr = fl6->saddr;
1718         hdr->daddr = *final_dst;
1719
1720         skb->priority = sk->sk_priority;
1721         skb->mark = sk->sk_mark;
1722
1723         skb_dst_set(skb, dst_clone(&rt->dst));
1724         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1725         if (proto == IPPROTO_ICMPV6) {
1726                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1727
1728                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1729                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1730         }
1731
1732         ip6_cork_release(cork, v6_cork);
1733 out:
1734         return skb;
1735 }
1736
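/*
 * ip6_local_out() may return positive NET_XMIT_* codes; net_xmit_errno()
 * maps NET_XMIT_CN (congestion hint, packet still accepted) to 0 and any
 * other positive code to -ENOBUFS, so only genuine drops are counted as
 * OUTDISCARDS below.
 */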
1737 int ip6_send_skb(struct sk_buff *skb)
1738 {
1739         struct net *net = sock_net(skb->sk);
1740         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1741         int err;
1742
1743         err = ip6_local_out(net, skb->sk, skb);
1744         if (err) {
1745                 if (err > 0)
1746                         err = net_xmit_errno(err);
1747                 if (err)
1748                         IP6_INC_STATS(net, rt->rt6i_idev,
1749                                       IPSTATS_MIB_OUTDISCARDS);
1750         }
1751
1752         return err;
1753 }
1754
1755 int ip6_push_pending_frames(struct sock *sk)
1756 {
1757         struct sk_buff *skb;
1758
1759         skb = ip6_finish_skb(sk);
1760         if (!skb)
1761                 return 0;
1762
1763         return ip6_send_skb(skb);
1764 }
1765 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1766
1767 static void __ip6_flush_pending_frames(struct sock *sk,
1768                                        struct sk_buff_head *queue,
1769                                        struct inet_cork_full *cork,
1770                                        struct inet6_cork *v6_cork)
1771 {
1772         struct sk_buff *skb;
1773
1774         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1775                 if (skb_dst(skb))
1776                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1777                                       IPSTATS_MIB_OUTDISCARDS);
1778                 kfree_skb(skb);
1779         }
1780
1781         ip6_cork_release(cork, v6_cork);
1782 }
1783
1784 void ip6_flush_pending_frames(struct sock *sk)
1785 {
1786         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1787                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1788 }
1789 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1790
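/*
 * One-shot variant of the append/push sequence: the message is built on
 * a private queue with private cork state, so sk->sk_write_queue is
 * never touched and no cork persists across calls; UDPv6 uses this for
 * un-corked sends.
 */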
1791 struct sk_buff *ip6_make_skb(struct sock *sk,
1792                              int getfrag(void *from, char *to, int offset,
1793                                          int len, int odd, struct sk_buff *skb),
1794                              void *from, int length, int transhdrlen,
1795                              struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1796                              struct rt6_info *rt, unsigned int flags,
1797                              const struct sockcm_cookie *sockc)
1798 {
1799         struct inet_cork_full cork;
1800         struct inet6_cork v6_cork;
1801         struct sk_buff_head queue;
1802         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1803         int err;
1804
1805         if (flags & MSG_PROBE)
1806                 return NULL;
1807
1808         __skb_queue_head_init(&queue);
1809
1810         cork.base.flags = 0;
1811         cork.base.addr = 0;
1812         cork.base.opt = NULL;
1813         v6_cork.opt = NULL;
1814         err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
1815         if (err)
1816                 return ERR_PTR(err);
1817
1818         if (ipc6->dontfrag < 0)
1819                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1820
1821         err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1822                                 &current->task_frag, getfrag, from,
1823                                 length + exthdrlen, transhdrlen + exthdrlen,
1824                                 flags, ipc6, sockc);
1825         if (err) {
1826                 __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1827                 return ERR_PTR(err);
1828         }
1829
1830         return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1831 }