2 * IPv6 output functions
3 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * Based on linux/net/ipv4/ip_output.c
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
16 * A.N.Kuznetsov : arithmetics in fragmentation.
17 * extension headers are implemented.
18 * route changes now work.
19 * ip6_forward does not confuse sniffers.
22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
/*
 * Forward declaration: ip6_fragment() is called from ip6_finish_output()
 * before its definition further down in this file.
 */
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
/*
 * Finalize the IPv6 payload length and pass the packet to the
 * NF_INET_LOCAL_OUT netfilter hook with dst_output as the continuation.
 *
 * The length is skb->len minus the fixed IPv6 header and is stored in
 * network byte order. Lengths above IPV6_MAXPLEN are special-cased before
 * the store.
 * NOTE(review): presumably such oversized lengths are stored as 0 - confirm.
 */
61 int __ip6_local_out(struct sk_buff *skb)
65 len = skb->len - sizeof(struct ipv6hdr);
66 if (len > IPV6_MAXPLEN)
68 ipv6_hdr(skb)->payload_len = htons(len);
70 return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71 skb_dst(skb)->dev, dst_output);
/*
 * Send a locally generated packet: run __ip6_local_out() and then
 * dst_output() on the skb, returning the resulting error code.
 * NOTE(review): presumably dst_output() is only reached when the hook
 * result says the packet may continue - confirm against the full body.
 * Exported for GPL modules.
 */
74 int ip6_local_out(struct sk_buff *skb)
78 err = __ip6_local_out(skb);
80 err = dst_output(skb);
84 EXPORT_SYMBOL_GPL(ip6_local_out);
86 /* dev_loopback_xmit for use with netfilter. */
/*
 * Continuation used by ip6_finish_output2() for the looped-back clone of a
 * multicast packet. It resets the MAC header, pulls the data pointer up to
 * the network header, marks the clone as PACKET_LOOPBACK with
 * CHECKSUM_UNNECESSARY, and warns if the clone carries no dst entry.
 * NOTE(review): presumably the clone is then handed to the receive path -
 * confirm against the full body.
 */
87 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
89 skb_reset_mac_header(newskb);
90 __skb_pull(newskb, skb_network_offset(newskb));
91 newskb->pkt_type = PACKET_LOOPBACK;
92 newskb->ip_summed = CHECKSUM_UNNECESSARY;
93 WARN_ON(!skb_dst(newskb));
/*
 * Last step of the output path: tag the skb as ETH_P_IPV6 and hand it to
 * the neighbour attached to the dst via neigh_output().
 *
 * For multicast destinations a clone may first be looped back through the
 * NF_INET_POST_ROUTING hook (continuation ip6_dev_loopback_xmit). This
 * happens when the device is not a loopback device, sk_mc_loop() allows it,
 * and either mroute6_socket() is set for a packet that was not forwarded
 * or ipv6_chk_mcast_addr() matches the destination/source pair.
 * A multicast packet with hop_limit 0 is counted as OUTDISCARDS; otherwise
 * the OUTMCAST counters are updated.
 *
 * OUTNOROUTES is counted on the path that does not go through
 * neigh_output().
 * NOTE(review): presumably that path is taken when no neighbour is
 * attached, and it frees the skb - confirm.
 */
99 static int ip6_finish_output2(struct sk_buff *skb)
101 struct dst_entry *dst = skb_dst(skb);
102 struct net_device *dev = dst->dev;
103 struct neighbour *neigh;
105 skb->protocol = htons(ETH_P_IPV6);
108 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
109 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
111 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
112 ((mroute6_socket(dev_net(dev), skb) &&
113 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
114 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
115 &ipv6_hdr(skb)->saddr))) {
116 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
118 /* Do not check for IFF_ALLMULTI; multicast routing
119 is not supported in any case.
122 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
123 newskb, NULL, newskb->dev,
124 ip6_dev_loopback_xmit);
126 if (ipv6_hdr(skb)->hop_limit == 0) {
127 IP6_INC_STATS(dev_net(dev), idev,
128 IPSTATS_MIB_OUTDISCARDS);
134 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
139 neigh = dst_get_neighbour_noref(dst);
141 int res = neigh_output(neigh, skb);
147 IP6_INC_STATS_BH(dev_net(dst->dev),
148 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
/*
 * Decide whether the packet must be fragmented before transmission.
 * ip6_fragment() is used (with ip6_finish_output2 as the per-fragment
 * output callback) when the packet is longer than the dst MTU and is not a
 * GSO packet, or when dst_allfrag() is set for the route. Otherwise the
 * packet goes straight to ip6_finish_output2().
 */
153 static int ip6_finish_output(struct sk_buff *skb)
155 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
156 dst_allfrag(skb_dst(skb)))
157 return ip6_fragment(skb, ip6_finish_output2);
159 return ip6_finish_output2(skb);
/*
 * Output entry point for a routed IPv6 packet.
 * If IPv6 is disabled on the outgoing interface (idev->cnf.disable_ipv6),
 * OUTDISCARDS is counted.
 * NOTE(review): presumably the skb is then freed and 0 returned - confirm.
 * Otherwise the packet goes through the NF_INET_POST_ROUTING hook; the hook
 * call is conditional on the IP6SKB_REROUTED flag being clear.
 */
162 int ip6_output(struct sk_buff *skb)
164 struct net_device *dev = skb_dst(skb)->dev;
165 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
166 if (unlikely(idev->cnf.disable_ipv6)) {
167 IP6_INC_STATS(dev_net(dev), idev,
168 IPSTATS_MIB_OUTDISCARDS);
173 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
175 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
179 * xmit an sk_buff (used by TCP, SCTP and DCCP)
/*
 * Prepend the IPv6 header (and any extension headers from @opt) to @skb and
 * send it.
 *
 * @sk:     sending socket; supplies hop limit, priority and mark
 * @skb:    payload; its dst entry must already be attached
 * @fl6:    flow supplying source and destination address, flow label and
 *          the next-header protocol
 * @opt:    extension headers to push
 *          NOTE(review): presumably may be NULL - confirm
 * @tclass: traffic class, shifted into the first header word
 *
 * Headroom for the extension headers, the IPv6 header and the link layer is
 * checked first; if it is insufficient the skb is reallocated, and
 * OUTDISCARDS is counted when that fails. The hop limit is taken from the
 * socket or from the route via ip6_dst_hoplimit().
 *
 * If the packet fits the mtu, or local_df is set, or it is a GSO packet, it
 * is passed to the NF_INET_LOCAL_OUT hook with dst_output as continuation.
 * Otherwise an ICMPV6_PKT_TOOBIG is sent for it and FRAGFAILS is counted.
 */
182 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
183 struct ipv6_txoptions *opt, int tclass)
185 struct net *net = sock_net(sk);
186 struct ipv6_pinfo *np = inet6_sk(sk);
187 struct in6_addr *first_hop = &fl6->daddr;
188 struct dst_entry *dst = skb_dst(skb);
190 u8 proto = fl6->flowi6_proto;
191 int seg_len = skb->len;
196 unsigned int head_room;
198 /* First: exthdrs may take lots of space (~8K for now)
199 MAX_HEADER is not enough.
201 head_room = opt->opt_nflen + opt->opt_flen;
202 seg_len += head_room;
203 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
205 if (skb_headroom(skb) < head_room) {
206 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
208 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
209 IPSTATS_MIB_OUTDISCARDS);
215 skb_set_owner_w(skb, sk);
218 ipv6_push_frag_opts(skb, opt, &proto);
220 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
223 skb_push(skb, sizeof(struct ipv6hdr));
224 skb_reset_network_header(skb);
228 * Fill in the IPv6 header
231 hlimit = np->hop_limit;
233 hlimit = ip6_dst_hoplimit(dst);
235 *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;
237 hdr->payload_len = htons(seg_len);
238 hdr->nexthdr = proto;
239 hdr->hop_limit = hlimit;
241 hdr->saddr = fl6->saddr;
242 hdr->daddr = *first_hop;
244 skb->priority = sk->sk_priority;
245 skb->mark = sk->sk_mark;
248 if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
249 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
250 IPSTATS_MIB_OUT, skb->len);
251 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
252 dst->dev, dst_output);
255 net_dbg_ratelimited("IPv6: sending pkt_too_big to self\n");
257 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
258 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
263 EXPORT_SYMBOL(ip6_xmit);
266 * To avoid extra problems ND packets are sent through this
267 * routine. It's code duplication but I really want to avoid
268 * extra checks since ipv6_build_header is used by TCP (which
269 * is for us performance critical)
/*
 * Build a bare IPv6 header at the start of @skb for a neighbour discovery
 * packet.
 *
 * The skb is tagged ETH_P_IPV6, the network header is placed at the current
 * data start and room for the header is added with skb_put(). The first
 * header word is 0x60000000 (version 6, traffic class and flow label
 * zero). payload_len, nexthdr and hop_limit are filled from len, proto and
 * the hop limit of @sk.
 * NOTE(review): presumably @saddr and @daddr are copied into the header as
 * well - confirm against the full body.
 */
272 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
273 const struct in6_addr *saddr, const struct in6_addr *daddr,
276 struct ipv6_pinfo *np = inet6_sk(sk);
279 skb->protocol = htons(ETH_P_IPV6);
282 skb_reset_network_header(skb);
283 skb_put(skb, sizeof(struct ipv6hdr));
286 *(__be32*)hdr = htonl(0x60000000);
288 hdr->payload_len = htons(len);
289 hdr->nexthdr = proto;
290 hdr->hop_limit = np->hop_limit;
/*
 * Deliver a Router Alert packet to the raw sockets registered for it.
 *
 * ip6_ra_chain is walked under the ip6_ra_lock read lock. An entry matches
 * when it has a socket, its sel equals @sel, and the socket is either not
 * bound to a device or bound to the device the skb arrived on. Matches are
 * fed through rawv6_rcv(): earlier matches receive a clone, and the final
 * match receives the original skb.
 *
 * NOTE(review): presumably returns nonzero when the skb was consumed by a
 * socket and 0 otherwise - ip6_forward() tests the result as a boolean.
 */
298 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
300 struct ip6_ra_chain *ra;
301 struct sock *last = NULL;
303 read_lock(&ip6_ra_lock);
304 for (ra = ip6_ra_chain; ra; ra = ra->next) {
305 struct sock *sk = ra->sk;
306 if (sk && ra->sel == sel &&
307 (!sk->sk_bound_dev_if ||
308 sk->sk_bound_dev_if == skb->dev->ifindex)) {
310 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
312 rawv6_rcv(last, skb2);
319 rawv6_rcv(last, skb);
320 read_unlock(&ip6_ra_lock);
323 read_unlock(&ip6_ra_lock);
/*
 * Classify a packet addressed to a proxied neighbour for ip6_forward().
 *
 * Extension headers are skipped to find the upper-layer protocol. For
 * ICMPv6 the first byte of the ICMPv6 header is made accessible with
 * pskb_may_pull() and the message type is examined; the four neighbour
 * discovery types (router/neighbour solicitation and advertisement) are
 * singled out for local input. For other traffic, a link-local destination
 * cannot be forwarded, so dst_link_failure() is signalled.
 *
 * NOTE(review): presumably returns a positive value for the ND case, a
 * negative value for the link-local case and 0 otherwise - confirm. The
 * caller passes the packet to ip6_input() on one outcome and counts
 * INDISCARDS on a negative one.
 */
327 static int ip6_forward_proxy_check(struct sk_buff *skb)
329 struct ipv6hdr *hdr = ipv6_hdr(skb);
330 u8 nexthdr = hdr->nexthdr;
334 if (ipv6_ext_hdr(nexthdr)) {
335 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
339 offset = sizeof(struct ipv6hdr);
341 if (nexthdr == IPPROTO_ICMPV6) {
342 struct icmp6hdr *icmp6;
344 if (!pskb_may_pull(skb, (skb_network_header(skb) +
345 offset + 1 - skb->data)))
348 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
350 switch (icmp6->icmp6_type) {
351 case NDISC_ROUTER_SOLICITATION:
352 case NDISC_ROUTER_ADVERTISEMENT:
353 case NDISC_NEIGHBOUR_SOLICITATION:
354 case NDISC_NEIGHBOUR_ADVERTISEMENT:
356 /* For reaction involving unicast neighbor discovery
357 * message destined to the proxied address, pass it to
367 * The proxying router can't forward traffic sent to a link-local
368 * address, so signal the sender and discard the packet. This
369 * behavior is clarified by the MIPv6 specification.
371 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
372 dst_link_failure(skb);
/*
 * Continuation for the NF_INET_FORWARD hook in ip6_forward(): passes the
 * packet on to dst_output().
 */
379 static inline int ip6_forward_finish(struct sk_buff *skb)
381 return dst_output(skb);
/*
 * Forward a received IPv6 packet towards its next hop.
 *
 * Checks, in the order they appear below:
 *  - forwarding must be enabled (devconf_all->forwarding);
 *  - LRO-merged packets are rejected (skb_warn_if_lro);
 *  - the XFRM forward policy must accept the packet, else INDISCARDS;
 *  - only packets with pkt_type PACKET_HOST are forwarded;
 *  - Router Alert packets are offered to ip6_call_ra_chain(), keyed by the
 *    16-bit value at bytes 2 and 3 of the option;
 *  - hop_limit <= 1 triggers ICMPV6_TIME_EXCEED and INHDRERRORS;
 *  - with proxy_ndp enabled and a proxy neighbour entry for the
 *    destination, ip6_forward_proxy_check() decides between local input,
 *    discard (INDISCARDS) and normal forwarding;
 *  - xfrm6_route_forward() must succeed, else INDISCARDS;
 *  - if the packet leaves on the device it arrived on, is not source
 *    routed and has no sec path, a redirect is sent via
 *    ndisc_send_redirect(), rate limited per peer to one per second (HZ);
 *    otherwise the source address type is validated (unspecified,
 *    multicast and loopback sources are rejected, link-local sources get
 *    ICMPV6_DEST_UNREACH / ICMPV6_NOT_NEIGHBOUR);
 *  - a non-GSO packet larger than the mtu gets ICMPV6_PKT_TOOBIG and is
 *    counted as INTOOBIGERRORS and FRAGFAILS;
 *  - skb_cow() must provide link-layer headroom, else OUTDISCARDS.
 *
 * The hop limit is modified only after the copy-on-write step. On success
 * OUTFORWDATAGRAMS is counted and the packet enters the NF_INET_FORWARD
 * hook. INADDRERRORS is counted on the address error path.
 */
384 int ip6_forward(struct sk_buff *skb)
386 struct dst_entry *dst = skb_dst(skb);
387 struct ipv6hdr *hdr = ipv6_hdr(skb);
388 struct inet6_skb_parm *opt = IP6CB(skb);
389 struct net *net = dev_net(dst->dev);
392 if (net->ipv6.devconf_all->forwarding == 0)
395 if (skb_warn_if_lro(skb))
398 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
399 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
403 if (skb->pkt_type != PACKET_HOST)
406 skb_forward_csum(skb);
409 * We DO NOT make any processing on
410 * RA packets, pushing them to user level AS IS
411 * without ane WARRANTY that application will be able
412 * to interpret them. The reason is that we
413 * cannot make anything clever here.
415 * We are not end-node, so that if packet contains
416 * AH/ESP, we cannot make anything.
417 * Defragmentation also would be mistake, RA packets
418 * cannot be fragmented, because there is no warranty
419 * that different fragments will go along one path. --ANK
422 u8 *ptr = skb_network_header(skb) + opt->ra;
423 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
428 * check and decrement ttl
430 if (hdr->hop_limit <= 1) {
431 /* Force OUTPUT device used as source address */
433 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
434 IP6_INC_STATS_BH(net,
435 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
441 /* XXX: idev->cnf.proxy_ndp? */
442 if (net->ipv6.devconf_all->proxy_ndp &&
443 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
444 int proxied = ip6_forward_proxy_check(skb);
446 return ip6_input(skb);
447 else if (proxied < 0) {
448 IP6_INC_STATS(net, ip6_dst_idev(dst),
449 IPSTATS_MIB_INDISCARDS);
454 if (!xfrm6_route_forward(skb)) {
455 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
460 /* IPv6 specs say nothing about it, but it is clear that we cannot
461 send redirects to source routed frames.
462 We don't send redirects to frames decapsulated from IPsec.
464 if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
465 struct in6_addr *target = NULL;
469 * incoming and outgoing devices are the same
473 rt = (struct rt6_info *) dst;
474 if (rt->rt6i_flags & RTF_GATEWAY)
475 target = &rt->rt6i_gateway;
477 target = &hdr->daddr;
480 rt6_bind_peer(rt, 1);
482 /* Limit redirects both by destination (here)
483 and by source (inside ndisc_send_redirect)
485 if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
486 ndisc_send_redirect(skb, target);
488 int addrtype = ipv6_addr_type(&hdr->saddr);
490 /* This check is security critical. */
491 if (addrtype == IPV6_ADDR_ANY ||
492 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
494 if (addrtype & IPV6_ADDR_LINKLOCAL) {
495 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
496 ICMPV6_NOT_NEIGHBOUR, 0);
502 if (mtu < IPV6_MIN_MTU)
505 if (skb->len > mtu && !skb_is_gso(skb)) {
506 /* Again, force OUTPUT device used as source address */
508 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
509 IP6_INC_STATS_BH(net,
510 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
511 IP6_INC_STATS_BH(net,
512 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
517 if (skb_cow(skb, dst->dev->hard_header_len)) {
518 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
524 /* Mangling hops number delayed to point after skb COW */
528 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
529 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
533 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
/*
 * Copy per-packet metadata from @from to @to so that a fragment is treated
 * like the packet it was cut from.
 *
 * Copied fields: pkt_type, priority, protocol, mark, the dst entry (an
 * additional reference is taken with dst_clone), tc_index when
 * CONFIG_NET_SCHED is set, nf_trace when the netfilter TRACE target is
 * configured, and the security mark via skb_copy_secmark().
 */
539 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
541 to->pkt_type = from->pkt_type;
542 to->priority = from->priority;
543 to->protocol = from->protocol;
545 skb_dst_set(to, dst_clone(skb_dst(from)));
547 to->mark = from->mark;
549 #ifdef CONFIG_NET_SCHED
550 to->tc_index = from->tc_index;
553 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
554 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
555 to->nf_trace = from->nf_trace;
557 skb_copy_secmark(to, from);
/*
 * Walk the extension header chain that follows the fixed IPv6 header to
 * locate where a fragment header belongs.
 *
 * @skb:     packet whose network header is inspected
 * @nexthdr: out parameter; set to point at the nexthdr field of the header
 *           examined last (initially the one in the fixed IPv6 header)
 *
 * The offset starts at sizeof(struct ipv6hdr) and advances by
 * ipv6_optlen() of each header while it stays inside the packet. A routing
 * header and, with Mobile IPv6 configured, a Home Address option found by
 * ipv6_find_tlv() influence where the walk stops.
 *
 * NOTE(review): presumably returns the offset reached - ip6_fragment() uses
 * the result as the length of the unfragmentable part.
 */
560 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
562 u16 offset = sizeof(struct ipv6hdr);
563 struct ipv6_opt_hdr *exthdr =
564 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
565 unsigned int packet_len = skb->tail - skb->network_header;
567 *nexthdr = &ipv6_hdr(skb)->nexthdr;
569 while (offset + 1 <= packet_len) {
575 case NEXTHDR_ROUTING:
579 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
580 if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
590 offset += ipv6_optlen(exthdr);
591 *nexthdr = &exthdr->nexthdr;
592 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
/*
 * Choose the identification value for a fragment header and store it in
 * @fhdr in network byte order.
 *
 * When @rt is given and not flagged DST_NOPEER, the inet_peer bound to the
 * route supplies the value through inet_getid().
 * NOTE(review): presumably only when a peer could actually be bound -
 * confirm.
 * Otherwise a file-local atomic counter is advanced with a compare and
 * exchange retry loop and the new value is used.
 */
599 void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
601 static atomic_t ipv6_fragmentation_id;
604 if (rt && !(rt->dst.flags & DST_NOPEER)) {
605 struct inet_peer *peer;
608 rt6_bind_peer(rt, 1);
609 peer = rt->rt6i_peer;
611 fhdr->identification = htonl(inet_getid(peer, 0));
616 old = atomic_read(&ipv6_fragmentation_id);
620 } while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
621 fhdr->identification = htonl(new);
/*
 * Split @skb into IPv6 fragments and pass each one to @output.
 *
 * @skb:    packet to fragment; its dst entry supplies the route and mtu
 * @output: callback that transmits one finished fragment
 *
 * hlen is the length of the unfragmentable part as reported by
 * ip6_find_1stfragopt(). If local_df is not set and the packet exceeds the
 * mtu, no fragmentation is done: ICMPV6_PKT_TOOBIG is sent and FRAGFAILS is
 * counted. A smaller frag_size configured on the socket is taken into
 * account, and the mtu is then reduced by hlen plus the fragment header so
 * that it describes the payload room per fragment.
 *
 * Fast path: when the skb already carries a frag list with suitable
 * geometry (every piece fits, all but the last are multiples of 8 bytes,
 * enough headroom, not shared), the list members are reused as fragments.
 * The unfragmentable header is saved with kmemdup() and copied in front of
 * each piece together with a fragment header carrying the common
 * identification, the running offset and the IP6_MF flag where more
 * fragments follow. The slow_path_clean code undoes the destructor and
 * truesize changes made while checking the list.
 *
 * Slow path: checksums still pending as CHECKSUM_PARTIAL are resolved with
 * skb_checksum_help(), then a new skb is allocated per fragment and filled
 * with the header copy and the next block of data via skb_copy_bits().
 *
 * Statistics: FRAGCREATES per fragment, FRAGOKS on success and FRAGFAILS on
 * failure.
 */
624 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
626 struct sk_buff *frag;
627 struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
628 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
629 struct ipv6hdr *tmp_hdr;
631 unsigned int mtu, hlen, left, len;
634 int ptr, offset = 0, err=0;
635 u8 *prevhdr, nexthdr = 0;
636 struct net *net = dev_net(skb_dst(skb)->dev);
638 hlen = ip6_find_1stfragopt(skb, &prevhdr);
641 mtu = ip6_skb_dst_mtu(skb);
643 /* We must not fragment if the socket is set to force MTU discovery
644 * or if the skb it not generated by a local socket.
646 if (unlikely(!skb->local_df && skb->len > mtu)) {
647 if (skb->sk && dst_allfrag(skb_dst(skb)))
648 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
650 skb->dev = skb_dst(skb)->dev;
651 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
652 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
653 IPSTATS_MIB_FRAGFAILS);
658 if (np && np->frag_size < mtu) {
662 mtu -= hlen + sizeof(struct frag_hdr);
664 if (skb_has_frag_list(skb)) {
665 int first_len = skb_pagelen(skb);
666 struct sk_buff *frag2;
668 if (first_len - hlen > mtu ||
669 ((first_len - hlen) & 7) ||
673 skb_walk_frags(skb, frag) {
674 /* Correct geometry. */
675 if (frag->len > mtu ||
676 ((frag->len & 7) && frag->next) ||
677 skb_headroom(frag) < hlen)
678 goto slow_path_clean;
680 /* Partially cloned skb? */
681 if (skb_shared(frag))
682 goto slow_path_clean;
687 frag->destructor = sock_wfree;
689 skb->truesize -= frag->truesize;
694 frag = skb_shinfo(skb)->frag_list;
695 skb_frag_list_init(skb);
698 *prevhdr = NEXTHDR_FRAGMENT;
699 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
701 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
702 IPSTATS_MIB_FRAGFAILS);
706 __skb_pull(skb, hlen);
707 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
708 __skb_push(skb, hlen);
709 skb_reset_network_header(skb);
710 memcpy(skb_network_header(skb), tmp_hdr, hlen);
712 ipv6_select_ident(fh, rt);
713 fh->nexthdr = nexthdr;
715 fh->frag_off = htons(IP6_MF);
716 frag_id = fh->identification;
718 first_len = skb_pagelen(skb);
719 skb->data_len = first_len - skb_headlen(skb);
720 skb->len = first_len;
721 ipv6_hdr(skb)->payload_len = htons(first_len -
722 sizeof(struct ipv6hdr));
727 /* Prepare header of the next frame,
728 * before previous one went down. */
730 frag->ip_summed = CHECKSUM_NONE;
731 skb_reset_transport_header(frag);
732 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
733 __skb_push(frag, hlen);
734 skb_reset_network_header(frag);
735 memcpy(skb_network_header(frag), tmp_hdr,
737 offset += skb->len - hlen - sizeof(struct frag_hdr);
738 fh->nexthdr = nexthdr;
740 fh->frag_off = htons(offset);
741 if (frag->next != NULL)
742 fh->frag_off |= htons(IP6_MF);
743 fh->identification = frag_id;
744 ipv6_hdr(frag)->payload_len =
746 sizeof(struct ipv6hdr));
747 ip6_copy_metadata(frag, skb);
752 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
753 IPSTATS_MIB_FRAGCREATES);
766 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
767 IPSTATS_MIB_FRAGOKS);
768 dst_release(&rt->dst);
778 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
779 IPSTATS_MIB_FRAGFAILS);
780 dst_release(&rt->dst);
784 skb_walk_frags(skb, frag2) {
788 frag2->destructor = NULL;
789 skb->truesize += frag2->truesize;
794 if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
795 skb_checksum_help(skb))
798 left = skb->len - hlen; /* Space per frame */
799 ptr = hlen; /* Where to start from */
802 * Fragment the datagram.
805 *prevhdr = NEXTHDR_FRAGMENT;
806 hroom = LL_RESERVED_SPACE(rt->dst.dev);
807 troom = rt->dst.dev->needed_tailroom;
810 * Keep copying data until we run out.
814 /* IF: it doesn't fit, use 'mtu' - the data space left */
817 /* IF: we are not sending up to and including the packet end
818 then align the next start on an eight byte boundary */
826 if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
827 hroom + troom, GFP_ATOMIC)) == NULL) {
828 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
829 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
830 IPSTATS_MIB_FRAGFAILS);
836 * Set up data on packet
839 ip6_copy_metadata(frag, skb);
840 skb_reserve(frag, hroom);
841 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
842 skb_reset_network_header(frag);
843 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
844 frag->transport_header = (frag->network_header + hlen +
845 sizeof(struct frag_hdr));
848 * Charge the memory for the fragment to any owner
852 skb_set_owner_w(frag, skb->sk);
855 * Copy the packet header into the new buffer.
857 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
860 * Build fragment header.
862 fh->nexthdr = nexthdr;
865 ipv6_select_ident(fh, rt);
866 frag_id = fh->identification;
868 fh->identification = frag_id;
871 * Copy a block of the IP datagram.
873 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
877 fh->frag_off = htons(offset);
879 fh->frag_off |= htons(IP6_MF);
880 ipv6_hdr(frag)->payload_len = htons(frag->len -
881 sizeof(struct ipv6hdr));
887 * Put this fragment into the sending queue.
893 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
894 IPSTATS_MIB_FRAGCREATES);
896 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
897 IPSTATS_MIB_FRAGOKS);
902 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
903 IPSTATS_MIB_FRAGFAILS);
/*
 * Test whether a cached route key fails to cover the flow address.
 *
 * @rt_key:     destination or source key of the cached route
 * @fl_addr:    address from the flow being sent
 * @addr_cache: last address used with the cached route, may be NULL
 *
 * Returns nonzero (mismatch) when the key is not a /128 entry equal to
 * @fl_addr and, in addition, @addr_cache is NULL or differs from @fl_addr.
 * Returns 0 when either of the two matches.
 */
908 static inline int ip6_rt_check(const struct rt6key *rt_key,
909 const struct in6_addr *fl_addr,
910 const struct in6_addr *addr_cache)
912 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
913 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
/*
 * Validate a dst entry cached on the socket against the flow about to be
 * sent.
 *
 * The cached route is rejected when ip6_rt_check() reports a mismatch for
 * the destination key (and, with CONFIG_IPV6_SUBTREES, for the source key),
 * or when the flow names an output interface that differs from the device
 * of the dst.
 * NOTE(review): presumably a rejected dst is released and NULL is returned,
 * and an accepted dst is returned unchanged - confirm.
 */
916 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
917 struct dst_entry *dst,
918 const struct flowi6 *fl6)
920 struct ipv6_pinfo *np = inet6_sk(sk);
921 struct rt6_info *rt = (struct rt6_info *)dst;
926 /* Yes, checking route validity in not connected
927 * case is not very simple. Take into account,
928 * that we do not support routing by source, TOS,
929 * and MSG_DONTROUTE --ANK (980726)
931 * 1. ip6_rt_check(): If route was host route,
932 * check that cached destination is current.
933 * If it is network route, we still may
934 * check its validity using saved pointer
935 * to the last used address: daddr_cache.
936 * We do not want to save whole address now,
937 * (because main consumer of this service
938 * is tcp, which has not this problem),
939 * so that the last trick works only on connected
941 * 2. oif also should be the same.
943 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
944 #ifdef CONFIG_IPV6_SUBTREES
945 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
947 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
/*
 * Common tail of the route lookup helpers.
 *
 * @sk:  socket supplying the network namespace and source preferences
 * @dst: in/out pointer to the dst entry; filled by ip6_route_output()
 * @fl6: flow to look up; its saddr is filled in when it is unspecified
 *
 * Any error recorded in the looked-up dst is returned after releasing it.
 * If the flow has no source address, one is selected with
 * ip6_route_get_saddr() using the socket source preferences.
 *
 * With CONFIG_IPV6_OPTIMISTIC_DAD: when the neighbour of the dst is not in
 * a NUD_VALID state and the flow source address is flagged
 * IFA_F_OPTIMISTIC, the lookup is repeated with a zeroed destination so
 * that the route of the default router is used instead.
 *
 * On the error path OUTNOROUTES is counted for -ENETUNREACH.
 * NOTE(review): presumably returns 0 on success and leaves *dst cleared on
 * failure - confirm.
 */
956 static int ip6_dst_lookup_tail(struct sock *sk,
957 struct dst_entry **dst, struct flowi6 *fl6)
959 struct net *net = sock_net(sk);
960 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
966 *dst = ip6_route_output(net, sk, fl6);
968 if ((err = (*dst)->error))
969 goto out_err_release;
971 if (ipv6_addr_any(&fl6->saddr)) {
972 struct rt6_info *rt = (struct rt6_info *) *dst;
973 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
974 sk ? inet6_sk(sk)->srcprefs : 0,
977 goto out_err_release;
980 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
982 * Here if the dst entry we've looked up
983 * has a neighbour entry that is in the INCOMPLETE
984 * state and the src address from the flow is
985 * marked as OPTIMISTIC, we release the found
986 * dst entry and replace it instead with the
987 * dst entry of the nexthop router
990 n = dst_get_neighbour_noref(*dst);
991 if (n && !(n->nud_state & NUD_VALID)) {
992 struct inet6_ifaddr *ifp;
993 struct flowi6 fl_gw6;
997 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1000 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1006 * We need to get the dst entry for the
1007 * default router instead
1010 memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1011 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1012 *dst = ip6_route_output(net, sk, &fl_gw6);
1013 if ((err = (*dst)->error))
1014 goto out_err_release;
1024 if (err == -ENETUNREACH)
1025 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1032 * ip6_dst_lookup - perform route lookup on flow
1033 * @sk: socket which provides route info
1034 * @dst: pointer to dst_entry * for result
1035 * @fl6: flow to lookup
1037 * This function performs a route lookup on the given flow.
1039 * It returns zero on success, or a standard errno code on error.
/*
 * Public route lookup entry point: the work is delegated to
 * ip6_dst_lookup_tail() and its result is returned unchanged.
 * NOTE(review): presumably *dst is reset before the call - confirm.
 * Exported for GPL modules.
 */
1041 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
1044 return ip6_dst_lookup_tail(sk, dst, fl6);
1046 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1049 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1050 * @sk: socket which provides route info
1051 * @fl6: flow to lookup
1052 * @final_dst: final destination address for ipsec lookup
1053 * @can_sleep: we are in a sleepable context
1055 * This function performs a route lookup on the given flow.
1057 * It returns a valid dst pointer on success, or a pointer encoded
/*
 * Route lookup followed by an IPsec (xfrm) lookup.
 *
 * The route is resolved with ip6_dst_lookup_tail(); a failure is returned
 * as an ERR_PTR() encoded pointer. The flow destination is then replaced
 * by *final_dst, FLOWI_FLAG_CAN_SLEEP may be set in the flow flags, and the
 * result of xfrm_lookup() is returned.
 * NOTE(review): presumably the destination is only replaced when final_dst
 * is non-NULL, and the flag is only set when the caller may sleep -
 * confirm.
 * Exported for GPL modules.
 */
1060 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1061 const struct in6_addr *final_dst,
1064 struct dst_entry *dst = NULL;
1067 err = ip6_dst_lookup_tail(sk, &dst, fl6);
1069 return ERR_PTR(err);
1071 fl6->daddr = *final_dst;
1073 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1075 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1077 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1080 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1081 * @sk: socket which provides the dst cache and route info
1082 * @fl6: flow to lookup
1083 * @final_dst: final destination address for ipsec lookup
1084 * @can_sleep: we are in a sleepable context
1086 * This function performs a route lookup on the given flow with the
1087 * possibility of using the cached route in the socket if it is valid.
1088 * It will take the socket dst lock when operating on the dst cache.
1089 * As a result, this function can only be used in process context.
1091 * It returns a valid dst pointer on success, or a pointer encoded
/*
 * Same as ip6_dst_lookup_flow(), but starts from the dst cached on the
 * socket.
 *
 * The cached entry is fetched with sk_dst_check() using the socket
 * dst_cookie and validated against the flow with ip6_sk_dst_check(). The
 * result is passed on to ip6_dst_lookup_tail(); a failure is returned as an
 * ERR_PTR() encoded pointer. The flow destination is then replaced by
 * *final_dst, FLOWI_FLAG_CAN_SLEEP may be set, and the result of
 * xfrm_lookup() is returned.
 * NOTE(review): presumably the destination is only replaced when final_dst
 * is non-NULL, and the flag is only set when the caller may sleep -
 * confirm.
 * Exported for GPL modules.
 */
1094 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1095 const struct in6_addr *final_dst,
1098 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1101 dst = ip6_sk_dst_check(sk, dst, fl6);
1103 err = ip6_dst_lookup_tail(sk, &dst, fl6);
1105 return ERR_PTR(err);
1107 fl6->daddr = *final_dst;
1109 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1111 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1113 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
/*
 * Append data for a device with UDP fragmentation offload: the whole
 * datagram is collected in a single skb and the device splits it.
 *
 * If the write queue is empty, a new skb is allocated with room for the
 * link-layer header, the IPv6 and transport headers and 20 spare bytes.
 * The headers are reserved, the network and transport header offsets are
 * set, and the skb is marked CHECKSUM_PARTIAL. The payload is attached as
 * page fragments with skb_append_datato_frags().
 *
 * On success gso_size is set to the largest multiple of 8 that fits in the
 * mtu after the headers and a fragment header, gso_type is SKB_GSO_UDP, and
 * a fragment identification is picked with ipv6_select_ident() and stored
 * in ip6_frag_id before the skb is queued on sk_write_queue.
 * NOTE(review): presumably returns 0 on success and the error from the
 * allocation or append step otherwise - confirm.
 */
1115 static inline int ip6_ufo_append_data(struct sock *sk,
1116 int getfrag(void *from, char *to, int offset, int len,
1117 int odd, struct sk_buff *skb),
1118 void *from, int length, int hh_len, int fragheaderlen,
1119 int transhdrlen, int mtu,unsigned int flags,
1120 struct rt6_info *rt)
1123 struct sk_buff *skb;
1126 /* There is support for UDP large send offload by network
1127 * device, so create one single skb packet containing complete
1130 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1131 skb = sock_alloc_send_skb(sk,
1132 hh_len + fragheaderlen + transhdrlen + 20,
1133 (flags & MSG_DONTWAIT), &err);
1137 /* reserve space for Hardware header */
1138 skb_reserve(skb, hh_len);
1140 /* create space for UDP/IP header */
1141 skb_put(skb,fragheaderlen + transhdrlen);
1143 /* initialize network header pointer */
1144 skb_reset_network_header(skb);
1146 /* initialize protocol header pointer */
1147 skb->transport_header = skb->network_header + fragheaderlen;
1149 skb->ip_summed = CHECKSUM_PARTIAL;
1153 err = skb_append_datato_frags(sk,skb, getfrag, from,
1154 (length - transhdrlen));
1156 struct frag_hdr fhdr;
1158 /* Specify the length of each IPv6 datagram fragment.
1159 * It has to be a multiple of 8.
1161 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1162 sizeof(struct frag_hdr)) & ~7;
1163 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1164 ipv6_select_ident(&fhdr, rt);
1165 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1166 __skb_queue_tail(&sk->sk_write_queue, skb);
1170 /* There is not enough support do UPD LSO,
1171 * so follow normal path
/*
 * Duplicate an extension option header with kmemdup().
 * The copy covers (hdrlen + 1) * 8 bytes. A NULL @src yields NULL; NULL is
 * also what kmemdup() returns when the allocation fails.
 */
1178 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1181 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
/*
 * Duplicate a routing header with kmemdup().
 * The copy covers (hdrlen + 1) * 8 bytes. A NULL @src yields NULL; NULL is
 * also what kmemdup() returns when the allocation fails.
 */
1184 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1187 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
/*
 * Queue data on the socket write queue, building the skbs that
 * ip6_push_pending_frames() will later send.
 *
 * @sk:          socket owning the write queue and the cork state
 * @getfrag:     callback that copies len bytes from @from at offset into
 *               the destination buffer; a negative result is an error
 * @from:        opaque source handed to @getfrag
 * @length:      number of payload bytes to append
 * @transhdrlen: transport header length to reserve in the first skb
 * @hlimit:      hop limit stored in the cork on the first call
 * @tclass:      traffic class stored in the cork on the first call
 * @opt:         extension headers; duplicated into the cork on the first
 *               call
 * @fl6:         flow; copied into the cork on the first call
 * @rt:          route; stored in the cork on the first call
 * @flags:       MSG_* flags (MSG_PROBE, MSG_MORE and MSG_DONTWAIT are
 *               examined)
 * @dontfrag:    when set, UDP and RAW sockets get a path MTU notification
 *               via ipv6_local_rxpmtu() instead of fragmentation
 *
 * First call (empty write queue): the cork is set up from @opt, @rt, @fl6,
 * @hlimit and @tclass, and the fragment size is derived from the route or
 * device mtu and the socket frag_size. Later calls reuse the corked route,
 * flow and mtu.
 *
 * A total length beyond the maximum IPv6 payload is reported with
 * ipv6_local_error(EMSGSIZE). UDP over a device with NETIF_F_UFO is handed
 * to ip6_ufo_append_data(). Otherwise data is appended to the tail skb,
 * new skbs are allocated whenever the current one is full, and the bytes
 * beyond the 8-byte aligned fragment boundary (fraggap) are moved from the
 * previous skb into the new one. On scatter-gather devices the data is
 * placed in page fragments.
 *
 * On the error path the added length is subtracted from cork->length again
 * and OUTDISCARDS is counted.
 */
1190 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1191 int offset, int len, int odd, struct sk_buff *skb),
1192 void *from, int length, int transhdrlen,
1193 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1194 struct rt6_info *rt, unsigned int flags, int dontfrag)
1196 struct inet_sock *inet = inet_sk(sk);
1197 struct ipv6_pinfo *np = inet6_sk(sk);
1198 struct inet_cork *cork;
1199 struct sk_buff *skb;
1200 unsigned int maxfraglen, fragheaderlen;
1210 if (flags&MSG_PROBE)
1212 cork = &inet->cork.base;
1213 if (skb_queue_empty(&sk->sk_write_queue)) {
1218 if (WARN_ON(np->cork.opt))
/*
 * The container must be zeroed: if one of the duplications below fails,
 * the remaining pointer members are never assigned, and
 * ip6_cork_release() passes every member to kfree(). With kzalloc()
 * those members are NULL, which kfree() accepts.
 */
1221 np->cork.opt = kzalloc(opt->tot_len, sk->sk_allocation);
1222 if (unlikely(np->cork.opt == NULL))
1225 np->cork.opt->tot_len = opt->tot_len;
1226 np->cork.opt->opt_flen = opt->opt_flen;
1227 np->cork.opt->opt_nflen = opt->opt_nflen;
1229 np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1231 if (opt->dst0opt && !np->cork.opt->dst0opt)
1234 np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1236 if (opt->dst1opt && !np->cork.opt->dst1opt)
1239 np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1241 if (opt->hopopt && !np->cork.opt->hopopt)
1244 np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1246 if (opt->srcrt && !np->cork.opt->srcrt)
1249 /* need source address above miyazawa*/
1252 cork->dst = &rt->dst;
1253 inet->cork.fl.u.ip6 = *fl6;
1254 np->cork.hop_limit = hlimit;
1255 np->cork.tclass = tclass;
1256 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1257 rt->dst.dev->mtu : dst_mtu(&rt->dst);
1258 if (np->frag_size < mtu) {
1260 mtu = np->frag_size;
1262 cork->fragsize = mtu;
1263 if (dst_allfrag(rt->dst.path))
1264 cork->flags |= IPCORK_ALLFRAG;
1266 sk->sk_sndmsg_page = NULL;
1267 sk->sk_sndmsg_off = 0;
1268 exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len;
1269 length += exthdrlen;
1270 transhdrlen += exthdrlen;
1271 dst_exthdrlen = rt->dst.header_len;
1273 rt = (struct rt6_info *)cork->dst;
1274 fl6 = &inet->cork.fl.u.ip6;
1279 mtu = cork->fragsize;
1282 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1284 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1285 (opt ? opt->opt_nflen : 0);
1286 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1288 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1289 if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1290 ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1295 /* For UDP, check if TX timestamp is enabled */
1296 if (sk->sk_type == SOCK_DGRAM) {
1297 err = sock_tx_timestamp(sk, &tx_flags);
1303 * Let's try using as much space as possible.
1304 * Use MTU if total length of the message fits into the MTU.
1305 * Otherwise, we need to reserve fragment header and
1306 * fragment alignment (= 8-15 octets, in total).
1308 * Note that we may need to "move" the data from the tail of
1309 * of the buffer to the new fragment when we split
1312 * FIXME: It may be fragmented into multiple chunks
1313 * at once if non-fragmentable extension headers
1318 cork->length += length;
1320 int proto = sk->sk_protocol;
1321 if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
1322 ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1326 if (proto == IPPROTO_UDP &&
1327 (rt->dst.dev->features & NETIF_F_UFO)) {
1329 err = ip6_ufo_append_data(sk, getfrag, from, length,
1330 hh_len, fragheaderlen,
1331 transhdrlen, mtu, flags, rt);
1338 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1341 while (length > 0) {
1342 /* Check if the remaining data fits into current packet. */
1343 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1345 copy = maxfraglen - skb->len;
1349 unsigned int datalen;
1350 unsigned int fraglen;
1351 unsigned int fraggap;
1352 unsigned int alloclen;
1353 struct sk_buff *skb_prev;
1357 /* There's no room in the current skb */
1359 fraggap = skb_prev->len - maxfraglen;
1364 * If remaining data exceeds the mtu,
1365 * we know we need more fragment(s).
1367 datalen = length + fraggap;
1368 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1369 datalen = maxfraglen - fragheaderlen;
1371 fraglen = datalen + fragheaderlen;
1372 if ((flags & MSG_MORE) &&
1373 !(rt->dst.dev->features&NETIF_F_SG))
1376 alloclen = datalen + fragheaderlen;
1378 alloclen += dst_exthdrlen;
1381 * The last fragment gets additional space at tail.
1382 * Note: we overallocate on fragments with MSG_MORE
1383 * because we have no idea if we're the last one.
1385 if (datalen == length + fraggap)
1386 alloclen += rt->dst.trailer_len;
1389 * We just reserve space for fragment header.
1390 * Note: this may be overallocation if the message
1391 * (without MSG_MORE) fits into the MTU.
1393 alloclen += sizeof(struct frag_hdr);
1396 skb = sock_alloc_send_skb(sk,
1398 (flags & MSG_DONTWAIT), &err);
1401 if (atomic_read(&sk->sk_wmem_alloc) <=
1403 skb = sock_wmalloc(sk,
1404 alloclen + hh_len, 1,
1406 if (unlikely(skb == NULL))
1409 /* Only the initial fragment
1418 * Fill in the control structures
1420 skb->ip_summed = CHECKSUM_NONE;
1422 /* reserve for fragmentation and ipsec header */
1423 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1426 if (sk->sk_type == SOCK_DGRAM)
1427 skb_shinfo(skb)->tx_flags = tx_flags;
1430 * Find where to start putting bytes
1432 data = skb_put(skb, fraglen);
1433 skb_set_network_header(skb, exthdrlen);
1434 data += fragheaderlen;
1435 skb->transport_header = (skb->network_header +
1438 skb->csum = skb_copy_and_csum_bits(
1439 skb_prev, maxfraglen,
1440 data + transhdrlen, fraggap, 0);
1441 skb_prev->csum = csum_sub(skb_prev->csum,
1444 pskb_trim_unique(skb_prev, maxfraglen);
1446 copy = datalen - transhdrlen - fraggap;
1452 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1459 length -= datalen - fraggap;
1465 * Put the packet on the pending queue
1467 __skb_queue_tail(&sk->sk_write_queue, skb);
1474 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1478 if (getfrag(from, skb_put(skb, copy),
1479 offset, copy, off, skb) < 0) {
1480 __skb_trim(skb, off);
1485 int i = skb_shinfo(skb)->nr_frags;
1486 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1487 struct page *page = sk->sk_sndmsg_page;
1488 int off = sk->sk_sndmsg_off;
1491 if (page && (left = PAGE_SIZE - off) > 0) {
1494 if (page != skb_frag_page(frag)) {
1495 if (i == MAX_SKB_FRAGS) {
1499 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1500 skb_frag_ref(skb, i);
1501 frag = &skb_shinfo(skb)->frags[i];
1503 } else if(i < MAX_SKB_FRAGS) {
1504 if (copy > PAGE_SIZE)
1506 page = alloc_pages(sk->sk_allocation, 0);
1511 sk->sk_sndmsg_page = page;
1512 sk->sk_sndmsg_off = 0;
1514 skb_fill_page_desc(skb, i, page, 0, 0);
1515 frag = &skb_shinfo(skb)->frags[i];
1521 skb_frag_address(frag) + skb_frag_size(frag),
1522 offset, copy, skb->len, skb) < 0) {
1526 sk->sk_sndmsg_off += copy;
1527 skb_frag_size_add(frag, copy);
1529 skb->data_len += copy;
1530 skb->truesize += copy;
1531 atomic_add(copy, &sk->sk_wmem_alloc);
1538 cork->length -= length;
1539 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1542 EXPORT_SYMBOL_GPL(ip6_append_data);
/*
 * ip6_cork_release - tear down the corking state kept on a socket.
 * @inet: socket whose cork.base.dst, cork.base.flags and cork.fl are reset
 * @np:   IPv6 socket state whose cork.opt (and its sub-buffers) are freed
 *
 * Frees the corked tx options, releases the cached route, clears the
 * IPCORK_ALLFRAG flag and zeroes the corked flow.
 *
 * NOTE(review): the kfree() calls below dereference np->cork.opt; the
 * condition guarding them is not visible in this view -- confirm it
 * checks for a non-NULL np->cork.opt.
 */
1544 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
/* Free each separately allocated option header, then the container. */
1547 kfree(np->cork.opt->dst0opt);
1548 kfree(np->cork.opt->dst1opt);
1549 kfree(np->cork.opt->hopopt);
1550 kfree(np->cork.opt->srcrt);
1551 kfree(np->cork.opt);
/* Clear the pointer so the freed options cannot be reused. */
1552 np->cork.opt = NULL;
/* Release the corked route, if any, and forget the all-fragments flag. */
1555 if (inet->cork.base.dst) {
1556 dst_release(inet->cork.base.dst);
1557 inet->cork.base.dst = NULL;
1558 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
/* Wipe the whole corked flow description. */
1560 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
/*
 * ip6_push_pending_frames - build one packet from the socket's write
 * queue and hand it to ip6_local_out().
 * @sk: socket whose sk_write_queue and corking state are consumed
 *
 * The first queued skb becomes the head; every further queued skb is
 * linked onto the head's frag_list.  Extension headers and the IPv6
 * header are then pushed in front of the head, using the corked flow,
 * options, traffic class and hop limit.  The cork state is released via
 * ip6_cork_release().
 *
 * NOTE(review): the declaration of 'err', the return statement and the
 * labels/gotos of the error path are not visible in this view; the
 * result presumably is 'err' as set by ip6_local_out()/net_xmit_errno()
 * -- confirm against the full function.
 */
1563 int ip6_push_pending_frames(struct sock *sk)
1565 struct sk_buff *skb, *tmp_skb;
1566 struct sk_buff **tail_skb;
/* final_dst starts out pointing at local storage; it is passed by
 * address to ipv6_push_nfrag_opts() below, which may redirect it. */
1567 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1568 struct inet_sock *inet = inet_sk(sk);
1569 struct ipv6_pinfo *np = inet6_sk(sk);
1570 struct net *net = sock_net(sk);
1571 struct ipv6hdr *hdr;
/* Everything below is taken from the corked state, not from the caller. */
1572 struct ipv6_txoptions *opt = np->cork.opt;
1573 struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1574 struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1575 unsigned char proto = fl6->flowi6_proto;
/* Take the first queued skb as the head.  The branch taken when the
 * queue is empty is not visible in this view. */
1578 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
/* tail_skb always points at the link where the next skb is attached;
 * it starts at the head's frag_list. */
1580 tail_skb = &(skb_shinfo(skb)->frag_list);
1582 /* move skb->data to ip header from ext header */
1583 if (skb->data < skb_network_header(skb))
1584 __skb_pull(skb, skb_network_offset(skb));
/* Drain the rest of the queue, chaining each skb behind the head. */
1585 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
/* The pull length is computed from the head skb, not from tmp_skb. */
1586 __skb_pull(tmp_skb, skb_network_header_len(skb));
1587 *tail_skb = tmp_skb;
1588 tail_skb = &(tmp_skb->next);
/* Account the chained skb's bytes and memory on the head. */
1589 skb->len += tmp_skb->len;
1590 skb->data_len += tmp_skb->len;
1591 skb->truesize += tmp_skb->truesize;
/* truesize was just moved to the head, so the chained skb no longer
 * carries its own destructor. */
1592 tmp_skb->destructor = NULL;
1596 /* Allow local fragmentation. */
/* NOTE(review): the statement controlled by this test is not visible
 * in this view. */
1597 if (np->pmtudisc < IPV6_PMTUDISC_DO)
/* Seed the final destination from the corked flow. */
1600 *final_dst = fl6->daddr;
1601 __skb_pull(skb, skb_network_header_len(skb));
/* Push extension headers only when the corresponding option length is
 * non-zero; both helpers receive &proto and so may rewrite it. */
1602 if (opt && opt->opt_flen)
1603 ipv6_push_frag_opts(skb, opt, &proto);
1604 if (opt && opt->opt_nflen)
1605 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
/* Make room for the fixed IPv6 header and mark it as the network header. */
1607 skb_push(skb, sizeof(struct ipv6hdr));
1608 skb_reset_network_header(skb);
1609 hdr = ipv6_hdr(skb);
/* First 32-bit word of the header: 0x60000000 (presumably the version
 * nibble 6 -- confirm) with the corked traffic class shifted to bit 20,
 * converted with htonl() and ORed with the flow label.  The flow label
 * is not converted here, so it is presumably already in network byte
 * order -- confirm. */
1611 *(__be32*)hdr = fl6->flowlabel |
1612 htonl(0x60000000 | ((int)np->cork.tclass << 20));
1614 hdr->hop_limit = np->cork.hop_limit;
/* proto and final_dst reflect any changes made by the option helpers. */
1615 hdr->nexthdr = proto;
1616 hdr->saddr = fl6->saddr;
1617 hdr->daddr = *final_dst;
/* Inherit priority and mark from the socket. */
1619 skb->priority = sk->sk_priority;
1620 skb->mark = sk->sk_mark;
/* Attach the corked route to the skb through dst_clone(). */
1622 skb_dst_set(skb, dst_clone(&rt->dst));
1623 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
/* ICMPv6 gets additional per-type and total output message counters. */
1624 if (proto == IPPROTO_ICMPV6) {
1625 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1627 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1628 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
/* Send the assembled packet. */
1631 err = ip6_local_out(skb);
/* NOTE(review): the conditions under which the result is translated
 * with net_xmit_errno() are not visible in this view. */
1634 err = net_xmit_errno(err);
/* Release the cork state (options, route, flow). */
1640 ip6_cork_release(inet, np);
/* Counts an output discard; presumably reached only on the error path
 * -- the surrounding labels are not visible in this view. */
1643 IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1646 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
/*
 * ip6_flush_pending_frames - throw away everything queued on the socket's
 * write queue and release the corking state.
 * @sk: socket whose sk_write_queue is emptied
 *
 * Each queued skb is removed from the tail of the queue and counted as an
 * output discard, after which ip6_cork_release() is called.
 *
 * NOTE(review): any guard around the statistics update and the freeing
 * of each dequeued skb are not visible in this view -- confirm against
 * the full function.
 */
1648 void ip6_flush_pending_frames(struct sock *sk)
1650 struct sk_buff *skb;
/* Drain the queue from the tail until it is empty. */
1652 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
/* Charge the discard to the inet6_dev of the skb's own dst. */
1654 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1655 IPSTATS_MIB_OUTDISCARDS);
/* Drop the corked options, route and flow for this socket. */
1659 ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1661 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);