2 * IPv6 output functions
3 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * $Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
10 * Based on linux/net/ipv4/ip_output.c
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
18 * A.N.Kuznetsov : arithmetic in fragmentation.
19 * extension headers are implemented.
20 * route changes now work.
21 * ip6_forward does not confuse sniffers.
24 * H. von Brand : Added missing #include <linux/string.h>
25 * Imran Patel : frag id should be in NBO
26 * Kazunori MIYAZAWA @USAGI
27 * : add ip6_append_data and related functions
31 #include <linux/errno.h>
32 #include <linux/kernel.h>
33 #include <linux/string.h>
34 #include <linux/socket.h>
35 #include <linux/net.h>
36 #include <linux/netdevice.h>
37 #include <linux/if_arp.h>
38 #include <linux/in6.h>
39 #include <linux/tcp.h>
40 #include <linux/route.h>
41 #include <linux/module.h>
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
57 #include <net/checksum.h>
59 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
/*
 * ipv6_select_ident - hand out the next IPv6 fragment Identification value.
 * Writes the id into @fhdr in network byte order under ip6_id_lock; the
 * global counter wraps past 0 so a zero id is never issued.
 * NOTE(review): this extract omits some original lines (braces); comments
 * describe only the code visible here.
 */
61 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
63 static u32 ipv6_fragmentation_id = 1;
64 static DEFINE_SPINLOCK(ip6_id_lock);
66 spin_lock_bh(&ip6_id_lock);
67 fhdr->identification = htonl(ipv6_fragmentation_id); /* id must be in NBO */
68 if (++ipv6_fragmentation_id == 0)
69 ipv6_fragmentation_id = 1; /* skip 0 on wrap-around */
70 spin_unlock_bh(&ip6_id_lock);
/*
 * __ip6_local_out - finalize a locally generated packet and pass it to the
 * NF_INET_LOCAL_OUT netfilter hook.  Fills in payload_len from skb->len
 * (minus the fixed IPv6 header).
 * NOTE(review): the branch taken when len > IPV6_MAXPLEN is not visible in
 * this extract — presumably the jumbogram case; confirm against full source.
 */
73 int __ip6_local_out(struct sk_buff *skb)
77 len = skb->len - sizeof(struct ipv6hdr);
78 if (len > IPV6_MAXPLEN)
80 ipv6_hdr(skb)->payload_len = htons(len);
82 return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev,
/*
 * ip6_local_out - run __ip6_local_out() and, on success (per the visible
 * flow), continue output via dst_output().
 * NOTE(review): the condition guarding the dst_output() call is missing
 * from this extract.
 */
86 int ip6_local_out(struct sk_buff *skb)
90 err = __ip6_local_out(skb);
92 err = dst_output(skb);
96 EXPORT_SYMBOL_GPL(ip6_local_out);
/*
 * ip6_output_finish - final transmit step: use the cached hardware header
 * if present, otherwise go through the neighbour's output function; with
 * neither available, count the packet as OUTNOROUTES.
 * NOTE(review): the hh-test condition and the drop path are not visible in
 * this extract.
 */
98 static int ip6_output_finish(struct sk_buff *skb)
100 struct dst_entry *dst = skb->dst;
103 return neigh_hh_output(dst->hh, skb);
104 else if (dst->neighbour)
105 return dst->neighbour->output(skb);
107 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
113 /* dev_loopback_xmit for use with netfilter. */
/*
 * Loop a cloned multicast packet back to the local stack: reset the MAC
 * header, strip to the network header, and mark it PACKET_LOOPBACK with
 * checksum already verified.
 */
114 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
116 skb_reset_mac_header(newskb);
117 __skb_pull(newskb, skb_network_offset(newskb));
118 newskb->pkt_type = PACKET_LOOPBACK;
119 newskb->ip_summed = CHECKSUM_UNNECESSARY;
120 BUG_TRAP(newskb->dst); /* a loopback skb must carry a dst */
/*
 * ip6_output2 - device-level output.  For multicast destinations: loop a
 * clone back to the local stack (via POST_ROUTING + ip6_dev_loopback_xmit)
 * when the group is joined locally and mc_loop allows it, and discard
 * packets whose hop limit is already 0.  All traffic then passes the
 * NF_INET_POST_ROUTING hook.
 * NOTE(review): extract omits several lines (braces, clone NULL check);
 * comments cover visible code only.
 */
127 static int ip6_output2(struct sk_buff *skb)
129 struct dst_entry *dst = skb->dst;
130 struct net_device *dev = dst->dev;
132 skb->protocol = htons(ETH_P_IPV6);
135 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
136 struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
137 struct inet6_dev *idev = ip6_dst_idev(skb->dst);
/* loop back only if not a loopback device, the socket wants mc_loop, and
 * the group is joined on this interface */
139 if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
140 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
141 &ipv6_hdr(skb)->saddr)) {
142 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
144 /* Do not check for IFF_ALLMULTI; multicast routing
145 is not supported in any case.
148 NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
150 ip6_dev_loopback_xmit);
152 if (ipv6_hdr(skb)->hop_limit == 0) {
153 IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
159 IP6_INC_STATS(idev, IPSTATS_MIB_OUTMCASTPKTS);
162 return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
/*
 * ip6_skb_dst_mtu - effective MTU for this skb's route: the raw device MTU
 * when the socket probes PMTU itself (IPV6_PMTUDISC_PROBE), otherwise the
 * dst's cached path MTU.
 */
166 static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
168 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
170 return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
171 skb->dst->dev->mtu : dst_mtu(skb->dst);
/*
 * ip6_output - dst_output entry point: fragment when the (non-GSO) packet
 * exceeds the route MTU or the route demands fragmentation of everything
 * (dst_allfrag); otherwise transmit directly via ip6_output2().
 */
174 int ip6_output(struct sk_buff *skb)
176 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
177 dst_allfrag(skb->dst))
178 return ip6_fragment(skb, ip6_output2);
180 return ip6_output2(skb);
184 * xmit an sk_buff (used by TCP)
/*
 * ip6_xmit - build the IPv6 header (plus any extension headers from @opt)
 * on an sk_buff and send it through NF_INET_LOCAL_OUT, or generate a local
 * PKT_TOOBIG error when it does not fit the MTU and @ipfragok/GSO do not
 * apply.
 * NOTE(review): this extract is missing many lines (hdr/mtu declarations,
 * several branches); comments describe only what is visible.
 */
187 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
188 struct ipv6_txoptions *opt, int ipfragok)
190 struct ipv6_pinfo *np = inet6_sk(sk);
191 struct in6_addr *first_hop = &fl->fl6_dst;
192 struct dst_entry *dst = skb->dst;
194 u8 proto = fl->proto;
195 int seg_len = skb->len;
200 unsigned int head_room;
202 /* First: exthdrs may take lots of space (~8K for now)
203 MAX_HEADER is not enough.
205 head_room = opt->opt_nflen + opt->opt_flen;
206 seg_len += head_room;
207 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
/* re-allocate headroom when the existing skb cannot hold all headers */
209 if (skb_headroom(skb) < head_room) {
210 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
212 IP6_INC_STATS(ip6_dst_idev(skb->dst),
213 IPSTATS_MIB_OUTDISCARDS);
220 skb_set_owner_w(skb, sk);
223 ipv6_push_frag_opts(skb, opt, &proto);
225 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
228 skb_push(skb, sizeof(struct ipv6hdr));
229 skb_reset_network_header(skb);
233 * Fill in the IPv6 header
/* hop limit: socket setting, then route metric, then device default */
238 hlimit = np->hop_limit;
240 hlimit = dst_metric(dst, RTAX_HOPLIMIT);
242 hlimit = ipv6_get_hoplimit(dst->dev);
/* version 6 | traffic class | flow label in the first 32 bits */
250 *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
252 hdr->payload_len = htons(seg_len);
253 hdr->nexthdr = proto;
254 hdr->hop_limit = hlimit;
256 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
257 ipv6_addr_copy(&hdr->daddr, first_hop);
259 skb->priority = sk->sk_priority;
260 skb->mark = sk->sk_mark;
263 if ((skb->len <= mtu) || ipfragok || skb_is_gso(skb)) {
264 IP6_INC_STATS(ip6_dst_idev(skb->dst),
265 IPSTATS_MIB_OUTREQUESTS);
266 return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
/* too big and not allowed to fragment: bounce an ICMPv6 error locally */
271 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
273 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
274 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
279 EXPORT_SYMBOL(ip6_xmit);
282 * To avoid extra problems ND packets are sent through this
283 * routine. It's code duplication but I really want to avoid
284 * extra checks since ipv6_build_header is used by TCP (which
285 * is for us performance critical)
/*
 * ip6_nd_hdr - build a minimal IPv6 header for a neighbour-discovery
 * packet: fixed traffic class/flow label of 0, hop limit from the socket,
 * addresses supplied by the caller.
 * NOTE(review): extract omits some lines (hdr assignment, totlen use).
 */
288 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
289 struct in6_addr *saddr, struct in6_addr *daddr,
292 struct ipv6_pinfo *np = inet6_sk(sk);
296 skb->protocol = htons(ETH_P_IPV6);
299 totlen = len + sizeof(struct ipv6hdr);
301 skb_reset_network_header(skb);
302 skb_put(skb, sizeof(struct ipv6hdr));
305 *(__be32*)hdr = htonl(0x60000000); /* version 6, tclass/flowlabel 0 */
307 hdr->payload_len = htons(len);
308 hdr->nexthdr = proto;
309 hdr->hop_limit = np->hop_limit;
311 ipv6_addr_copy(&hdr->saddr, saddr);
312 ipv6_addr_copy(&hdr->daddr, daddr);
/*
 * ip6_call_ra_chain - deliver a Router-Alert packet to every raw socket
 * registered on ip6_ra_chain with a matching @sel value (and matching
 * bound device, if any).  Clones are delivered to all but the last
 * matching socket; the original skb goes to the last one.
 * NOTE(review): extract omits lines between the clone and final delivery
 * (the "last" bookkeeping); return values are also not visible.
 */
317 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
319 struct ip6_ra_chain *ra;
320 struct sock *last = NULL;
322 read_lock(&ip6_ra_lock);
323 for (ra = ip6_ra_chain; ra; ra = ra->next) {
324 struct sock *sk = ra->sk;
325 if (sk && ra->sel == sel &&
326 (!sk->sk_bound_dev_if ||
327 sk->sk_bound_dev_if == skb->dev->ifindex)) {
329 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
331 rawv6_rcv(last, skb2);
338 rawv6_rcv(last, skb);
339 read_unlock(&ip6_ra_lock);
342 read_unlock(&ip6_ra_lock);
/*
 * ip6_forward_proxy_check - decide how a packet destined to a proxied
 * address must be handled: skip extension headers, and for ICMPv6 ND
 * messages let the local stack process them; link-local destinations
 * cannot be proxied and trigger a link failure.
 * NOTE(review): return values and several branches are missing from this
 * extract.
 */
346 static int ip6_forward_proxy_check(struct sk_buff *skb)
348 struct ipv6hdr *hdr = ipv6_hdr(skb);
349 u8 nexthdr = hdr->nexthdr;
352 if (ipv6_ext_hdr(nexthdr)) {
353 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
357 offset = sizeof(struct ipv6hdr);
359 if (nexthdr == IPPROTO_ICMPV6) {
360 struct icmp6hdr *icmp6;
/* make sure at least the icmp6 type byte is in the linear area */
362 if (!pskb_may_pull(skb, (skb_network_header(skb) +
363 offset + 1 - skb->data)))
366 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
368 switch (icmp6->icmp6_type) {
369 case NDISC_ROUTER_SOLICITATION:
370 case NDISC_ROUTER_ADVERTISEMENT:
371 case NDISC_NEIGHBOUR_SOLICITATION:
372 case NDISC_NEIGHBOUR_ADVERTISEMENT:
374 /* For reaction involving unicast neighbor discovery
375 * message destined to the proxied address, pass it to
385 * The proxying router can't forward traffic sent to a link-local
386 * address, so signal the sender and discard the packet. This
387 * behavior is clarified by the MIPv6 specification.
389 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
390 dst_link_failure(skb);
/* NF_INET_FORWARD okfn: hand the forwarded packet on to dst_output(). */
397 static inline int ip6_forward_finish(struct sk_buff *skb)
399 return dst_output(skb);
/*
 * ip6_forward - forward-path processing for an IPv6 packet: policy checks,
 * Router-Alert delivery, hop-limit decrement, NDP proxying, redirect
 * generation, MTU check, and finally the NF_INET_FORWARD hook into
 * ip6_forward_finish().
 * NOTE(review): this extract is missing many lines (error labels, several
 * branch bodies, the hop-limit decrement itself); comments describe only
 * the visible code.
 */
402 int ip6_forward(struct sk_buff *skb)
404 struct dst_entry *dst = skb->dst;
405 struct ipv6hdr *hdr = ipv6_hdr(skb);
406 struct inet6_skb_parm *opt = IP6CB(skb);
/* forwarding disabled system-wide: bail out */
408 if (ipv6_devconf.forwarding == 0)
411 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
412 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
416 skb_forward_csum(skb);
419 * We DO NOT make any processing on
420 * RA packets, pushing them to user level AS IS
421 * without any WARRANTY that application will be able
422 * to interpret them. The reason is that we
423 * cannot make anything clever here.
425 * We are not end-node, so that if packet contains
426 * AH/ESP, we cannot make anything.
427 * Defragmentation also would be mistake, RA packets
428 * cannot be fragmented, because there is no warranty
429 * that different fragments will go along one path. --ANK
432 u8 *ptr = skb_network_header(skb) + opt->ra;
433 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
438 * check and decrement ttl
440 if (hdr->hop_limit <= 1) {
441 /* Force OUTPUT device used as source address */
443 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
445 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
451 /* XXX: idev->cnf.proxy_ndp? */
/* destination proxied on this box: maybe deliver locally instead */
452 if (ipv6_devconf.proxy_ndp &&
453 pneigh_lookup(&nd_tbl, &init_net, &hdr->daddr, skb->dev, 0)) {
454 int proxied = ip6_forward_proxy_check(skb);
456 return ip6_input(skb);
457 else if (proxied < 0) {
458 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
463 if (!xfrm6_route_forward(skb)) {
464 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
469 /* IPv6 specs say nothing about it, but it is clear that we cannot
470 send redirects to source routed frames.
471 We don't send redirects to frames decapsulated from IPsec.
473 if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
475 struct in6_addr *target = NULL;
477 struct neighbour *n = dst->neighbour;
480 * incoming and outgoing devices are the same
484 rt = (struct rt6_info *) dst;
485 if ((rt->rt6i_flags & RTF_GATEWAY))
486 target = (struct in6_addr*)&n->primary_key;
488 target = &hdr->daddr;
490 /* Limit redirects both by destination (here)
491 and by source (inside ndisc_send_redirect)
493 if (xrlim_allow(dst, 1*HZ))
494 ndisc_send_redirect(skb, n, target);
496 int addrtype = ipv6_addr_type(&hdr->saddr);
498 /* This check is security critical. */
499 if (addrtype & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK))
501 if (addrtype & IPV6_ADDR_LINKLOCAL) {
502 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
503 ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
508 if (skb->len > dst_mtu(dst)) {
509 /* Again, force OUTPUT device used as source address */
511 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
512 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
513 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
/* make the header writable before decrementing hop_limit */
518 if (skb_cow(skb, dst->dev->hard_header_len)) {
519 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
525 /* Mangling hops number delayed to point after skb COW */
529 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
530 return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
534 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
/*
 * ip6_copy_metadata - copy per-packet metadata (type, priority, protocol,
 * dst reference, mark, tc index, nf trace flag, secmark) from @from onto a
 * freshly built fragment @to.
 */
540 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
542 to->pkt_type = from->pkt_type;
543 to->priority = from->priority;
544 to->protocol = from->protocol;
/* drop any dst the new skb carried, then take a reference on from's */
545 dst_release(to->dst);
546 to->dst = dst_clone(from->dst);
548 to->mark = from->mark;
550 #ifdef CONFIG_NET_SCHED
551 to->tc_index = from->tc_index;
554 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
555 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
556 to->nf_trace = from->nf_trace;
558 skb_copy_secmark(to, from);
/*
 * ip6_find_1stfragopt - walk the extension-header chain to find the offset
 * at which a Fragment header must be inserted; *nexthdr is left pointing
 * at the nexthdr byte to be patched.
 * NOTE(review): the switch cases / loop exit conditions are largely
 * missing from this extract; only the traversal skeleton is visible.
 */
561 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
563 u16 offset = sizeof(struct ipv6hdr);
564 struct ipv6_opt_hdr *exthdr =
565 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
566 unsigned int packet_len = skb->tail - skb->network_header;
568 *nexthdr = &ipv6_hdr(skb)->nexthdr;
570 while (offset + 1 <= packet_len) {
576 case NEXTHDR_ROUTING:
580 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
/* a Home Address option forces the fragment header before dst opts */
581 if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
591 offset += ipv6_optlen(exthdr);
592 *nexthdr = &exthdr->nexthdr;
593 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
/*
 * ip6_fragment - split an oversized skb into MTU-sized IPv6 fragments and
 * emit each one through @output.  Two paths are visible: a fast path that
 * reuses an existing frag_list when its geometry already fits, and a slow
 * path that allocates and copies each fragment.
 * NOTE(review): this extract is missing many lines (error labels, loop
 * headers, several assignments); comments describe only visible code.
 */
600 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
602 struct net_device *dev;
603 struct sk_buff *frag;
604 struct rt6_info *rt = (struct rt6_info*)skb->dst;
605 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
606 struct ipv6hdr *tmp_hdr;
608 unsigned int mtu, hlen, left, len;
610 int ptr, offset = 0, err=0;
611 u8 *prevhdr, nexthdr = 0;
614 hlen = ip6_find_1stfragopt(skb, &prevhdr);
617 mtu = ip6_skb_dst_mtu(skb);
619 /* We must not fragment if the socket is set to force MTU discovery
620 * or if the skb is not generated by a local socket. (This last
621 * check should be redundant, but it's free.)
623 if (!skb->local_df) {
624 skb->dev = skb->dst->dev;
625 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
626 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
/* honour a smaller per-socket fragment size if configured */
631 if (np && np->frag_size < mtu) {
/* payload space per fragment, after unfragmentable part + frag header */
635 mtu -= hlen + sizeof(struct frag_hdr);
/* Fast path: the skb already carries a frag_list with usable geometry */
637 if (skb_shinfo(skb)->frag_list) {
638 int first_len = skb_pagelen(skb);
641 if (first_len - hlen > mtu ||
642 ((first_len - hlen) & 7) ||
646 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
647 /* Correct geometry. */
648 if (frag->len > mtu ||
649 ((frag->len & 7) && frag->next) ||
650 skb_headroom(frag) < hlen)
653 /* Partially cloned skb? */
654 if (skb_shared(frag))
661 frag->destructor = sock_wfree;
662 truesizes += frag->truesize;
/* detach the frag_list; each member becomes its own fragment */
668 frag = skb_shinfo(skb)->frag_list;
669 skb_shinfo(skb)->frag_list = NULL;
672 *prevhdr = NEXTHDR_FRAGMENT;
673 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
675 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
/* open a gap for the fragment header in the first fragment */
679 __skb_pull(skb, hlen);
680 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
681 __skb_push(skb, hlen);
682 skb_reset_network_header(skb);
683 memcpy(skb_network_header(skb), tmp_hdr, hlen);
685 ipv6_select_ident(skb, fh);
686 fh->nexthdr = nexthdr;
688 fh->frag_off = htons(IP6_MF);
689 frag_id = fh->identification;
691 first_len = skb_pagelen(skb);
692 skb->data_len = first_len - skb_headlen(skb);
693 skb->truesize -= truesizes;
694 skb->len = first_len;
695 ipv6_hdr(skb)->payload_len = htons(first_len -
696 sizeof(struct ipv6hdr));
698 dst_hold(&rt->u.dst);
701 /* Prepare header of the next frame,
702 * before previous one went down. */
704 frag->ip_summed = CHECKSUM_NONE;
705 skb_reset_transport_header(frag);
706 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
707 __skb_push(frag, hlen);
708 skb_reset_network_header(frag);
709 memcpy(skb_network_header(frag), tmp_hdr,
711 offset += skb->len - hlen - sizeof(struct frag_hdr);
712 fh->nexthdr = nexthdr;
714 fh->frag_off = htons(offset);
715 if (frag->next != NULL)
716 fh->frag_off |= htons(IP6_MF); /* more fragments follow */
717 fh->identification = frag_id;
718 ipv6_hdr(frag)->payload_len =
720 sizeof(struct ipv6hdr));
721 ip6_copy_metadata(frag, skb);
726 IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGCREATES);
739 IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGOKS);
740 dst_release(&rt->u.dst);
750 IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGFAILS);
751 dst_release(&rt->u.dst);
/* Slow path: allocate a new skb for every fragment and copy data in */
756 left = skb->len - hlen; /* Space per frame */
757 ptr = hlen; /* Where to start from */
760 * Fragment the datagram.
763 *prevhdr = NEXTHDR_FRAGMENT;
766 * Keep copying data until we run out.
770 /* IF: it doesn't fit, use 'mtu' - the data space left */
773 /* IF: we are not sending upto and including the packet end
774 then align the next start on an eight byte boundary */
782 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
783 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
784 IP6_INC_STATS(ip6_dst_idev(skb->dst),
785 IPSTATS_MIB_FRAGFAILS);
791 * Set up data on packet
794 ip6_copy_metadata(frag, skb);
795 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
796 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
797 skb_reset_network_header(frag);
798 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
799 frag->transport_header = (frag->network_header + hlen +
800 sizeof(struct frag_hdr));
803 * Charge the memory for the fragment to any owner
807 skb_set_owner_w(frag, skb->sk);
810 * Copy the packet header into the new buffer.
812 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
815 * Build fragment header.
817 fh->nexthdr = nexthdr;
820 ipv6_select_ident(skb, fh);
821 frag_id = fh->identification;
823 fh->identification = frag_id;
826 * Copy a block of the IP datagram.
828 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
832 fh->frag_off = htons(offset);
834 fh->frag_off |= htons(IP6_MF);
835 ipv6_hdr(frag)->payload_len = htons(frag->len -
836 sizeof(struct ipv6hdr));
842 * Put this fragment into the sending queue.
848 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGCREATES);
850 IP6_INC_STATS(ip6_dst_idev(skb->dst),
851 IPSTATS_MIB_FRAGOKS);
856 IP6_INC_STATS(ip6_dst_idev(skb->dst),
857 IPSTATS_MIB_FRAGFAILS);
/*
 * ip6_rt_check - return non-zero (route mismatch) when the cached route's
 * key neither matches the flow address as a /128 host route nor matches
 * the cached last-used address.
 */
862 static inline int ip6_rt_check(struct rt6key *rt_key,
863 struct in6_addr *fl_addr,
864 struct in6_addr *addr_cache)
866 return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
867 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
/*
 * ip6_sk_dst_check - validate a socket-cached dst against the flow:
 * destination (and, with subtrees, source) route keys and the outgoing
 * interface must still match, otherwise the cached dst is unusable.
 * NOTE(review): the release/return statements inside the mismatch branch
 * are missing from this extract.
 */
870 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
871 struct dst_entry *dst,
874 struct ipv6_pinfo *np = inet6_sk(sk);
875 struct rt6_info *rt = (struct rt6_info *)dst;
880 /* Yes, checking route validity in not connected
881 * case is not very simple. Take into account,
882 * that we do not support routing by source, TOS,
883 * and MSG_DONTROUTE --ANK (980726)
885 * 1. ip6_rt_check(): If route was host route,
886 * check that cached destination is current.
887 * If it is network route, we still may
888 * check its validity using saved pointer
889 * to the last used address: daddr_cache.
890 * We do not want to save whole address now,
891 * (because main consumer of this service
892 * is tcp, which has not this problem),
893 * so that the last trick works only on connected
895 * 2. oif also should be the same.
897 if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
898 #ifdef CONFIG_IPV6_SUBTREES
899 ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
901 (fl->oif && fl->oif != dst->dev->ifindex)) {
/*
 * ip6_dst_lookup_tail - perform the actual route lookup for @fl, pick a
 * source address if the flow has none, and (with optimistic DAD) redo the
 * lookup towards the default router when the chosen optimistic source has
 * an INCOMPLETE neighbour on the direct route.
 * NOTE(review): extract omits several lines (ifp release, error label
 * body); comments cover visible code only.
 */
910 static int ip6_dst_lookup_tail(struct sock *sk,
911 struct dst_entry **dst, struct flowi *fl)
916 *dst = ip6_route_output(sk, fl);
918 if ((err = (*dst)->error))
919 goto out_err_release;
/* no source address chosen yet: derive one from the found route */
921 if (ipv6_addr_any(&fl->fl6_src)) {
922 err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);
924 goto out_err_release;
927 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
929 * Here if the dst entry we've looked up
930 * has a neighbour entry that is in the INCOMPLETE
931 * state and the src address from the flow is
932 * marked as OPTIMISTIC, we release the found
933 * dst entry and replace it instead with the
934 * dst entry of the nexthop router
936 if (!((*dst)->neighbour->nud_state & NUD_VALID)) {
937 struct inet6_ifaddr *ifp;
941 ifp = ipv6_get_ifaddr(&init_net, &fl->fl6_src,
944 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
950 * We need to get the dst entry for the
951 * default router instead
954 memcpy(&fl_gw, fl, sizeof(struct flowi));
955 memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
956 *dst = ip6_route_output(sk, &fl_gw);
957 if ((err = (*dst)->error))
958 goto out_err_release;
966 if (err == -ENETUNREACH)
967 IP6_INC_STATS_BH(NULL, IPSTATS_MIB_OUTNOROUTES);
974 * ip6_dst_lookup - perform route lookup on flow
975 * @sk: socket which provides route info
976 * @dst: pointer to dst_entry * for result
977 * @fl: flow to lookup
979 * This function performs a route lookup on the given flow.
981 * It returns zero on success, or a standard errno code on error.
/* thin public wrapper around ip6_dst_lookup_tail() */
983 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
986 return ip6_dst_lookup_tail(sk, dst, fl);
988 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
991 * ip6_sk_dst_lookup - perform socket cached route lookup on flow
992 * @sk: socket which provides the dst cache and route info
993 * @dst: pointer to dst_entry * for result
994 * @fl: flow to lookup
996 * This function performs a route lookup on the given flow with the
997 * possibility of using the cached route in the socket if it is valid.
998 * It will take the socket dst lock when operating on the dst cache.
999 * As a result, this function can only be used in process context.
1001 * It returns zero on success, or a standard errno code on error.
/* try the socket's cached dst first, then fall back to a fresh lookup */
1003 int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1007 *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1008 *dst = ip6_sk_dst_check(sk, *dst, fl);
1011 return ip6_dst_lookup_tail(sk, dst, fl);
1013 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
/*
 * ip6_ufo_append_data - build a single large skb for UDP fragmentation
 * offload (UFO): the data goes into page frags, gso_size is set to the
 * per-fragment payload, and a fragmentation id is pre-selected so the
 * device can segment the datagram itself.
 * NOTE(review): extract omits some lines (error path after alloc, return
 * statements); comments cover visible code only.
 */
1015 static inline int ip6_ufo_append_data(struct sock *sk,
1016 int getfrag(void *from, char *to, int offset, int len,
1017 int odd, struct sk_buff *skb),
1018 void *from, int length, int hh_len, int fragheaderlen,
1019 int transhdrlen, int mtu,unsigned int flags)
1022 struct sk_buff *skb;
1025 /* There is support for UDP large send offload by network
1026 * device, so create one single skb packet containing complete
1029 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1030 skb = sock_alloc_send_skb(sk,
1031 hh_len + fragheaderlen + transhdrlen + 20,
1032 (flags & MSG_DONTWAIT), &err);
1036 /* reserve space for Hardware header */
1037 skb_reserve(skb, hh_len);
1039 /* create space for UDP/IP header */
1040 skb_put(skb,fragheaderlen + transhdrlen);
1042 /* initialize network header pointer */
1043 skb_reset_network_header(skb);
1045 /* initialize protocol header pointer */
1046 skb->transport_header = skb->network_header + fragheaderlen;
1048 skb->ip_summed = CHECKSUM_PARTIAL;
1050 sk->sk_sndmsg_off = 0;
1053 err = skb_append_datato_frags(sk,skb, getfrag, from,
1054 (length - transhdrlen));
1056 struct frag_hdr fhdr;
1058 /* specify the length of each IP datagram fragment*/
1059 skb_shinfo(skb)->gso_size = mtu - fragheaderlen -
1060 sizeof(struct frag_hdr);
1061 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1062 ipv6_select_ident(skb, &fhdr);
1063 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1064 __skb_queue_tail(&sk->sk_write_queue, skb);
1068 /* There is not enough support to do UDP LSO,
1069 * so follow normal path
/*
 * ip6_append_data - append user data to the socket's corked write queue,
 * growing existing skbs or allocating MTU-sized ones, so that
 * ip6_push_pending_frames() can later build and send the datagram(s).
 * On the first call for a cork it snapshots options, route, hop limit,
 * tclass and fragment size into the cork state.
 * NOTE(review): this extract is missing many lines (labels, several
 * declarations, else branches); comments describe only visible code.
 */
1076 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1077 int offset, int len, int odd, struct sk_buff *skb),
1078 void *from, int length, int transhdrlen,
1079 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
1080 struct rt6_info *rt, unsigned int flags)
1082 struct inet_sock *inet = inet_sk(sk);
1083 struct ipv6_pinfo *np = inet6_sk(sk);
1084 struct sk_buff *skb;
1085 unsigned int maxfraglen, fragheaderlen;
1092 int csummode = CHECKSUM_NONE;
1094 if (flags&MSG_PROBE)
/* first append for this cork: record options/route/limits in cork state */
1096 if (skb_queue_empty(&sk->sk_write_queue)) {
1101 if (np->cork.opt == NULL) {
1102 np->cork.opt = kmalloc(opt->tot_len,
1104 if (unlikely(np->cork.opt == NULL))
1106 } else if (np->cork.opt->tot_len < opt->tot_len) {
1107 printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
1110 memcpy(np->cork.opt, opt, opt->tot_len);
1111 inet->cork.flags |= IPCORK_OPT;
1112 /* need source address above miyazawa*/
1114 dst_hold(&rt->u.dst);
1116 inet->cork.fl = *fl;
1117 np->cork.hop_limit = hlimit;
1118 np->cork.tclass = tclass;
1119 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1120 rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
1121 if (np->frag_size < mtu) {
1123 mtu = np->frag_size;
1125 inet->cork.fragsize = mtu;
1126 if (dst_allfrag(rt->u.dst.path))
1127 inet->cork.flags |= IPCORK_ALLFRAG;
1128 inet->cork.length = 0;
1129 sk->sk_sndmsg_page = NULL;
1130 sk->sk_sndmsg_off = 0;
1131 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
1132 rt->rt6i_nfheader_len;
1133 length += exthdrlen;
1134 transhdrlen += exthdrlen;
/* subsequent appends reuse the flow/options stored in the cork */
1137 fl = &inet->cork.fl;
1138 if (inet->cork.flags & IPCORK_OPT)
1142 mtu = inet->cork.fragsize;
1145 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1147 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1148 (opt ? opt->opt_nflen : 0);
1149 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
/* reject messages that would exceed the maximum IPv6 payload */
1151 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1152 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1153 ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
1159 * Let's try using as much space as possible.
1160 * Use MTU if total length of the message fits into the MTU.
1161 * Otherwise, we need to reserve fragment header and
1162 * fragment alignment (= 8-15 octets, in total).
1164 * Note that we may need to "move" the data from the tail of
1165 * of the buffer to the new fragment when we split
1168 * FIXME: It may be fragmented into multiple chunks
1169 * at once if non-fragmentable extension headers
1174 inet->cork.length += length;
/* large UDP sends on UFO-capable devices take the offload path */
1175 if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
1176 (rt->u.dst.dev->features & NETIF_F_UFO)) {
1178 err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
1179 fragheaderlen, transhdrlen, mtu,
1186 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1189 while (length > 0) {
1190 /* Check if the remaining data fits into current packet. */
1191 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1193 copy = maxfraglen - skb->len;
1197 unsigned int datalen;
1198 unsigned int fraglen;
1199 unsigned int fraggap;
1200 unsigned int alloclen;
1201 struct sk_buff *skb_prev;
1205 /* There's no room in the current skb */
1207 fraggap = skb_prev->len - maxfraglen;
1212 * If remaining data exceeds the mtu,
1213 * we know we need more fragment(s).
1215 datalen = length + fraggap;
1216 if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1217 datalen = maxfraglen - fragheaderlen;
1219 fraglen = datalen + fragheaderlen;
1220 if ((flags & MSG_MORE) &&
1221 !(rt->u.dst.dev->features&NETIF_F_SG))
1224 alloclen = datalen + fragheaderlen;
1227 * The last fragment gets additional space at tail.
1228 * Note: we overallocate on fragments with MSG_MORE
1229 * because we have no idea if we're the last one.
1231 if (datalen == length + fraggap)
1232 alloclen += rt->u.dst.trailer_len;
1235 * We just reserve space for fragment header.
1236 * Note: this may be overallocation if the message
1237 * (without MSG_MORE) fits into the MTU.
1239 alloclen += sizeof(struct frag_hdr);
1242 skb = sock_alloc_send_skb(sk,
1244 (flags & MSG_DONTWAIT), &err);
1247 if (atomic_read(&sk->sk_wmem_alloc) <=
1249 skb = sock_wmalloc(sk,
1250 alloclen + hh_len, 1,
1252 if (unlikely(skb == NULL))
1258 * Fill in the control structures
1260 skb->ip_summed = csummode;
1262 /* reserve for fragmentation */
1263 skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1266 * Find where to start putting bytes
1268 data = skb_put(skb, fraglen);
1269 skb_set_network_header(skb, exthdrlen);
1270 data += fragheaderlen;
1271 skb->transport_header = (skb->network_header +
/* move the overhanging tail of the previous skb into this one */
1274 skb->csum = skb_copy_and_csum_bits(
1275 skb_prev, maxfraglen,
1276 data + transhdrlen, fraggap, 0);
1277 skb_prev->csum = csum_sub(skb_prev->csum,
1280 pskb_trim_unique(skb_prev, maxfraglen);
1282 copy = datalen - transhdrlen - fraggap;
1287 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1294 length -= datalen - fraggap;
/* only the first skb of a datagram can use hardware checksumming */
1297 csummode = CHECKSUM_NONE;
1300 * Put the packet on the pending queue
1302 __skb_queue_tail(&sk->sk_write_queue, skb);
/* no scatter-gather: copy straight into the skb's linear area */
1309 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1313 if (getfrag(from, skb_put(skb, copy),
1314 offset, copy, off, skb) < 0) {
1315 __skb_trim(skb, off);
/* scatter-gather: append data into page fragments */
1320 int i = skb_shinfo(skb)->nr_frags;
1321 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1322 struct page *page = sk->sk_sndmsg_page;
1323 int off = sk->sk_sndmsg_off;
1326 if (page && (left = PAGE_SIZE - off) > 0) {
1329 if (page != frag->page) {
1330 if (i == MAX_SKB_FRAGS) {
1335 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1336 frag = &skb_shinfo(skb)->frags[i];
1338 } else if(i < MAX_SKB_FRAGS) {
1339 if (copy > PAGE_SIZE)
1341 page = alloc_pages(sk->sk_allocation, 0);
1346 sk->sk_sndmsg_page = page;
1347 sk->sk_sndmsg_off = 0;
1349 skb_fill_page_desc(skb, i, page, 0, 0);
1350 frag = &skb_shinfo(skb)->frags[i];
1355 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1359 sk->sk_sndmsg_off += copy;
1362 skb->data_len += copy;
1363 skb->truesize += copy;
1364 atomic_add(copy, &sk->sk_wmem_alloc);
/* error path: undo the length accounting and count the discard */
1371 inet->cork.length -= length;
1372 IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
/*
 * ip6_cork_release - free all cork state: cached options, the held route
 * reference, the ALLFRAG flag, and the saved flow.
 */
1376 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1378 inet->cork.flags &= ~IPCORK_OPT;
1379 kfree(np->cork.opt);
1380 np->cork.opt = NULL;
1382 dst_release(&np->cork.rt->u.dst);
1384 inet->cork.flags &= ~IPCORK_ALLFRAG;
1386 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
/*
 * ip6_push_pending_frames - coalesce the corked write queue into one skb
 * (head + frag_list), prepend extension headers and the IPv6 header from
 * the cork state, update stats, and send it via ip6_local_out().
 * NOTE(review): extract omits some lines (labels, payload_len setup for
 * the jumbo/normal case); comments cover visible code only.
 */
1389 int ip6_push_pending_frames(struct sock *sk)
1391 struct sk_buff *skb, *tmp_skb;
1392 struct sk_buff **tail_skb;
1393 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1394 struct inet_sock *inet = inet_sk(sk);
1395 struct ipv6_pinfo *np = inet6_sk(sk);
1396 struct ipv6hdr *hdr;
1397 struct ipv6_txoptions *opt = np->cork.opt;
1398 struct rt6_info *rt = np->cork.rt;
1399 struct flowi *fl = &inet->cork.fl;
1400 unsigned char proto = fl->proto;
1403 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1405 tail_skb = &(skb_shinfo(skb)->frag_list);
1407 /* move skb->data to ip header from ext header */
1408 if (skb->data < skb_network_header(skb))
1409 __skb_pull(skb, skb_network_offset(skb));
/* chain the remaining queued skbs onto the head's frag_list */
1410 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1411 __skb_pull(tmp_skb, skb_network_header_len(skb));
1412 *tail_skb = tmp_skb;
1413 tail_skb = &(tmp_skb->next);
1414 skb->len += tmp_skb->len;
1415 skb->data_len += tmp_skb->len;
1416 skb->truesize += tmp_skb->truesize;
1417 __sock_put(tmp_skb->sk);
1418 tmp_skb->destructor = NULL;
1422 /* Allow local fragmentation. */
1423 if (np->pmtudisc < IPV6_PMTUDISC_DO)
1426 ipv6_addr_copy(final_dst, &fl->fl6_dst);
1427 __skb_pull(skb, skb_network_header_len(skb));
1428 if (opt && opt->opt_flen)
1429 ipv6_push_frag_opts(skb, opt, &proto);
1430 if (opt && opt->opt_nflen)
1431 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1433 skb_push(skb, sizeof(struct ipv6hdr));
1434 skb_reset_network_header(skb);
1435 hdr = ipv6_hdr(skb);
/* version 6 | cork tclass | flow label */
1437 *(__be32*)hdr = fl->fl6_flowlabel |
1438 htonl(0x60000000 | ((int)np->cork.tclass << 20));
1440 hdr->hop_limit = np->cork.hop_limit;
1441 hdr->nexthdr = proto;
1442 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1443 ipv6_addr_copy(&hdr->daddr, final_dst);
1445 skb->priority = sk->sk_priority;
1446 skb->mark = sk->sk_mark;
1448 skb->dst = dst_clone(&rt->u.dst);
1449 IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
1450 if (proto == IPPROTO_ICMPV6) {
1451 struct inet6_dev *idev = ip6_dst_idev(skb->dst);
1453 ICMP6MSGOUT_INC_STATS_BH(idev, icmp6_hdr(skb)->icmp6_type);
1454 ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS);
1457 err = ip6_local_out(skb);
1460 err = np->recverr ? net_xmit_errno(err) : 0;
1466 ip6_cork_release(inet, np);
/*
 * ip6_flush_pending_frames - drop every queued skb on the socket's write
 * queue (counting each as an OUTDISCARDS) and release the cork state.
 */
1472 void ip6_flush_pending_frames(struct sock *sk)
1474 struct sk_buff *skb;
1476 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1478 IP6_INC_STATS(ip6_dst_idev(skb->dst),
1479 IPSTATS_MIB_OUTDISCARDS);
1483 ip6_cork_release(inet_sk(sk), inet6_sk(sk));