 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system. INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 * The Internet Protocol (IP) output module.
 *
 *	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *	Donald Becker, <becker@super.org>
 *	Alan Cox, <Alan.Cox@linux.org>
 *	Stefan Becker, <stefanb@yello.ping.de>
 *	Jorge Cwik, <jorge@laser.satlink.net>
 *	Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *	Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *	See ip_input.c for original log
 *
 *	Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *	Mike Kilburn	:	htons() missing in ip_build_xmit.
 *	Bradford Johnson:	Fix faulty handling of some frames when
 *				no route is found.
 *	Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *				(in case the packet is not accepted by
 *				 the output firewall rules)
 *	Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov:	use new route cache
 *	Andi Kleen	:	Fix broken PMTU recovery and remove
 *				some redundant tests.
 *	Vitaly E. Lavrov:	Transparent proxy revived after year coma.
 *	Andi Kleen	:	Replace ip_reply with ip_send_reply.
 *	Andi Kleen	:	Split fast and slow ip_build_xmit path
 *				for decreased register pressure on x86
 *				and more readability.
 *	Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *				silently drop skb instead of failing with -EPERM.
 *	Detlev Wengorz	:	Copy protocol for fragments.
 *	Hirokazu Takahashi:	HW checksumming for outgoing UDP
 *				datagrams.
 *	Hirokazu Takahashi:	sendfile() on UDP works now.
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/slab.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/protocol.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>
int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
EXPORT_SYMBOL(sysctl_ip_default_ttl);

/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
        iph->check = 0;
        iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
EXPORT_SYMBOL(ip_send_check);
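/*
 * Finalize the IP header (total length and checksum) and run the packet
 * through the NF_INET_LOCAL_OUT netfilter hook, with dst_output() as the
 * continuation invoked when the hook chain accepts the packet.
 */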
int __ip_local_out(struct sk_buff *skb)
        struct iphdr *iph = ip_hdr(skb);

        iph->tot_len = htons(skb->len);
        ip_send_check(iph);
        return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
                       skb_dst(skb)->dev, dst_output);

int ip_local_out(struct sk_buff *skb)
        err = __ip_local_out(skb);
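        /* A return value of 1 means the LOCAL_OUT hook chain accepted the
         * packet without stealing or queueing it, so we must continue and
         * invoke dst_output() ourselves. */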
        if (likely(err == 1))
                err = dst_output(skb);

EXPORT_SYMBOL_GPL(ip_local_out);
/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
        skb_reset_mac_header(newskb);
        __skb_pull(newskb, skb_network_offset(newskb));
        newskb->pkt_type = PACKET_LOOPBACK;
        newskb->ip_summed = CHECKSUM_UNNECESSARY;
        WARN_ON(!skb_dst(newskb));
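/*
 * Pick the TTL for a unicast packet: the per-socket value (uc_ttl) when
 * the user has set one, otherwise the route's default hop limit.
 */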
static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
        int ttl = inet->uc_ttl;

                ttl = ip4_dst_hoplimit(dst);
/*
 *	Add an ip header to a skbuff and send it out.
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
                          __be32 saddr, __be32 daddr, struct ip_options *opt)
        struct inet_sock *inet = inet_sk(sk);
        struct rtable *rt = skb_rtable(skb);

        /* Build the IP header. */
        skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
        skb_reset_network_header(skb);

        iph->tos = inet->tos;
        if (ip_dont_fragment(sk, &rt->dst))
                iph->frag_off = htons(IP_DF);
        iph->ttl = ip_select_ttl(inet, &rt->dst);
        iph->daddr = rt->rt_dst;
        iph->saddr = rt->rt_src;
        iph->protocol = sk->sk_protocol;
        ip_select_ident(iph, &rt->dst, sk);

        if (opt && opt->optlen) {
                iph->ihl += opt->optlen>>2;
                ip_options_build(skb, opt, daddr, rt, 0);
        }

        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;

        return ip_local_out(skb);
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
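/*
 * Last step before handing the packet to the device: update the
 * multicast/broadcast counters, grow the headroom if the link layer
 * needs more than is available, and pass the skb to the neighbour
 * subsystem (cached hardware header if one exists, otherwise the
 * neighbour's output function).
 */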
static inline int ip_finish_output2(struct sk_buff *skb)
        struct dst_entry *dst = skb_dst(skb);
        struct rtable *rt = (struct rtable *)dst;
        struct net_device *dev = dst->dev;
        unsigned int hh_len = LL_RESERVED_SPACE(dev);

        if (rt->rt_type == RTN_MULTICAST) {
                IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
        } else if (rt->rt_type == RTN_BROADCAST)
                IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);

        /* Be paranoid, rather than too clever. */
        if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
                struct sk_buff *skb2;

                skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
                skb_set_owner_w(skb2, skb->sk);

                return neigh_hh_output(dst->hh, skb);
        else if (dst->neighbour)
                return dst->neighbour->output(skb);

        printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
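/*
 * MTU to compare outgoing packets against: with IP_PMTUDISC_PROBE the
 * socket deliberately ignores the cached path MTU and uses the raw
 * device MTU, so that it can probe for a larger path MTU.
 */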
static inline int ip_skb_dst_mtu(struct sk_buff *skb)
        struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;

        return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
               skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
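/*
 * POST_ROUTING continuation: re-run dst_output() when a new xfrm policy
 * was attached after SNAT, fragment oversized non-GSO packets, and
 * otherwise fall through to ip_finish_output2(). GSO packets are left
 * alone here; they are segmented into MTU-sized frames further down
 * the stack.
 */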
static int ip_finish_output(struct sk_buff *skb)
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
        /* Policy lookup after SNAT yielded a new policy */
        if (skb_dst(skb)->xfrm != NULL) {
                IPCB(skb)->flags |= IPSKB_REROUTED;
                return dst_output(skb);
        }
#endif
        if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
                return ip_fragment(skb, ip_finish_output2);
        else
                return ip_finish_output2(skb);
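/*
 * Multicast/broadcast transmit: clones of the skb are looped back
 * through the POST_ROUTING hook to ip_dev_loopback_xmit() for local
 * delivery where required, while the original continues to
 * ip_finish_output() unless an earlier hook already rerouted it.
 */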
int ip_mc_output(struct sk_buff *skb)
        struct sock *sk = skb->sk;
        struct rtable *rt = skb_rtable(skb);
        struct net_device *dev = rt->dst.dev;

        /*
         *	If the indicated interface is up and running, send the packet.
         */
        IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

        skb->protocol = htons(ETH_P_IP);

        /*
         *	Multicasts are looped back for other local users
         */

        if (rt->rt_flags&RTCF_MULTICAST) {
#ifdef CONFIG_IP_MROUTE
                /* Small optimization: do not loop back non-local frames
                   that returned after forwarding; they will be dropped
                   by ip_mr_input in any case.
                   Note that local frames are looped back to be delivered
                   to local recipients.

                   This check is duplicated in ip_mr_input at the moment.
                 */
                    ((rt->rt_flags & RTCF_LOCAL) ||
                     !(IPCB(skb)->flags & IPSKB_FORWARDED))

                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
                                NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
                                        newskb, NULL, newskb->dev,
                                        ip_dev_loopback_xmit);

                /* Multicasts with ttl 0 must not go beyond the host */

                if (ip_hdr(skb)->ttl == 0) {

        if (rt->rt_flags&RTCF_BROADCAST) {
                struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
                        NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
                                NULL, newskb->dev, ip_dev_loopback_xmit);
        }

        return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
                            skb->dev, ip_finish_output,
                            !(IPCB(skb)->flags & IPSKB_REROUTED));
int ip_output(struct sk_buff *skb)
        struct net_device *dev = skb_dst(skb)->dev;

        IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

        skb->protocol = htons(ETH_P_IP);

        return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
                            ip_finish_output,
                            !(IPCB(skb)->flags & IPSKB_REROUTED));
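/*
 * Transmit path used by connection-oriented transports such as TCP:
 * reuse the route already attached to the skb (or cached on the socket),
 * look a new one up from the flow if neither exists, then build the IP
 * header and push the packet out via ip_local_out().
 */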
int ip_queue_xmit(struct sk_buff *skb)
        struct sock *sk = skb->sk;
        struct inet_sock *inet = inet_sk(sk);
        struct ip_options *opt = inet->opt;

        /* Skip all of this if the packet is already routed,
         * e.g. by something like SCTP.
         */
        rt = skb_rtable(skb);

        /* Make sure we can route this packet. */
        rt = (struct rtable *)__sk_dst_check(sk, 0);

                /* Use the correct destination address if we have options. */
                daddr = inet->inet_daddr;

                        struct flowi fl = { .oif = sk->sk_bound_dev_if,
                                            .fl4_src = inet->inet_saddr,
                                            .fl4_tos = RT_CONN_FLAGS(sk),
                                            .proto = sk->sk_protocol,
                                            .flags = inet_sk_flowi_flags(sk),
                                            .fl_ip_sport = inet->inet_sport,
                                            .fl_ip_dport = inet->inet_dport };

                        /* If this fails, the transport layer's retransmit
                         * mechanism will keep trying until the route appears
                         * or the connection times itself out.
                         */
                        security_sk_classify_flow(sk, &fl);
                        rt = ip_route_output_flow(sock_net(sk), &fl, sk);

                sk_setup_caps(sk, &rt->dst);

        skb_dst_set_noref(skb, &rt->dst);

        if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)

        /* OK, we know where to send it, allocate and build IP header. */
        skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
        skb_reset_network_header(skb);

        *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
        if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
                iph->frag_off = htons(IP_DF);
        iph->ttl = ip_select_ttl(inet, &rt->dst);
        iph->protocol = sk->sk_protocol;
        iph->saddr = rt->rt_src;
        iph->daddr = rt->rt_dst;
        /* The transport layer sets skb->h.foo itself. */

        if (opt && opt->optlen) {
                iph->ihl += opt->optlen >> 2;
                ip_options_build(skb, opt, inet->inet_daddr, rt, 0);
        }

        ip_select_ident_more(iph, &rt->dst, sk,
                             (skb_shinfo(skb)->gso_segs ?: 1) - 1);

        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;

        res = ip_local_out(skb);

        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
        return -EHOSTUNREACH;
EXPORT_SYMBOL(ip_queue_xmit);
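/*
 * Copy the per-packet metadata that every fragment must share with the
 * original skb: packet type, priority, protocol, dst, mark, and the
 * conditional netfilter/QoS/IPVS state below.
 */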
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        skb_dst_copy(to, from);
        to->mark = from->mark;

        /* Copy the flags to each fragment. */
        IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
        to->nf_trace = from->nf_trace;
#endif
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
        to->ipvs_property = from->ipvs_property;
#endif
        skb_copy_secmark(to, from);
/*
 *	This IP datagram is too large to be sent in one piece. Break it up
 *	into smaller pieces (each consisting of an IP header plus a block of
 *	the original datagram's data) that will still fit in a single device
 *	frame, and queue such a frame for sending.
 */
int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
        struct net_device *dev;
        struct sk_buff *skb2;
        unsigned int mtu, hlen, left, len, ll_rs;
        __be16 not_last_frag;
        struct rtable *rt = skb_rtable(skb);

        /*
         *	Point into the IP datagram header.
         */
        if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
                IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
                          htonl(ip_skb_dst_mtu(skb)));

        /*
         *	Set up starting values.
         */
        mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */
#ifdef CONFIG_BRIDGE_NETFILTER
                mtu -= nf_bridge_mtu_reduction(skb);
#endif
        IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;

        /* When frag_list is given, use it. First, check its validity:
         * some transformers could create a wrong frag_list or break an
         * existing one; that is not prohibited. In this case fall back
         * to copying.
         *
         * LATER: this step can be merged into the real generation of
         * fragments; we can switch to copying when we see the first
         * bad fragment.
         */
        if (skb_has_frag_list(skb)) {
                struct sk_buff *frag, *frag2;
                int first_len = skb_pagelen(skb);

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||

                skb_walk_frags(skb, frag) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < hlen)
                                goto slow_path_clean;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path_clean;

                                frag->destructor = sock_wfree;
                        skb->truesize -= frag->truesize;

                /* Everything is OK. Generate! */

                frag = skb_shinfo(skb)->frag_list;
                skb_frag_list_init(skb);
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                iph->tot_len = htons(first_len);
                iph->frag_off = htons(IP_MF);

                        /* Prepare the header of the next frame,
                         * before the previous one goes down. */
                                frag->ip_summed = CHECKSUM_NONE;
                                skb_reset_transport_header(frag);
                                __skb_push(frag, hlen);
                                skb_reset_network_header(frag);
                                memcpy(skb_network_header(frag), iph, hlen);
                                iph->tot_len = htons(frag->len);
                                ip_copy_metadata(frag, skb);
                                        ip_options_fragment(frag);
                                offset += skb->len - hlen;
                                iph->frag_off = htons(offset>>3);
                                if (frag->next != NULL)
                                        iph->frag_off |= htons(IP_MF);
                                /* Ready, complete checksum */

                        IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);

                IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);

                IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);

                skb_walk_frags(skb, frag2) {
                        frag2->destructor = NULL;
                        skb->truesize += frag2->truesize;

        left = skb->len - hlen;		/* Space per frame */
        ptr = hlen;			/* Where to start from */

        /* for bridged IP traffic encapsulated inside e.g. a vlan header,
         * we need to make room for the encapsulating header
         */
        ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));

        /*
         *	Fragment the datagram.
         */
        offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
        not_last_frag = iph->frag_off & htons(IP_MF);

        /*
         *	Keep copying data until we run out.
         */
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                /* IF: we are not sending up to and including the packet end
                   then align the next start on an eight byte boundary */
                if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
                        NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");

                /*
                 *	Set up data on packet
                 */
                ip_copy_metadata(skb2, skb);
                skb_reserve(skb2, ll_rs);
                skb_put(skb2, len + hlen);
                skb_reset_network_header(skb2);
                skb2->transport_header = skb2->network_header + hlen;

                /*
                 *	Charge the memory for the fragment to any owner
                 */
                        skb_set_owner_w(skb2, skb->sk);

                /*
                 *	Copy the packet header into the new buffer.
                 */
                skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

                /*
                 *	Copy a block of the IP datagram.
                 */
                if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))

                /*
                 *	Fill in the new header fields.
                 */
                iph->frag_off = htons((offset >> 3));

                /* ANK: dirty, but effective trick. Upgrade options only if
                 * the segment to be fragmented was THE FIRST (otherwise,
                 * options are already fixed) and make it ONCE
                 * on the initial skb, so that all the following fragments
                 * will inherit fixed options.
                 */
                        ip_options_fragment(skb);

                /*
                 *	Added AC: If we are fragmenting a fragment that's not
                 *	the last fragment then keep the MF bit set on each
                 *	fragment.
                 */
                if (left > 0 || not_last_frag)
                        iph->frag_off |= htons(IP_MF);

                /*
                 *	Put this fragment into the sending queue.
                 */
                iph->tot_len = htons(len + hlen);

                IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);

        IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);

        IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
EXPORT_SYMBOL(ip_fragment);
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
        struct iovec *iov = from;

        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                if (memcpy_fromiovecend(to, iov, offset, len) < 0)

                if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
                skb->csum = csum_block_add(skb->csum, csum, odd);
EXPORT_SYMBOL(ip_generic_getfrag);
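/*
 * Checksum 'copy' bytes of page data at the given offset; the page may
 * live in highmem, so it is temporarily mapped for the computation.
 */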
csum_page(struct page *page, int offset, int copy)
        csum = csum_partial(kaddr + offset, copy, 0);
static inline int ip_ufo_append_data(struct sock *sk,
                        struct sk_buff_head *queue,
                        int getfrag(void *from, char *to, int offset, int len,
                                    int odd, struct sk_buff *skb),
                        void *from, int length, int hh_len, int fragheaderlen,
                        int transhdrlen, int mtu, unsigned int flags)
        /* There is support for UDP fragmentation offload by the network
         * device, so create one single skb packet containing the complete
         * UDP datagram.
         */
        if ((skb = skb_peek_tail(queue)) == NULL) {
                skb = sock_alloc_send_skb(sk,
                                          hh_len + fragheaderlen + transhdrlen + 20,
                                          (flags & MSG_DONTWAIT), &err);

                /* reserve space for the hardware header */
                skb_reserve(skb, hh_len);

                /* create space for the UDP/IP header */
                skb_put(skb, fragheaderlen + transhdrlen);

                /* initialize the network header pointer */
                skb_reset_network_header(skb);

                /* initialize the protocol header pointer */
                skb->transport_header = skb->network_header + fragheaderlen;

                skb->ip_summed = CHECKSUM_PARTIAL;

                /* specify the length of each IP datagram fragment */
                skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
                skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
                __skb_queue_tail(queue, skb);

        return skb_append_datato_frags(sk, skb, getfrag, from,
                                       (length - transhdrlen));
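/*
 * Worker behind ip_append_data() and ip_make_skb(): append 'length'
 * bytes from the caller-supplied getfrag() source to the queue of
 * pending fragments, growing the tail skb or allocating new ones so
 * that each pending skb becomes a ready-to-send IP fragment.
 */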
static int __ip_append_data(struct sock *sk, struct sk_buff_head *queue,
                            struct inet_cork *cork,
                            int getfrag(void *from, char *to, int offset,
                                        int len, int odd, struct sk_buff *skb),
                            void *from, int length, int transhdrlen,
        struct inet_sock *inet = inet_sk(sk);
        struct ip_options *opt = cork->opt;
        unsigned int maxfraglen, fragheaderlen;
        int csummode = CHECKSUM_NONE;
        struct rtable *rt = (struct rtable *)cork->dst;

        exthdrlen = transhdrlen ? rt->dst.header_len : 0;
        transhdrlen += exthdrlen;
        mtu = cork->fragsize;

        hh_len = LL_RESERVED_SPACE(rt->dst.dev);

        fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

        if (cork->length + length > 0xFFFF - fragheaderlen) {
                ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,

        /*
         * transhdrlen > 0 means that this is the first fragment and we wish
         * it not to be fragmented later.
         */
            length + fragheaderlen <= mtu &&
            rt->dst.dev->features & NETIF_F_V4_CSUM &&
                csummode = CHECKSUM_PARTIAL;

        skb = skb_peek_tail(queue);

        cork->length += length;
        if (((length > mtu) || (skb && skb_is_gso(skb))) &&
            (sk->sk_protocol == IPPROTO_UDP) &&
            (rt->dst.dev->features & NETIF_F_UFO)) {
                err = ip_ufo_append_data(sk, queue, getfrag, from, length,
                                         hh_len, fragheaderlen, transhdrlen,

        /* So, what's going on in the loop below?
         *
         * We use the calculated fragment length to generate a chained skb;
         * each segment is an IP fragment ready for sending to the network
         * after an appropriate IP header is added.
         */

                /* Check if the remaining data fits into the current packet. */
                copy = mtu - skb->len;
                        copy = maxfraglen - skb->len;

                        unsigned int datalen;
                        unsigned int fraglen;
                        unsigned int fraggap;
                        unsigned int alloclen;
                        struct sk_buff *skb_prev;

                                fraggap = skb_prev->len - maxfraglen;

                        /*
                         * If the remaining data exceeds the mtu,
                         * we know we need more fragment(s).
                         */
                        datalen = length + fraggap;
                        if (datalen > mtu - fragheaderlen)
                                datalen = maxfraglen - fragheaderlen;
                        fraglen = datalen + fragheaderlen;

                        if ((flags & MSG_MORE) &&
                            !(rt->dst.dev->features&NETIF_F_SG))

                        /* The last fragment gets additional space at the tail.
                         * Note: with MSG_MORE we overallocate on fragments,
                         * because we have no idea which fragment will be
                         */
                        if (datalen == length + fraggap) {
                                alloclen += rt->dst.trailer_len;
                                /* make sure mtu is not reached */
                                if (datalen > mtu - fragheaderlen - rt->dst.trailer_len)
                                        datalen -= ALIGN(rt->dst.trailer_len, 8);

                                skb = sock_alloc_send_skb(sk,
                                                          alloclen + hh_len + 15,
                                                          (flags & MSG_DONTWAIT), &err);

                                if (atomic_read(&sk->sk_wmem_alloc) <=
                                        skb = sock_wmalloc(sk,
                                                           alloclen + hh_len + 15, 1,
                                if (unlikely(skb == NULL))
                                        /* only the initial fragment is
                                           time stamped */

                        /*
                         *	Fill in the control structures
                         */
                        skb->ip_summed = csummode;

                        skb_reserve(skb, hh_len);
                        skb_shinfo(skb)->tx_flags = cork->tx_flags;

                        /*
                         *	Find where to start putting bytes.
                         */
                        data = skb_put(skb, fraglen);
                        skb_set_network_header(skb, exthdrlen);
                        skb->transport_header = (skb->network_header +
                        data += fragheaderlen;

                                skb->csum = skb_copy_and_csum_bits(
                                        skb_prev, maxfraglen,
                                        data + transhdrlen, fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                pskb_trim_unique(skb_prev, maxfraglen);

                        copy = datalen - transhdrlen - fraggap;
                        if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {

                        length -= datalen - fraggap;
                        csummode = CHECKSUM_NONE;

                        /*
                         *	Put the packet on the pending queue.
                         */
                        __skb_queue_tail(queue, skb);

                if (!(rt->dst.dev->features&NETIF_F_SG)) {
                        if (getfrag(from, skb_put(skb, copy),
                                    offset, copy, off, skb) < 0) {
                                __skb_trim(skb, off);

                        int i = skb_shinfo(skb)->nr_frags;
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
                        struct page *page = cork->page;

                        if (page && (left = PAGE_SIZE - off) > 0) {
                                if (page != frag->page) {
                                        if (i == MAX_SKB_FRAGS) {
                                        skb_fill_page_desc(skb, i, page, off, 0);
                                        frag = &skb_shinfo(skb)->frags[i];
                        } else if (i < MAX_SKB_FRAGS) {
                                if (copy > PAGE_SIZE)
                                page = alloc_pages(sk->sk_allocation, 0);
                                skb_fill_page_desc(skb, i, page, 0, 0);
                                frag = &skb_shinfo(skb)->frags[i];

                        if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {

                        skb->data_len += copy;
                        skb->truesize += copy;
                        atomic_add(copy, &sk->sk_wmem_alloc);

        cork->length -= length;
        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
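/*
 * Prime the cork for a new datagram: duplicate any IP options into
 * cork->opt, record the fragment size to use (the device MTU when the
 * socket is probing with IP_PMTUDISC_PROBE, the path MTU otherwise),
 * and take over the caller's reference on the route.
 */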
static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
                         struct ipcm_cookie *ipc, struct rtable **rtp)
        struct inet_sock *inet = inet_sk(sk);
        struct ip_options *opt;

        /*
         * setup for corking.
         */
                if (cork->opt == NULL) {
                        cork->opt = kmalloc(sizeof(struct ip_options) + 40,
                        if (unlikely(cork->opt == NULL))
                memcpy(cork->opt, opt, sizeof(struct ip_options) + opt->optlen);
                cork->flags |= IPCORK_OPT;
                cork->addr = ipc->addr;

        /*
         * We steal a reference to this route; the caller should not release it
         */
        cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
                         rt->dst.dev->mtu : dst_mtu(rt->dst.path);
        cork->dst = &rt->dst;
        cork->tx_flags = ipc->tx_flags;
/*
 *	ip_append_data() and ip_append_page() can make one large IP datagram
 *	from many pieces of data. Each piece will be held on the socket
 *	until ip_push_pending_frames() is called. Each piece can be a page
 *	or non-page data.
 *
 *	Not only UDP: other transport protocols, e.g. raw sockets, can
 *	potentially use this interface as well.
 *
 *	LATER: length must be adjusted by the pad at tail, when it is required.
 */
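/*
 * Typical call sequence (a sketch, loosely modelled on a raw-socket
 * sendmsg implementation; locking and error handling abbreviated):
 *
 *	lock_sock(sk);
 *	err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov,
 *			     len, 0, &ipc, &rt, msg->msg_flags);
 *	if (err)
 *		ip_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip_push_pending_frames(sk);
 *	release_sock(sk);
 */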
int ip_append_data(struct sock *sk,
                   int getfrag(void *from, char *to, int offset, int len,
                               int odd, struct sk_buff *skb),
                   void *from, int length, int transhdrlen,
                   struct ipcm_cookie *ipc, struct rtable **rtp,
        struct inet_sock *inet = inet_sk(sk);

        if (flags&MSG_PROBE)

        if (skb_queue_empty(&sk->sk_write_queue)) {
                err = ip_setup_cork(sk, &inet->cork, ipc, rtp);

        return __ip_append_data(sk, &sk->sk_write_queue, &inet->cork, getfrag,
                                from, length, transhdrlen, flags);
ssize_t ip_append_page(struct sock *sk, struct page *page,
                       int offset, size_t size, int flags)
        struct inet_sock *inet = inet_sk(sk);
        struct sk_buff *skb;
        struct ip_options *opt = NULL;
        unsigned int maxfraglen, fragheaderlen, fraggap;

        if (flags&MSG_PROBE)

        if (skb_queue_empty(&sk->sk_write_queue))

        rt = (struct rtable *)inet->cork.dst;
        if (inet->cork.flags & IPCORK_OPT)
                opt = inet->cork.opt;

        if (!(rt->dst.dev->features&NETIF_F_SG))

        hh_len = LL_RESERVED_SPACE(rt->dst.dev);
        mtu = inet->cork.fragsize;

        fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

        if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
                ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu);

        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)

        inet->cork.length += size;
        if ((size + skb->len > mtu) &&
            (sk->sk_protocol == IPPROTO_UDP) &&
            (rt->dst.dev->features & NETIF_F_UFO)) {
                skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
                skb_shinfo(skb)->gso_type = SKB_GSO_UDP;

                if (skb_is_gso(skb))

                /* Check if the remaining data fits into the current packet. */
                len = mtu - skb->len;
                        len = maxfraglen - skb->len;

                        struct sk_buff *skb_prev;

                        fraggap = skb_prev->len - maxfraglen;

                        alloclen = fragheaderlen + hh_len + fraggap + 15;
                        skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
                        if (unlikely(!skb)) {

                        /*
                         *	Fill in the control structures
                         */
                        skb->ip_summed = CHECKSUM_NONE;

                        skb_reserve(skb, hh_len);

                        /*
                         *	Find where to start putting bytes.
                         */
                        skb_put(skb, fragheaderlen + fraggap);
                        skb_reset_network_header(skb);
                        skb->transport_header = (skb->network_header +
                                skb->csum = skb_copy_and_csum_bits(skb_prev,
                                                                   skb_transport_header(skb),
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                pskb_trim_unique(skb_prev, maxfraglen);

                        /*
                         *	Put the packet on the pending queue.
                         */
                        __skb_queue_tail(&sk->sk_write_queue, skb);

                i = skb_shinfo(skb)->nr_frags;
                if (skb_can_coalesce(skb, i, page, offset)) {
                        skb_shinfo(skb)->frags[i-1].size += len;
                } else if (i < MAX_SKB_FRAGS) {
                        skb_fill_page_desc(skb, i, page, offset, len);

                if (skb->ip_summed == CHECKSUM_NONE) {
                        csum = csum_page(page, offset, len);
                        skb->csum = csum_block_add(skb->csum, csum, skb->len);

                skb->data_len += len;
                skb->truesize += len;
                atomic_add(len, &sk->sk_wmem_alloc);

        inet->cork.length -= size;
        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
static void ip_cork_release(struct inet_cork *cork)
        cork->flags &= ~IPCORK_OPT;
        dst_release(cork->dst);
/*
 *	Combine all pending IP fragments on the socket into one IP datagram
 *	and push them out.
 */
struct sk_buff *__ip_make_skb(struct sock *sk,
                              struct sk_buff_head *queue,
                              struct inet_cork *cork)
        struct sk_buff *skb, *tmp_skb;
        struct sk_buff **tail_skb;
        struct inet_sock *inet = inet_sk(sk);
        struct net *net = sock_net(sk);
        struct ip_options *opt = NULL;
        struct rtable *rt = (struct rtable *)cork->dst;

        if ((skb = __skb_dequeue(queue)) == NULL)
                goto out;
        tail_skb = &(skb_shinfo(skb)->frag_list);

        /* move skb->data to ip header from ext header */
        if (skb->data < skb_network_header(skb))
                __skb_pull(skb, skb_network_offset(skb));
        while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
                __skb_pull(tmp_skb, skb_network_header_len(skb));
                *tail_skb = tmp_skb;
                tail_skb = &(tmp_skb->next);
                skb->len += tmp_skb->len;
                skb->data_len += tmp_skb->len;
                skb->truesize += tmp_skb->truesize;
                tmp_skb->destructor = NULL;

        /* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO),
         * we allow fragmenting the frame generated here. No matter how
         * transforms change the size of the packet, it will come out.
         */
        if (inet->pmtudisc < IP_PMTUDISC_DO)

        /* DF bit is set when we want to see DF on outgoing frames.
         * If local_df is set too, we still allow fragmenting this frame
         * locally.
         */
        if (inet->pmtudisc >= IP_PMTUDISC_DO ||
            (skb->len <= dst_mtu(&rt->dst) &&
             ip_dont_fragment(sk, &rt->dst)))

        if (cork->flags & IPCORK_OPT)

        if (rt->rt_type == RTN_MULTICAST)
                ttl = ip_select_ttl(inet, &rt->dst);

        iph = (struct iphdr *)skb->data;
                iph->ihl += opt->optlen>>2;
                ip_options_build(skb, opt, cork->addr, rt, 0);
        iph->tos = inet->tos;
        ip_select_ident(iph, &rt->dst, sk);
        iph->protocol = sk->sk_protocol;
        iph->saddr = rt->rt_src;
        iph->daddr = rt->rt_dst;

        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;
        /*
         * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
         * on the dst refcount.
         */
        skb_dst_set(skb, &rt->dst);

        if (iph->protocol == IPPROTO_ICMP)
                icmp_out_count(net, ((struct icmphdr *)
                                     skb_transport_header(skb))->type);

        ip_cork_release(cork);
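/*
 * Hand a finished datagram to ip_local_out(). Soft errors such as
 * congestion notification are remapped by net_xmit_errno(); anything
 * still an error after that is accounted as an output discard.
 */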
int ip_send_skb(struct sk_buff *skb)
        struct net *net = sock_net(skb->sk);

        err = ip_local_out(skb);
                err = net_xmit_errno(err);
                        IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
int ip_push_pending_frames(struct sock *sk)
        struct sk_buff *skb;

        skb = ip_finish_skb(sk);

        /* Netfilter gets the whole, not yet fragmented, skb. */
        return ip_send_skb(skb);
/*
 *	Throw away all pending data on the socket.
 */
static void __ip_flush_pending_frames(struct sock *sk,
                                      struct sk_buff_head *queue,
                                      struct inet_cork *cork)
        struct sk_buff *skb;

        while ((skb = __skb_dequeue_tail(queue)) != NULL)
                kfree_skb(skb);

        ip_cork_release(cork);

void ip_flush_pending_frames(struct sock *sk)
        __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork);
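/*
 * One-shot variant of the append interface: build a complete datagram
 * on a private queue with a local cork, so the caller gets a finished
 * skb back without touching the socket's pending-frames state.
 */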
struct sk_buff *ip_make_skb(struct sock *sk,
                            int getfrag(void *from, char *to, int offset,
                                        int len, int odd, struct sk_buff *skb),
                            void *from, int length, int transhdrlen,
                            struct ipcm_cookie *ipc, struct rtable **rtp,
        struct inet_cork cork = {};
        struct sk_buff_head queue;

        if (flags & MSG_PROBE)

        __skb_queue_head_init(&queue);

        err = ip_setup_cork(sk, &cork, ipc, rtp);
                return ERR_PTR(err);

        err = __ip_append_data(sk, &queue, &cork, getfrag,
                               from, length, transhdrlen, flags);
                __ip_flush_pending_frames(sk, &queue, &cork);
                return ERR_PTR(err);

        return __ip_make_skb(sk, &queue, &cork);
/*
 *	Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
                              int len, int odd, struct sk_buff *skb)
        csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
        skb->csum = csum_block_add(skb->csum, csum, odd);
/*
 *	Generic function to send a packet as a reply to another packet.
 *	So far it is used only to send TCP resets; ICMP should use this
 *	function too.
 *
 *	Should run single threaded per socket because it uses the sock
 *	structure to pass arguments.
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
        struct inet_sock *inet = inet_sk(sk);
                struct ip_options opt;
        struct ipcm_cookie ipc;
        struct rtable *rt = skb_rtable(skb);

        if (ip_options_echo(&replyopts.opt, skb))

        daddr = ipc.addr = rt->rt_src;

        if (replyopts.opt.optlen) {
                ipc.opt = &replyopts.opt;

                        daddr = replyopts.opt.faddr;

                struct flowi fl = { .oif = arg->bound_dev_if,
                                    .fl4_src = rt->rt_spec_dst,
                                    .fl4_tos = RT_TOS(ip_hdr(skb)->tos),
                                    .fl_ip_sport = tcp_hdr(skb)->dest,
                                    .fl_ip_dport = tcp_hdr(skb)->source,
                                    .proto = sk->sk_protocol,
                                    .flags = ip_reply_arg_flowi_flags(arg) };
                security_skb_classify_flow(skb, &fl);
                rt = ip_route_output_key(sock_net(sk), &fl);

        /* And let IP do all the hard work.
         *
         * This chunk is not reentrant, hence the spinlock. Note that it
         * relies on the fact that this function is called with BHs
         * disabled locally and that sk cannot already be spinlocked.
         */
        inet->tos = ip_hdr(skb)->tos;
        sk->sk_priority = skb->priority;
        sk->sk_protocol = ip_hdr(skb)->protocol;
        sk->sk_bound_dev_if = arg->bound_dev_if;
        ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
                       &ipc, &rt, MSG_DONTWAIT);
        if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
                if (arg->csumoffset >= 0)
                        *((__sum16 *)skb_transport_header(skb) +
                          arg->csumoffset) = csum_fold(csum_add(skb->csum,
                skb->ip_summed = CHECKSUM_NONE;
                ip_push_pending_frames(sk);
void __init ip_init(void)
#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
        igmp_mc_proc_init();