net/ipv6/ip6_output.c

   1 /*
   2  *      IPv6 output functions
   3  *      Linux INET6 implementation
   4  *
   5  *      Authors:
   6  *      Pedro Roque             <roque@di.fc.ul.pt>
   7  *
   8  *      $Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
   9  *
  10  *      Based on linux/net/ipv4/ip_output.c
  11  *
  12  *      This program is free software; you can redistribute it and/or
  13  *      modify it under the terms of the GNU General Public License
  14  *      as published by the Free Software Foundation; either version
  15  *      2 of the License, or (at your option) any later version.
  16  *
  17  *      Changes:
   18  *      A.N.Kuznetsov   :       arithmetic in fragmentation.
  19  *                              extension headers are implemented.
  20  *                              route changes now work.
  21  *                              ip6_forward does not confuse sniffers.
  22  *                              etc.
  23  *
  24  *      H. von Brand    :       Added missing #include <linux/string.h>
  25  *      Imran Patel     :       frag id should be in NBO
  26  *      Kazunori MIYAZAWA @USAGI
  27  *                      :       add ip6_append_data and related functions
  28  *                              for datagram xmit
  29  */
  30
  31 #include <linux/errno.h>
  32 #include <linux/kernel.h>
  33 #include <linux/string.h>
  34 #include <linux/socket.h>
  35 #include <linux/net.h>
  36 #include <linux/netdevice.h>
  37 #include <linux/if_arp.h>
  38 #include <linux/in6.h>
  39 #include <linux/tcp.h>
  40 #include <linux/route.h>
  41 #include <linux/module.h>
  42
  43 #include <linux/netfilter.h>
  44 #include <linux/netfilter_ipv6.h>
  45
  46 #include <net/sock.h>
  47 #include <net/snmp.h>
  48
  49 #include <net/ipv6.h>
  50 #include <net/ndisc.h>
  51 #include <net/protocol.h>
  52 #include <net/ip6_route.h>
  53 #include <net/addrconf.h>
  54 #include <net/rawv6.h>
  55 #include <net/icmp.h>
  56 #include <net/xfrm.h>
  57 #include <net/checksum.h>
  58
  59 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
  60
  61 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
  62 {
  63         static u32 ipv6_fragmentation_id = 1;
  64         static DEFINE_SPINLOCK(ip6_id_lock);
  65
  66         spin_lock_bh(&ip6_id_lock);
  67         fhdr->identification = htonl(ipv6_fragmentation_id);
  68         if (++ipv6_fragmentation_id == 0)
  69                 ipv6_fragmentation_id = 1;
  70         spin_unlock_bh(&ip6_id_lock);
  71 }
  72
  73 int __ip6_local_out(struct sk_buff *skb)
  74 {
  75         int len;
  76
  77         len = skb->len - sizeof(struct ipv6hdr);
  78         if (len > IPV6_MAXPLEN)
  79                 len = 0;
  80         ipv6_hdr(skb)->payload_len = htons(len);
  81
  82         return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev,
  83                        dst_output);
  84 }
  85
  86 int ip6_local_out(struct sk_buff *skb)
  87 {
  88         int err;
  89
  90         err = __ip6_local_out(skb);
  91         if (likely(err == 1))
  92                 err = dst_output(skb);
  93
  94         return err;
  95 }
  96 EXPORT_SYMBOL_GPL(ip6_local_out);
  97
  98 static int ip6_output_finish(struct sk_buff *skb)
  99 {
 100         struct dst_entry *dst = skb->dst;
 101
 102         if (dst->hh)
 103                 return neigh_hh_output(dst->hh, skb);
 104         else if (dst->neighbour)
 105                 return dst->neighbour->output(skb);
 106
 107         IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 108         kfree_skb(skb);
 109         return -EINVAL;
 110
 111 }
 112
 113 /* dev_loopback_xmit for use with netfilter. */
 114 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
 115 {
 116         skb_reset_mac_header(newskb);
 117         __skb_pull(newskb, skb_network_offset(newskb));
 118         newskb->pkt_type = PACKET_LOOPBACK;
 119         newskb->ip_summed = CHECKSUM_UNNECESSARY;
 120         BUG_TRAP(newskb->dst);
 121
 122         netif_rx(newskb);
 123         return 0;
 124 }
 125
 126
 127 static int ip6_output2(struct sk_buff *skb)
 128 {
 129         struct dst_entry *dst = skb->dst;
 130         struct net_device *dev = dst->dev;
 131
 132         skb->protocol = htons(ETH_P_IPV6);
 133         skb->dev = dev;
 134
 135         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
 136                 struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
 137                 struct inet6_dev *idev = ip6_dst_idev(skb->dst);
 138
 139                 if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
 140                     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
 141                                         &ipv6_hdr(skb)->saddr)) {
 142                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 143
 144                         /* Do not check for IFF_ALLMULTI; multicast routing
 145                            is not supported in any case.
 146                          */
 147                         if (newskb)
 148                                 NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
 149                                         NULL, newskb->dev,
 150                                         ip6_dev_loopback_xmit);
 151
 152                         if (ipv6_hdr(skb)->hop_limit == 0) {
 153                                 IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
 154                                 kfree_skb(skb);
 155                                 return 0;
 156                         }
 157                 }
 158
 159                 IP6_INC_STATS(idev, IPSTATS_MIB_OUTMCASTPKTS);
 160         }
 161
 162         return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
 163                        ip6_output_finish);
 164 }
 165
 166 static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
 167 {
 168         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
 169
 170         return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
 171                skb->dst->dev->mtu : dst_mtu(skb->dst);
 172 }
 173
 174 int ip6_output(struct sk_buff *skb)
 175 {
 176         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
 177                                 dst_allfrag(skb->dst))
 178                 return ip6_fragment(skb, ip6_output2);
 179         else
 180                 return ip6_output2(skb);
 181 }
 182
 183 /*
 184  *      xmit an sk_buff (used by TCP)
 185  */
 186
 187 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
 188              struct ipv6_txoptions *opt, int ipfragok)
 189 {
 190         struct ipv6_pinfo *np = inet6_sk(sk);
 191         struct in6_addr *first_hop = &fl->fl6_dst;
 192         struct dst_entry *dst = skb->dst;
 193         struct ipv6hdr *hdr;
 194         u8  proto = fl->proto;
 195         int seg_len = skb->len;
 196         int hlimit, tclass;
 197         u32 mtu;
 198
 199         if (opt) {
 200                 unsigned int head_room;
 201
 202                 /* First: exthdrs may take lots of space (~8K for now)
 203                    MAX_HEADER is not enough.
 204                  */
 205                 head_room = opt->opt_nflen + opt->opt_flen;
 206                 seg_len += head_room;
 207                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
 208
 209                 if (skb_headroom(skb) < head_room) {
 210                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
 211                         if (skb2 == NULL) {
 212                                 IP6_INC_STATS(ip6_dst_idev(skb->dst),
 213                                               IPSTATS_MIB_OUTDISCARDS);
 214                                 kfree_skb(skb);
 215                                 return -ENOBUFS;
 216                         }
 217                         kfree_skb(skb);
 218                         skb = skb2;
 219                         if (sk)
 220                                 skb_set_owner_w(skb, sk);
 221                 }
 222                 if (opt->opt_flen)
 223                         ipv6_push_frag_opts(skb, opt, &proto);
 224                 if (opt->opt_nflen)
 225                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
 226         }
 227
 228         skb_push(skb, sizeof(struct ipv6hdr));
 229         skb_reset_network_header(skb);
 230         hdr = ipv6_hdr(skb);
 231
 232         /*
 233          *      Fill in the IPv6 header
 234          */
 235
 236         hlimit = -1;
 237         if (np)
 238                 hlimit = np->hop_limit;
 239         if (hlimit < 0)
 240                 hlimit = dst_metric(dst, RTAX_HOPLIMIT);
 241         if (hlimit < 0)
 242                 hlimit = ipv6_get_hoplimit(dst->dev);
 243
 244         tclass = -1;
 245         if (np)
 246                 tclass = np->tclass;
 247         if (tclass < 0)
 248                 tclass = 0;
 249
 250         *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
 251
 252         hdr->payload_len = htons(seg_len);
 253         hdr->nexthdr = proto;
 254         hdr->hop_limit = hlimit;
 255
 256         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
 257         ipv6_addr_copy(&hdr->daddr, first_hop);
 258
 259         skb->priority = sk->sk_priority;
 260
 261         mtu = dst_mtu(dst);
 262         if ((skb->len <= mtu) || ipfragok || skb_is_gso(skb)) {
 263                 IP6_INC_STATS(ip6_dst_idev(skb->dst),
 264                               IPSTATS_MIB_OUTREQUESTS);
 265                 return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
 266                                 dst_output);
 267         }
 268
 269         if (net_ratelimit())
 270                 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
 271         skb->dev = dst->dev;
 272         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
 273         IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
 274         kfree_skb(skb);
 275         return -EMSGSIZE;
 276 }
 277
 278 EXPORT_SYMBOL(ip6_xmit);
 279
 280 /*
 281  *      To avoid extra problems ND packets are send through this
 282  *      routine. It's code duplication but I really want to avoid
 283  *      extra checks since ipv6_build_header is used by TCP (which
 284  *      is for us performance critical)
 285  */
 286
 287 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
 288                struct in6_addr *saddr, struct in6_addr *daddr,
 289                int proto, int len)
 290 {
 291         struct ipv6_pinfo *np = inet6_sk(sk);
 292         struct ipv6hdr *hdr;
 293         int totlen;
 294
 295         skb->protocol = htons(ETH_P_IPV6);
 296         skb->dev = dev;
 297
 298         totlen = len + sizeof(struct ipv6hdr);
 299
 300         skb_reset_network_header(skb);
 301         skb_put(skb, sizeof(struct ipv6hdr));
 302         hdr = ipv6_hdr(skb);
 303
 304         *(__be32*)hdr = htonl(0x60000000);
 305
 306         hdr->payload_len = htons(len);
 307         hdr->nexthdr = proto;
 308         hdr->hop_limit = np->hop_limit;
 309
 310         ipv6_addr_copy(&hdr->saddr, saddr);
 311         ipv6_addr_copy(&hdr->daddr, daddr);
 312
 313         return 0;
 314 }
 315
 316 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 317 {
 318         struct ip6_ra_chain *ra;
 319         struct sock *last = NULL;
 320
 321         read_lock(&ip6_ra_lock);
 322         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 323                 struct sock *sk = ra->sk;
 324                 if (sk && ra->sel == sel &&
 325                     (!sk->sk_bound_dev_if ||
 326                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 327                         if (last) {
 328                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 329                                 if (skb2)
 330                                         rawv6_rcv(last, skb2);
 331                         }
 332                         last = sk;
 333                 }
 334         }
 335
 336         if (last) {
 337                 rawv6_rcv(last, skb);
 338                 read_unlock(&ip6_ra_lock);
 339                 return 1;
 340         }
 341         read_unlock(&ip6_ra_lock);
 342         return 0;
 343 }
 344
 345 static int ip6_forward_proxy_check(struct sk_buff *skb)
 346 {
 347         struct ipv6hdr *hdr = ipv6_hdr(skb);
 348         u8 nexthdr = hdr->nexthdr;
 349         int offset;
 350
 351         if (ipv6_ext_hdr(nexthdr)) {
 352                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
 353                 if (offset < 0)
 354                         return 0;
 355         } else
 356                 offset = sizeof(struct ipv6hdr);
 357
 358         if (nexthdr == IPPROTO_ICMPV6) {
 359                 struct icmp6hdr *icmp6;
 360
 361                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 362                                          offset + 1 - skb->data)))
 363                         return 0;
 364
 365                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 366
 367                 switch (icmp6->icmp6_type) {
 368                 case NDISC_ROUTER_SOLICITATION:
 369                 case NDISC_ROUTER_ADVERTISEMENT:
 370                 case NDISC_NEIGHBOUR_SOLICITATION:
 371                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 372                 case NDISC_REDIRECT:
 373                         /* For reaction involving unicast neighbor discovery
 374                          * message destined to the proxied address, pass it to
 375                          * input function.
 376                          */
 377                         return 1;
 378                 default:
 379                         break;
 380                 }
 381         }
 382
 383         /*
 384          * The proxying router can't forward traffic sent to a link-local
 385          * address, so signal the sender and discard the packet. This
 386          * behavior is clarified by the MIPv6 specification.
 387          */
 388         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 389                 dst_link_failure(skb);
 390                 return -1;
 391         }
 392
 393         return 0;
 394 }
 395
 396 static inline int ip6_forward_finish(struct sk_buff *skb)
 397 {
 398         return dst_output(skb);
 399 }
 400
 401 int ip6_forward(struct sk_buff *skb)
 402 {
 403         struct dst_entry *dst = skb->dst;
 404         struct ipv6hdr *hdr = ipv6_hdr(skb);
 405         struct inet6_skb_parm *opt = IP6CB(skb);
 406
 407         if (ipv6_devconf.forwarding == 0)
 408                 goto error;
 409
 410         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 411                 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 412                 goto drop;
 413         }
 414
 415         skb_forward_csum(skb);
 416
 417         /*
 418          *      We DO NOT make any processing on
 419          *      RA packets, pushing them to user level AS IS
 420          *      without ane WARRANTY that application will be able
 421          *      to interpret them. The reason is that we
 422          *      cannot make anything clever here.
 423          *
 424          *      We are not end-node, so that if packet contains
 425          *      AH/ESP, we cannot make anything.
 426          *      Defragmentation also would be mistake, RA packets
 427          *      cannot be fragmented, because there is no warranty
 428          *      that different fragments will go along one path. --ANK
 429          */
 430         if (opt->ra) {
 431                 u8 *ptr = skb_network_header(skb) + opt->ra;
 432                 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
 433                         return 0;
 434         }
 435
 436         /*
 437          *      check and decrement ttl
 438          */
 439         if (hdr->hop_limit <= 1) {
 440                 /* Force OUTPUT device used as source address */
 441                 skb->dev = dst->dev;
 442                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
 443                             0, skb->dev);
 444                 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
 445
 446                 kfree_skb(skb);
 447                 return -ETIMEDOUT;
 448         }
 449
 450         /* XXX: idev->cnf.proxy_ndp? */
 451         if (ipv6_devconf.proxy_ndp &&
 452             pneigh_lookup(&nd_tbl, &init_net, &hdr->daddr, skb->dev, 0)) {
 453                 int proxied = ip6_forward_proxy_check(skb);
 454                 if (proxied > 0)
 455                         return ip6_input(skb);
 456                 else if (proxied < 0) {
 457                         IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 458                         goto drop;
 459                 }
 460         }
 461
 462         if (!xfrm6_route_forward(skb)) {
 463                 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 464                 goto drop;
 465         }
 466         dst = skb->dst;
 467
 468         /* IPv6 specs say nothing about it, but it is clear that we cannot
 469            send redirects to source routed frames.
 470            We don't send redirects to frames decapsulated from IPsec.
 471          */
 472         if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
 473             !skb->sp) {
 474                 struct in6_addr *target = NULL;
 475                 struct rt6_info *rt;
 476                 struct neighbour *n = dst->neighbour;
 477
 478                 /*
 479                  *      incoming and outgoing devices are the same
 480                  *      send a redirect.
 481                  */
 482
 483                 rt = (struct rt6_info *) dst;
 484                 if ((rt->rt6i_flags & RTF_GATEWAY))
 485                         target = (struct in6_addr*)&n->primary_key;
 486                 else
 487                         target = &hdr->daddr;
 488
 489                 /* Limit redirects both by destination (here)
 490                    and by source (inside ndisc_send_redirect)
 491                  */
 492                 if (xrlim_allow(dst, 1*HZ))
 493                         ndisc_send_redirect(skb, n, target);
 494         } else {
 495                 int addrtype = ipv6_addr_type(&hdr->saddr);
 496
 497                 /* This check is security critical. */
 498                 if (addrtype & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK))
 499                         goto error;
 500                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
 501                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 502                                 ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
 503                         goto error;
 504                 }
 505         }
 506
 507         if (skb->len > dst_mtu(dst)) {
 508                 /* Again, force OUTPUT device used as source address */
 509                 skb->dev = dst->dev;
 510                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
 511                 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
 512                 IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
 513                 kfree_skb(skb);
 514                 return -EMSGSIZE;
 515         }
 516
 517         if (skb_cow(skb, dst->dev->hard_header_len)) {
 518                 IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
 519                 goto drop;
 520         }
 521
 522         hdr = ipv6_hdr(skb);
 523
 524         /* Mangling hops number delayed to point after skb COW */
 525
 526         hdr->hop_limit--;
 527
 528         IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 529         return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
 530                        ip6_forward_finish);
 531
 532 error:
 533         IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
 534 drop:
 535         kfree_skb(skb);
 536         return -EINVAL;
 537 }
 538
 539 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 540 {
 541         to->pkt_type = from->pkt_type;
 542         to->priority = from->priority;
 543         to->protocol = from->protocol;
 544         dst_release(to->dst);
 545         to->dst = dst_clone(from->dst);
 546         to->dev = from->dev;
 547         to->mark = from->mark;
 548
 549 #ifdef CONFIG_NET_SCHED
 550         to->tc_index = from->tc_index;
 551 #endif
 552         nf_copy(to, from);
 553 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
 554     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
 555         to->nf_trace = from->nf_trace;
 556 #endif
 557         skb_copy_secmark(to, from);
 558 }
 559
 560 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
 561 {
 562         u16 offset = sizeof(struct ipv6hdr);
 563         struct ipv6_opt_hdr *exthdr =
 564                                 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
 565         unsigned int packet_len = skb->tail - skb->network_header;
 566         int found_rhdr = 0;
 567         *nexthdr = &ipv6_hdr(skb)->nexthdr;
 568
 569         while (offset + 1 <= packet_len) {
 570
 571                 switch (**nexthdr) {
 572
 573                 case NEXTHDR_HOP:
 574                         break;
 575                 case NEXTHDR_ROUTING:
 576                         found_rhdr = 1;
 577                         break;
 578                 case NEXTHDR_DEST:
 579 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 580                         if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
 581                                 break;
 582 #endif
 583                         if (found_rhdr)
 584                                 return offset;
 585                         break;
 586                 default :
 587                         return offset;
 588                 }
 589
 590                 offset += ipv6_optlen(exthdr);
 591                 *nexthdr = &exthdr->nexthdr;
 592                 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
 593                                                  offset);
 594         }
 595
 596         return offset;
 597 }
 598 EXPORT_SYMBOL_GPL(ip6_find_1stfragopt);
 599
 600 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 601 {
 602         struct net_device *dev;
 603         struct sk_buff *frag;
 604         struct rt6_info *rt = (struct rt6_info*)skb->dst;
 605         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
 606         struct ipv6hdr *tmp_hdr;
 607         struct frag_hdr *fh;
 608         unsigned int mtu, hlen, left, len;
 609         __be32 frag_id = 0;
 610         int ptr, offset = 0, err=0;
 611         u8 *prevhdr, nexthdr = 0;
 612
 613         dev = rt->u.dst.dev;
 614         hlen = ip6_find_1stfragopt(skb, &prevhdr);
 615         nexthdr = *prevhdr;
 616
 617         mtu = ip6_skb_dst_mtu(skb);
 618
 619         /* We must not fragment if the socket is set to force MTU discovery
 620          * or if the skb it not generated by a local socket.  (This last
 621          * check should be redundant, but it's free.)
 622          */
 623         if (!np || np->pmtudisc >= IPV6_PMTUDISC_DO) {
 624                 skb->dev = skb->dst->dev;
 625                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
 626                 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
 627                 kfree_skb(skb);
 628                 return -EMSGSIZE;
 629         }
 630
 631         if (np && np->frag_size < mtu) {
 632                 if (np->frag_size)
 633                         mtu = np->frag_size;
 634         }
 635         mtu -= hlen + sizeof(struct frag_hdr);
 636
 637         if (skb_shinfo(skb)->frag_list) {
 638                 int first_len = skb_pagelen(skb);
 639                 int truesizes = 0;
 640
 641                 if (first_len - hlen > mtu ||
 642                     ((first_len - hlen) & 7) ||
 643                     skb_cloned(skb))
 644                         goto slow_path;
 645
 646                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
 647                         /* Correct geometry. */
 648                         if (frag->len > mtu ||
 649                             ((frag->len & 7) && frag->next) ||
 650                             skb_headroom(frag) < hlen)
 651                             goto slow_path;
 652
 653                         /* Partially cloned skb? */
 654                         if (skb_shared(frag))
 655                                 goto slow_path;
 656
 657                         BUG_ON(frag->sk);
 658                         if (skb->sk) {
 659                                 sock_hold(skb->sk);
 660                                 frag->sk = skb->sk;
 661                                 frag->destructor = sock_wfree;
 662                                 truesizes += frag->truesize;
 663                         }
 664                 }
 665
 666                 err = 0;
 667                 offset = 0;
 668                 frag = skb_shinfo(skb)->frag_list;
 669                 skb_shinfo(skb)->frag_list = NULL;
 670                 /* BUILD HEADER */
 671
 672                 *prevhdr = NEXTHDR_FRAGMENT;
 673                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 674                 if (!tmp_hdr) {
 675                         IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
 676                         return -ENOMEM;
 677                 }
 678
 679                 __skb_pull(skb, hlen);
 680                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
 681                 __skb_push(skb, hlen);
 682                 skb_reset_network_header(skb);
 683                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
 684
 685                 ipv6_select_ident(skb, fh);
 686                 fh->nexthdr = nexthdr;
 687                 fh->reserved = 0;
 688                 fh->frag_off = htons(IP6_MF);
 689                 frag_id = fh->identification;
 690
 691                 first_len = skb_pagelen(skb);
 692                 skb->data_len = first_len - skb_headlen(skb);
 693                 skb->truesize -= truesizes;
 694                 skb->len = first_len;
 695                 ipv6_hdr(skb)->payload_len = htons(first_len -
 696                                                    sizeof(struct ipv6hdr));
 697
 698                 dst_hold(&rt->u.dst);
 699
 700                 for (;;) {
 701                         /* Prepare header of the next frame,
 702                          * before previous one went down. */
 703                         if (frag) {
 704                                 frag->ip_summed = CHECKSUM_NONE;
 705                                 skb_reset_transport_header(frag);
 706                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
 707                                 __skb_push(frag, hlen);
 708                                 skb_reset_network_header(frag);
 709                                 memcpy(skb_network_header(frag), tmp_hdr,
 710                                        hlen);
 711                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
 712                                 fh->nexthdr = nexthdr;
 713                                 fh->reserved = 0;
 714                                 fh->frag_off = htons(offset);
 715                                 if (frag->next != NULL)
 716                                         fh->frag_off |= htons(IP6_MF);
 717                                 fh->identification = frag_id;
 718                                 ipv6_hdr(frag)->payload_len =
 719                                                 htons(frag->len -
 720                                                       sizeof(struct ipv6hdr));
 721                                 ip6_copy_metadata(frag, skb);
 722                         }
 723
 724                         err = output(skb);
 725                         if(!err)
 726                                 IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGCREATES);
 727
 728                         if (err || !frag)
 729                                 break;
 730
 731                         skb = frag;
 732                         frag = skb->next;
 733                         skb->next = NULL;
 734                 }
 735
 736                 kfree(tmp_hdr);
 737
 738                 if (err == 0) {
 739                         IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGOKS);
 740                         dst_release(&rt->u.dst);
 741                         return 0;
 742                 }
 743
 744                 while (frag) {
 745                         skb = frag->next;
 746                         kfree_skb(frag);
 747                         frag = skb;
 748                 }
 749
 750                 IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGFAILS);
 751                 dst_release(&rt->u.dst);
 752                 return err;
 753         }
 754
 755 slow_path:
 756         left = skb->len - hlen;         /* Space per frame */
 757         ptr = hlen;                     /* Where to start from */
 758
 759         /*
 760          *      Fragment the datagram.
 761          */
 762
 763         *prevhdr = NEXTHDR_FRAGMENT;
 764
 765         /*
 766          *      Keep copying data until we run out.
 767          */
 768         while(left > 0) {
 769                 len = left;
 770                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 771                 if (len > mtu)
 772                         len = mtu;
 773                 /* IF: we are not sending upto and including the packet end
 774                    then align the next start on an eight byte boundary */
 775                 if (len < left) {
 776                         len &= ~7;
 777                 }
 778                 /*
 779                  *      Allocate buffer.
 780                  */
 781
 782                 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
 783                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
 784                         IP6_INC_STATS(ip6_dst_idev(skb->dst),
 785                                       IPSTATS_MIB_FRAGFAILS);
 786                         err = -ENOMEM;
 787                         goto fail;
 788                 }
 789
 790                 /*
 791                  *      Set up data on packet
 792                  */
 793
 794                 ip6_copy_metadata(frag, skb);
 795                 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
 796                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
 797                 skb_reset_network_header(frag);
 798                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
 799                 frag->transport_header = (frag->network_header + hlen +
 800                                           sizeof(struct frag_hdr));
 801
 802                 /*
 803                  *      Charge the memory for the fragment to any owner
 804                  *      it might possess
 805                  */
 806                 if (skb->sk)
 807                         skb_set_owner_w(frag, skb->sk);
 808
 809                 /*
 810                  *      Copy the packet header into the new buffer.
 811                  */
 812                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
 813
 814                 /*
 815                  *      Build fragment header.
 816                  */
 817                 fh->nexthdr = nexthdr;
 818                 fh->reserved = 0;
 819                 if (!frag_id) {
 820                         ipv6_select_ident(skb, fh);
 821                         frag_id = fh->identification;
 822                 } else
 823                         fh->identification = frag_id;
 824
 825                 /*
 826                  *      Copy a block of the IP datagram.
 827                  */
 828                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
 829                         BUG();
 830                 left -= len;
 831
 832                 fh->frag_off = htons(offset);
 833                 if (left > 0)
 834                         fh->frag_off |= htons(IP6_MF);
 835                 ipv6_hdr(frag)->payload_len = htons(frag->len -
 836                                                     sizeof(struct ipv6hdr));
 837
 838                 ptr += len;
 839                 offset += len;
 840
 841                 /*
 842                  *      Put this fragment into the sending queue.
 843                  */
 844                 err = output(frag);
 845                 if (err)
 846                         goto fail;
 847
 848                 IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGCREATES);
 849         }
 850         IP6_INC_STATS(ip6_dst_idev(skb->dst),
 851                       IPSTATS_MIB_FRAGOKS);
 852         kfree_skb(skb);
 853         return err;
 854
 855 fail:
 856         IP6_INC_STATS(ip6_dst_idev(skb->dst),
 857                       IPSTATS_MIB_FRAGFAILS);
 858         kfree_skb(skb);
 859         return err;
 860 }
 861
 862 static inline int ip6_rt_check(struct rt6key *rt_key,
 863                                struct in6_addr *fl_addr,
 864                                struct in6_addr *addr_cache)
 865 {
 866         return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
 867                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
 868 }
 869
 870 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
 871                                           struct dst_entry *dst,
 872                                           struct flowi *fl)
 873 {
 874         struct ipv6_pinfo *np = inet6_sk(sk);
 875         struct rt6_info *rt = (struct rt6_info *)dst;
 876
 877         if (!dst)
 878                 goto out;
 879
 880         /* Yes, checking route validity in not connected
 881          * case is not very simple. Take into account,
 882          * that we do not support routing by source, TOS,
 883          * and MSG_DONTROUTE            --ANK (980726)
 884          *
 885          * 1. ip6_rt_check(): If route was host route,
 886          *    check that cached destination is current.
 887          *    If it is network route, we still may
 888          *    check its validity using saved pointer
 889          *    to the last used address: daddr_cache.
 890          *    We do not want to save whole address now,
 891          *    (because main consumer of this service
 892          *    is tcp, which has not this problem),
 893          *    so that the last trick works only on connected
 894          *    sockets.
 895          * 2. oif also should be the same.
 896          */
 897         if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
 898 #ifdef CONFIG_IPV6_SUBTREES
 899             ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
 900 #endif
 901             (fl->oif && fl->oif != dst->dev->ifindex)) {
 902                 dst_release(dst);
 903                 dst = NULL;
 904         }
 905
 906 out:
 907         return dst;
 908 }
 909
 910 static int ip6_dst_lookup_tail(struct sock *sk,
 911                                struct dst_entry **dst, struct flowi *fl)
 912 {
 913         int err;
 914
 915         if (*dst == NULL)
 916                 *dst = ip6_route_output(sk, fl);
 917
 918         if ((err = (*dst)->error))
 919                 goto out_err_release;
 920
 921         if (ipv6_addr_any(&fl->fl6_src)) {
 922                 err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);
 923                 if (err)
 924                         goto out_err_release;
 925         }
 926
 927 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 928                 /*
 929                  * Here if the dst entry we've looked up
 930                  * has a neighbour entry that is in the INCOMPLETE
 931                  * state and the src address from the flow is
 932                  * marked as OPTIMISTIC, we release the found
 933                  * dst entry and replace it instead with the
 934                  * dst entry of the nexthop router
 935                  */
 936                 if (!((*dst)->neighbour->nud_state & NUD_VALID)) {
 937                         struct inet6_ifaddr *ifp;
 938                         struct flowi fl_gw;
 939                         int redirect;
 940
 941                         ifp = ipv6_get_ifaddr(&init_net, &fl->fl6_src,
 942                                               (*dst)->dev, 1);
 943
 944                         redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
 945                         if (ifp)
 946                                 in6_ifa_put(ifp);
 947
 948                         if (redirect) {
 949                                 /*
 950                                  * We need to get the dst entry for the
 951                                  * default router instead
 952                                  */
 953                                 dst_release(*dst);
 954                                 memcpy(&fl_gw, fl, sizeof(struct flowi));
 955                                 memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
 956                                 *dst = ip6_route_output(sk, &fl_gw);
 957                                 if ((err = (*dst)->error))
 958                                         goto out_err_release;
 959                         }
 960                 }
 961 #endif
 962
 963         return 0;
 964
 965 out_err_release:
 966         if (err == -ENETUNREACH)
 967                 IP6_INC_STATS_BH(NULL, IPSTATS_MIB_OUTNOROUTES);
 968         dst_release(*dst);
 969         *dst = NULL;
 970         return err;
 971 }
 972
 973 /**
 974  *      ip6_dst_lookup - perform route lookup on flow
 975  *      @sk: socket which provides route info
 976  *      @dst: pointer to dst_entry * for result
 977  *      @fl: flow to lookup
 978  *
 979  *      This function performs a route lookup on the given flow.
 980  *
 981  *      It returns zero on success, or a standard errno code on error.
 982  */
 983 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
 984 {
 985         *dst = NULL;
 986         return ip6_dst_lookup_tail(sk, dst, fl);
 987 }
 988 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
 989
 990 /**
 991  *      ip6_sk_dst_lookup - perform socket cached route lookup on flow
 992  *      @sk: socket which provides the dst cache and route info
 993  *      @dst: pointer to dst_entry * for result
 994  *      @fl: flow to lookup
 995  *
 996  *      This function performs a route lookup on the given flow with the
 997  *      possibility of using the cached route in the socket if it is valid.
 998  *      It will take the socket dst lock when operating on the dst cache.
 999  *      As a result, this function can only be used in process context.
1000  *
1001  *      It returns zero on success, or a standard errno code on error.
1002  */
1003 int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1004 {
1005         *dst = NULL;
1006         if (sk) {
1007                 *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1008                 *dst = ip6_sk_dst_check(sk, *dst, fl);
1009         }
1010
1011         return ip6_dst_lookup_tail(sk, dst, fl);
1012 }
1013 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
1014
1015 static inline int ip6_ufo_append_data(struct sock *sk,
1016                         int getfrag(void *from, char *to, int offset, int len,
1017                         int odd, struct sk_buff *skb),
1018                         void *from, int length, int hh_len, int fragheaderlen,
1019                         int transhdrlen, int mtu,unsigned int flags)
1020
1021 {
1022         struct sk_buff *skb;
1023         int err;
1024
1025         /* There is support for UDP large send offload by network
1026          * device, so create one single skb packet containing complete
1027          * udp datagram
1028          */
1029         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1030                 skb = sock_alloc_send_skb(sk,
1031                         hh_len + fragheaderlen + transhdrlen + 20,
1032                         (flags & MSG_DONTWAIT), &err);
1033                 if (skb == NULL)
1034                         return -ENOMEM;
1035
1036                 /* reserve space for Hardware header */
1037                 skb_reserve(skb, hh_len);
1038
1039                 /* create space for UDP/IP header */
1040                 skb_put(skb,fragheaderlen + transhdrlen);
1041
1042                 /* initialize network header pointer */
1043                 skb_reset_network_header(skb);
1044
1045                 /* initialize protocol header pointer */
1046                 skb->transport_header = skb->network_header + fragheaderlen;
1047
1048                 skb->ip_summed = CHECKSUM_PARTIAL;
1049                 skb->csum = 0;
1050                 sk->sk_sndmsg_off = 0;
1051         }
1052
1053         err = skb_append_datato_frags(sk,skb, getfrag, from,
1054                                       (length - transhdrlen));
1055         if (!err) {
1056                 struct frag_hdr fhdr;
1057
1058                 /* specify the length of each IP datagram fragment*/
1059                 skb_shinfo(skb)->gso_size = mtu - fragheaderlen -
1060                                             sizeof(struct frag_hdr);
1061                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1062                 ipv6_select_ident(skb, &fhdr);
1063                 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1064                 __skb_queue_tail(&sk->sk_write_queue, skb);
1065
1066                 return 0;
1067         }
1068         /* There is not enough support do UPD LSO,
1069          * so follow normal path
1070          */
1071         kfree_skb(skb);
1072
1073         return err;
1074 }
1075
/*
 *      ip6_append_data - queue data on the socket's pending IPv6 datagram
 *      @sk: socket; skbs are appended to sk->sk_write_queue
 *      @getfrag: callback copying @len bytes at @offset of @from into @to;
 *                a negative return is reported to the caller as -EFAULT
 *      @from: opaque cookie handed to @getfrag
 *      @length: number of bytes to append; the first @transhdrlen of them
 *               are header space that is reserved but not filled by @getfrag
 *      @transhdrlen: transport header length (forced to 0 on later calls)
 *      @hlimit: hop limit, saved in the cork on the first call
 *      @tclass: traffic class, saved in the cork on the first call
 *      @opt: tx options or NULL, copied into np->cork.opt on the first call
 *      @fl: flow, copied into inet->cork.fl on the first call
 *      @rt: route; a reference is taken on the first call
 *      @flags: MSG_* flags (MSG_PROBE, MSG_MORE and MSG_DONTWAIT are tested)
 *
 *      "First call" means sk_write_queue is empty on entry.  In that case
 *      the cork state (options, route, flow, hop limit, traffic class and
 *      fragment size) is set up.  On later calls the saved route, flow,
 *      options and fragment size are used instead of the arguments.
 *
 *      Data is then either handed to ip6_ufo_append_data() or copied into
 *      the tail skb / newly allocated skbs so that every skb except the
 *      last ends at maxfraglen.
 *
 *      Returns 0 on success or a negative errno (-ENOBUFS, -EINVAL,
 *      -EMSGSIZE, -EFAULT, -ENOMEM, or an error stored by
 *      sock_alloc_send_skb() / returned by ip6_ufo_append_data()).
 *      Failures that reach the error label subtract the still-unqueued
 *      length from inet->cork.length and count IPSTATS_MIB_OUTDISCARDS.
 */
1076 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1077         int offset, int len, int odd, struct sk_buff *skb),
1078         void *from, int length, int transhdrlen,
1079         int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
1080         struct rt6_info *rt, unsigned int flags)
1081 {
1082         struct inet_sock *inet = inet_sk(sk);
1083         struct ipv6_pinfo *np = inet6_sk(sk);
1084         struct sk_buff *skb;
1085         unsigned int maxfraglen, fragheaderlen;
1086         int exthdrlen;
1087         int hh_len;
1088         int mtu;
1089         int copy;
1090         int err;
1091         int offset = 0;
1092         int csummode = CHECKSUM_NONE;
1093
             /* MSG_PROBE: nothing is queued, success is reported at once. */
1094         if (flags&MSG_PROBE)
1095                 return 0;
1096         if (skb_queue_empty(&sk->sk_write_queue)) {
1097                 /*
1098                  * setup for corking
1099                  */
1100                 if (opt) {
1101                         if (np->cork.opt == NULL) {
1102                                 np->cork.opt = kmalloc(opt->tot_len,
1103                                                        sk->sk_allocation);
1104                                 if (unlikely(np->cork.opt == NULL))
1105                                         return -ENOBUFS;
1106                         } else if (np->cork.opt->tot_len < opt->tot_len) {
1107                                 printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
1108                                 return -EINVAL;
1109                         }
                             /*
                              * NOTE(review): this is a flat copy of tot_len
                              * bytes.  Any pointers inside *opt that refer to
                              * storage outside that region are copied as-is;
                              * confirm callers keep that storage alive until
                              * the queue is pushed or flushed.
                              */
1110                         memcpy(np->cork.opt, opt, opt->tot_len);
1111                         inet->cork.flags |= IPCORK_OPT;
1112                         /* need source address above miyazawa*/
1113                 }
1114                 dst_hold(&rt->u.dst);
1115                 np->cork.rt = rt;
1116                 inet->cork.fl = *fl;
1117                 np->cork.hop_limit = hlimit;
1118                 np->cork.tclass = tclass;
                     /*
                      * With IPV6_PMTUDISC_PROBE the device MTU is used,
                      * otherwise the MTU of the route's path dst.  A non-zero
                      * np->frag_size below that value overrides it.
                      */
1119                 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1120                       rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
1121                 if (np->frag_size < mtu) {
1122                         if (np->frag_size)
1123                                 mtu = np->frag_size;
1124                 }
1125                 inet->cork.fragsize = mtu;
1126                 if (dst_allfrag(rt->u.dst.path))
1127                         inet->cork.flags |= IPCORK_ALLFRAG;
1128                 inet->cork.length = 0;
1129                 sk->sk_sndmsg_page = NULL;
1130                 sk->sk_sndmsg_off = 0;
                     /*
                      * Extension header bytes that travel with the first
                      * skb are added to both length and transhdrlen.
                      */
1131                 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
1132                             rt->rt6i_nfheader_len;
1133                 length += exthdrlen;
1134                 transhdrlen += exthdrlen;
1135         } else {
                     /*
                      * Datagram already started: use the corked route, flow,
                      * options and fragment size; no header space is needed.
                      */
1136                 rt = np->cork.rt;
1137                 fl = &inet->cork.fl;
1138                 if (inet->cork.flags & IPCORK_OPT)
1139                         opt = np->cork.opt;
1140                 transhdrlen = 0;
1141                 exthdrlen = 0;
1142                 mtu = inet->cork.fragsize;
1143         }
1144
1145         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1146
             /*
              * fragheaderlen: IPv6 header plus the unfragmentable part.
              * maxfraglen: the space after fragheaderlen rounded down to a
              * multiple of 8, plus fragheaderlen, minus room for a fragment
              * header.
              */
1147         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1148                         (opt ? opt->opt_nflen : 0);
1149         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1150
             /* Reject a datagram whose total would exceed IPV6_MAXPLEN. */
1151         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1152                 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1153                         ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
1154                         return -EMSGSIZE;
1155                 }
1156         }
1157
1158         /*
1159          * Let's try using as much space as possible.
1160          * Use MTU if total length of the message fits into the MTU.
1161          * Otherwise, we need to reserve fragment header and
1162          * fragment alignment (= 8-15 octets, in total).
1163          *
1164          * Note that we may need to "move" the data from the tail of
1165          * of the buffer to the new fragment when we split
1166          * the message.
1167          *
1168          * FIXME: It may be fragmented into multiple chunks
1169          *        at once if non-fragmentable extension headers
1170          *        are too large.
1171          * --yoshfuji
1172          */
1173
1174         inet->cork.length += length;
             /*
              * UDP data larger than the MTU on a NETIF_F_UFO device is
              * delegated to ip6_ufo_append_data().
              */
1175         if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
1176             (rt->u.dst.dev->features & NETIF_F_UFO)) {
1177
1178                 err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
1179                                           fragheaderlen, transhdrlen, mtu,
1180                                           flags);
1181                 if (err)
1182                         goto error;
1183                 return 0;
1184         }
1185
             /* Empty queue: jump straight into the allocation path below. */
1186         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1187                 goto alloc_new_skb;
1188
1189         while (length > 0) {
1190                 /* Check if the remaining data fits into current packet. */
1191                 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1192                 if (copy < length)
1193                         copy = maxfraglen - skb->len;
1194
1195                 if (copy <= 0) {
1196                         char *data;
1197                         unsigned int datalen;
1198                         unsigned int fraglen;
1199                         unsigned int fraggap;
1200                         unsigned int alloclen;
1201                         struct sk_buff *skb_prev;
1202 alloc_new_skb:
1203                         skb_prev = skb;
1204
1205                         /* There's no room in the current skb */
                             /*
                              * fraggap: bytes by which the previous skb
                              * exceeds maxfraglen; they are moved into the
                              * new skb further down.
                              */
1206                         if (skb_prev)
1207                                 fraggap = skb_prev->len - maxfraglen;
1208                         else
1209                                 fraggap = 0;
1210
1211                         /*
1212                          * If remaining data exceeds the mtu,
1213                          * we know we need more fragment(s).
1214                          */
1215                         datalen = length + fraggap;
1216                         if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1217                                 datalen = maxfraglen - fragheaderlen;
1218
1219                         fraglen = datalen + fragheaderlen;
                             /*
                              * With MSG_MORE on a device without NETIF_F_SG a
                              * full mtu is allocated; otherwise only what this
                              * skb needs right now.
                              */
1220                         if ((flags & MSG_MORE) &&
1221                             !(rt->u.dst.dev->features&NETIF_F_SG))
1222                                 alloclen = mtu;
1223                         else
1224                                 alloclen = datalen + fragheaderlen;
1225
1226                         /*
1227                          * The last fragment gets additional space at tail.
1228                          * Note: we overallocate on fragments with MSG_MORE
1229                          * because we have no idea if we're the last one.
1230                          */
1231                         if (datalen == length + fraggap)
1232                                 alloclen += rt->u.dst.trailer_len;
1233
1234                         /*
1235                          * We just reserve space for fragment header.
1236                          * Note: this may be overallocation if the message
1237                          * (without MSG_MORE) fits into the MTU.
1238                          */
1239                         alloclen += sizeof(struct frag_hdr);
1240
                             /*
                              * The skb carrying the transport header is
                              * allocated with sock_alloc_send_skb().  Later
                              * skbs use sock_wmalloc(), and only while
                              * sk_wmem_alloc <= 2 * sk_sndbuf; otherwise the
                              * call fails with -ENOBUFS.
                              */
1241                         if (transhdrlen) {
1242                                 skb = sock_alloc_send_skb(sk,
1243                                                 alloclen + hh_len,
1244                                                 (flags & MSG_DONTWAIT), &err);
1245                         } else {
1246                                 skb = NULL;
1247                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1248                                     2 * sk->sk_sndbuf)
1249                                         skb = sock_wmalloc(sk,
1250                                                            alloclen + hh_len, 1,
1251                                                            sk->sk_allocation);
1252                                 if (unlikely(skb == NULL))
1253                                         err = -ENOBUFS;
1254                         }
1255                         if (skb == NULL)
1256                                 goto error;
1257                         /*
1258                          *      Fill in the control structures
1259                          */
1260                         skb->ip_summed = csummode;
1261                         skb->csum = 0;
1262                         /* reserve for fragmentation */
1263                         skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1264
1265                         /*
1266                          *      Find where to start putting bytes
1267                          */
1268                         data = skb_put(skb, fraglen);
1269                         skb_set_network_header(skb, exthdrlen);
1270                         data += fragheaderlen;
1271                         skb->transport_header = (skb->network_header +
1272                                                  fragheaderlen);
                             /*
                              * Move the overhanging tail of the previous skb
                              * into this one, transfer its checksum
                              * contribution, and trim the previous skb back
                              * to maxfraglen.
                              */
1273                         if (fraggap) {
1274                                 skb->csum = skb_copy_and_csum_bits(
1275                                         skb_prev, maxfraglen,
1276                                         data + transhdrlen, fraggap, 0);
1277                                 skb_prev->csum = csum_sub(skb_prev->csum,
1278                                                           skb->csum);
1279                                 data += fraggap;
1280                                 pskb_trim_unique(skb_prev, maxfraglen);
1281                         }
                             /* Payload bytes still to be fetched via getfrag. */
1282                         copy = datalen - transhdrlen - fraggap;
1283                         if (copy < 0) {
1284                                 err = -EINVAL;
1285                                 kfree_skb(skb);
1286                                 goto error;
1287                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1288                                 err = -EFAULT;
1289                                 kfree_skb(skb);
1290                                 goto error;
1291                         }
1292
1293                         offset += copy;
1294                         length -= datalen - fraggap;
                             /* Header space is only reserved in the first skb. */
1295                         transhdrlen = 0;
1296                         exthdrlen = 0;
1297                         csummode = CHECKSUM_NONE;
1298
1299                         /*
1300                          * Put the packet on the pending queue
1301                          */
1302                         __skb_queue_tail(&sk->sk_write_queue, skb);
1303                         continue;
1304                 }
1305
1306                 if (copy > length)
1307                         copy = length;
1308
                     /*
                      * No scatter-gather: copy into the linear area of the
                      * tail skb.  With scatter-gather: append to a page
                      * fragment, reusing sk->sk_sndmsg_page while it has room.
                      */
1309                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1310                         unsigned int off;
1311
1312                         off = skb->len;
1313                         if (getfrag(from, skb_put(skb, copy),
1314                                                 offset, copy, off, skb) < 0) {
1315                                 __skb_trim(skb, off);
1316                                 err = -EFAULT;
1317                                 goto error;
1318                         }
1319                 } else {
1320                         int i = skb_shinfo(skb)->nr_frags;
                             /*
                              * NOTE(review): when nr_frags is 0 this points
                              * one element before frags[] and is read below if
                              * sk_sndmsg_page is non-NULL - confirm that
                              * combination cannot occur.
                              */
1321                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1322                         struct page *page = sk->sk_sndmsg_page;
1323                         int off = sk->sk_sndmsg_off;
1324                         unsigned int left;
1325
1326                         if (page && (left = PAGE_SIZE - off) > 0) {
1327                                 if (copy >= left)
1328                                         copy = left;
1329                                 if (page != frag->page) {
1330                                         if (i == MAX_SKB_FRAGS) {
1331                                                 err = -EMSGSIZE;
1332                                                 goto error;
1333                                         }
1334                                         get_page(page);
1335                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1336                                         frag = &skb_shinfo(skb)->frags[i];
1337                                 }
1338                         } else if(i < MAX_SKB_FRAGS) {
1339                                 if (copy > PAGE_SIZE)
1340                                         copy = PAGE_SIZE;
1341                                 page = alloc_pages(sk->sk_allocation, 0);
1342                                 if (page == NULL) {
1343                                         err = -ENOMEM;
1344                                         goto error;
1345                                 }
1346                                 sk->sk_sndmsg_page = page;
1347                                 sk->sk_sndmsg_off = 0;
1348
1349                                 skb_fill_page_desc(skb, i, page, 0, 0);
1350                                 frag = &skb_shinfo(skb)->frags[i];
1351                         } else {
1352                                 err = -EMSGSIZE;
1353                                 goto error;
1354                         }
1355                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1356                                 err = -EFAULT;
1357                                 goto error;
1358                         }
                             /* Account the appended bytes on the fragment, skb and socket. */
1359                         sk->sk_sndmsg_off += copy;
1360                         frag->size += copy;
1361                         skb->len += copy;
1362                         skb->data_len += copy;
1363                         skb->truesize += copy;
1364                         atomic_add(copy, &sk->sk_wmem_alloc);
1365                 }
1366                 offset += copy;
1367                 length -= copy;
1368         }
1369         return 0;
1370 error:
             /* cork.length was advanced up front; take back what was not queued. */
1371         inet->cork.length -= length;
1372         IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1373         return err;
1374 }
1375
1376 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1377 {
1378         inet->cork.flags &= ~IPCORK_OPT;
1379         kfree(np->cork.opt);
1380         np->cork.opt = NULL;
1381         if (np->cork.rt) {
1382                 dst_release(&np->cork.rt->u.dst);
1383                 np->cork.rt = NULL;
1384                 inet->cork.flags &= ~IPCORK_ALLFRAG;
1385         }
1386         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1387 }
1388
1389 int ip6_push_pending_frames(struct sock *sk)
1390 {
1391         struct sk_buff *skb, *tmp_skb;
1392         struct sk_buff **tail_skb;
1393         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1394         struct inet_sock *inet = inet_sk(sk);
1395         struct ipv6_pinfo *np = inet6_sk(sk);
1396         struct ipv6hdr *hdr;
1397         struct ipv6_txoptions *opt = np->cork.opt;
1398         struct rt6_info *rt = np->cork.rt;
1399         struct flowi *fl = &inet->cork.fl;
1400         unsigned char proto = fl->proto;
1401         int err = 0;
1402
1403         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1404                 goto out;
1405         tail_skb = &(skb_shinfo(skb)->frag_list);
1406
1407         /* move skb->data to ip header from ext header */
1408         if (skb->data < skb_network_header(skb))
1409                 __skb_pull(skb, skb_network_offset(skb));
1410         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1411                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1412                 *tail_skb = tmp_skb;
1413                 tail_skb = &(tmp_skb->next);
1414                 skb->len += tmp_skb->len;
1415                 skb->data_len += tmp_skb->len;
1416                 skb->truesize += tmp_skb->truesize;
1417                 __sock_put(tmp_skb->sk);
1418                 tmp_skb->destructor = NULL;
1419                 tmp_skb->sk = NULL;
1420         }
1421
1422         ipv6_addr_copy(final_dst, &fl->fl6_dst);
1423         __skb_pull(skb, skb_network_header_len(skb));
1424         if (opt && opt->opt_flen)
1425                 ipv6_push_frag_opts(skb, opt, &proto);
1426         if (opt && opt->opt_nflen)
1427                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1428
1429         skb_push(skb, sizeof(struct ipv6hdr));
1430         skb_reset_network_header(skb);
1431         hdr = ipv6_hdr(skb);
1432
1433         *(__be32*)hdr = fl->fl6_flowlabel |
1434                      htonl(0x60000000 | ((int)np->cork.tclass << 20));
1435
1436         hdr->hop_limit = np->cork.hop_limit;
1437         hdr->nexthdr = proto;
1438         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1439         ipv6_addr_copy(&hdr->daddr, final_dst);
1440
1441         skb->priority = sk->sk_priority;
1442
1443         skb->dst = dst_clone(&rt->u.dst);
1444         IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
1445         if (proto == IPPROTO_ICMPV6) {
1446                 struct inet6_dev *idev = ip6_dst_idev(skb->dst);
1447
1448                 ICMP6MSGOUT_INC_STATS_BH(idev, icmp6_hdr(skb)->icmp6_type);
1449                 ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS);
1450         }
1451
1452         err = ip6_local_out(skb);
1453         if (err) {
1454                 if (err > 0)
1455                         err = np->recverr ? net_xmit_errno(err) : 0;
1456                 if (err)
1457                         goto error;
1458         }
1459
1460 out:
1461         ip6_cork_release(inet, np);
1462         return err;
1463 error:
1464         goto out;
1465 }
1466
1467 void ip6_flush_pending_frames(struct sock *sk)
1468 {
1469         struct sk_buff *skb;
1470
1471         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1472                 if (skb->dst)
1473                         IP6_INC_STATS(ip6_dst_idev(skb->dst),
1474                                       IPSTATS_MIB_OUTDISCARDS);
1475                 kfree_skb(skb);
1476         }
1477
1478         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1479 }