/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	$Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/config.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

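/*
 * Choose the identification for a new fragment header. A single global
 * counter is shared by all flows; the spinlock makes the read/increment
 * pair atomic, and zero is skipped so the counter wraps back to 1.
 */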
static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
{
        static u32 ipv6_fragmentation_id = 1;
        static DEFINE_SPINLOCK(ip6_id_lock);

        spin_lock_bh(&ip6_id_lock);
        fhdr->identification = htonl(ipv6_fragmentation_id);
        if (++ipv6_fragmentation_id == 0)
                ipv6_fragmentation_id = 1;
        spin_unlock_bh(&ip6_id_lock);
}

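/*
 * Final step of output: prepend the link-layer header, either from the
 * cached hardware header (hh_cache) or via the neighbour's output
 * method, and hand the skb to the device layer.
 */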
static inline int ip6_output_finish(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct hh_cache *hh = dst->hh;

        if (hh) {
                int hh_alen;

                read_lock_bh(&hh->hh_lock);
                hh_alen = HH_DATA_ALIGN(hh->hh_len);
                memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
                read_unlock_bh(&hh->hh_lock);
                skb_push(skb, hh->hh_len);
                return hh->hh_output(skb);
        } else if (dst->neighbour)
                return dst->neighbour->output(skb);

        IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EINVAL;
}

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
        newskb->mac.raw = newskb->data;
        __skb_pull(newskb, newskb->nh.raw - newskb->data);
        newskb->pkt_type = PACKET_LOOPBACK;
        newskb->ip_summed = CHECKSUM_UNNECESSARY;
        BUG_TRAP(newskb->dst);

        netif_rx(newskb);
        return 0;
}

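/*
 * Device output for a routed packet. For multicast destinations a
 * clone may first be looped back to local listeners (unless the
 * sending socket cleared IPV6_MULTICAST_LOOP); everything then passes
 * the NF_IP6_POST_ROUTING hook.
 */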
static int ip6_output2(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct net_device *dev = dst->dev;

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
                struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;

                if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
                    ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
                                &skb->nh.ipv6h->saddr)) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

                        /* Do not check for IFF_ALLMULTI; multicast routing
                           is not supported in any case.
                         */
                        if (newskb)
                                NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
                                        newskb->dev,
                                        ip6_dev_loopback_xmit);

                        if (skb->nh.ipv6h->hop_limit == 0) {
                                IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return 0;
                        }
                }

                IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
        }

        return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dev,
                       ip6_output_finish);
}

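/*
 * Entry point used as dst_output(): fragment when the packet exceeds
 * the path MTU, or when the route requires a fragment header on every
 * packet (dst_allfrag); otherwise send directly via ip6_output2().
 */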
int ip6_output(struct sk_buff *skb)
{
        if (skb->len > dst_mtu(skb->dst) || dst_allfrag(skb->dst))
                return ip6_fragment(skb, ip6_output2);
        else
                return ip6_output2(skb);
}

/*
 *	xmit an sk_buff (used by TCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
             struct ipv6_txoptions *opt, int ipfragok)
{
        struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
        struct in6_addr *first_hop = &fl->fl6_dst;
        struct dst_entry *dst = skb->dst;
        struct ipv6hdr *hdr;
        u8  proto = fl->proto;
        int seg_len = skb->len;
        int hlimit;
        u32 mtu;

        if (opt) {
                int head_room;

                /* First: exthdrs may take lots of space (~8K for now);
                   MAX_HEADER is not enough.
                 */
                head_room = opt->opt_nflen + opt->opt_flen;
                seg_len += head_room;
                head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

                if (skb_headroom(skb) < head_room) {
                        struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
                        kfree_skb(skb);
                        skb = skb2;
                        if (skb == NULL) {
                                IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                                return -ENOBUFS;
                        }
                        if (sk)
                                skb_set_owner_w(skb, sk);
                }
                if (opt->opt_flen)
                        ipv6_push_frag_opts(skb, opt, &proto);
                if (opt->opt_nflen)
                        ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
        }

        hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));

        /*
         *      Fill in the IPv6 header
         */

        *(u32*)hdr = htonl(0x60000000) | fl->fl6_flowlabel;
        hlimit = -1;
        if (np)
                hlimit = np->hop_limit;
        if (hlimit < 0)
                hlimit = dst_metric(dst, RTAX_HOPLIMIT);
        if (hlimit < 0)
                hlimit = ipv6_get_hoplimit(dst->dev);

        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
        hdr->hop_limit = hlimit;

        ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
        ipv6_addr_copy(&hdr->daddr, first_hop);

        mtu = dst_mtu(dst);
        if ((skb->len <= mtu) || ipfragok) {
                IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
                return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev,
                               dst_output);
        }

        if (net_ratelimit())
                printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
        skb->dev = dst->dev;
        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
        IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
}

/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It is code duplication, but I really want to avoid
 *	extra checks, since ipv6_build_header is used by TCP (which
 *	is performance critical for us)
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
               struct in6_addr *saddr, struct in6_addr *daddr,
               int proto, int len)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct ipv6hdr *hdr;
        int totlen;

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        totlen = len + sizeof(struct ipv6hdr);

        hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
        skb->nh.ipv6h = hdr;

        *(u32*)hdr = htonl(0x60000000);

        hdr->payload_len = htons(len);
        hdr->nexthdr = proto;
        hdr->hop_limit = np->hop_limit;

        ipv6_addr_copy(&hdr->saddr, saddr);
        ipv6_addr_copy(&hdr->daddr, daddr);

        return 0;
}

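/*
 * Deliver the packet to every raw socket that registered (with
 * IPV6_ROUTER_ALERT) for this Router Alert value. Intermediate
 * matches receive a clone; the last one consumes the original skb.
 * Returns 1 if some socket took the packet, 0 otherwise.
 */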
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
        struct ip6_ra_chain *ra;
        struct sock *last = NULL;

        read_lock(&ip6_ra_lock);
        for (ra = ip6_ra_chain; ra; ra = ra->next) {
                struct sock *sk = ra->sk;
                if (sk && ra->sel == sel) {
                        if (last) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        rawv6_rcv(last, skb2);
                        }
                        last = sk;
                }
        }

        if (last) {
                rawv6_rcv(last, skb);
                read_unlock(&ip6_ra_lock);
                return 1;
        }
        read_unlock(&ip6_ra_lock);
        return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
        return dst_output(skb);
}

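/*
 * Forwarding path: check that forwarding is enabled and permitted by
 * policy, hand Router Alert packets to interested sockets, enforce the
 * hop limit and path MTU, possibly emit a redirect, then decrement
 * hop_limit and pass the packet through the NF_IP6_FORWARD hook.
 */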
int ip6_forward(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct ipv6hdr *hdr = skb->nh.ipv6h;
        struct inet6_skb_parm *opt = IP6CB(skb);

        if (ipv6_devconf.forwarding == 0)
                goto error;

        if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
                IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        skb->ip_summed = CHECKSUM_NONE;

        /*
         *      We DO NOT make any processing on
         *      RA packets, pushing them to user level AS IS
         *      without any warranty that the application will be able
         *      to interpret them. The reason is that we
         *      cannot make anything clever here.
         *
         *      We are not an end node, so if the packet contains
         *      AH/ESP, we cannot do anything.
         *      Defragmentation also would be a mistake; RA packets
         *      cannot be fragmented, because there is no warranty
         *      that different fragments will go along one path. --ANK
         */
        if (opt->ra) {
                u8 *ptr = skb->nh.raw + opt->ra;
                if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
                        return 0;
        }

        /*
         *      check and decrement hop limit
         */
        if (hdr->hop_limit <= 1) {
                /* Force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
                            0, skb->dev);

                kfree_skb(skb);
                return -ETIMEDOUT;
        }

        if (!xfrm6_route_forward(skb)) {
                IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
                goto drop;
        }
        dst = skb->dst;

        /* IPv6 specs say nothing about it, but it is clear that we cannot
           send redirects to source routed frames.
         */
        if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
                struct in6_addr *target = NULL;
                struct rt6_info *rt;
                struct neighbour *n = dst->neighbour;

                /*
                 *      incoming and outgoing devices are the same
                 *      send a redirect.
                 */

                rt = (struct rt6_info *) dst;
                if ((rt->rt6i_flags & RTF_GATEWAY))
                        target = (struct in6_addr*)&n->primary_key;
                else
                        target = &hdr->daddr;

                /* Limit redirects both by destination (here)
                   and by source (inside ndisc_send_redirect)
                 */
                if (xrlim_allow(dst, 1*HZ))
                        ndisc_send_redirect(skb, n, target);
        } else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
                                                |IPV6_ADDR_LINKLOCAL)) {
                /* This check is security critical. */
                goto error;
        }

        if (skb->len > dst_mtu(dst)) {
                /* Again, force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
                IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
                IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (skb_cow(skb, dst->dev->hard_header_len)) {
                IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }

        hdr = skb->nh.ipv6h;

        /* Mangling the hop limit is delayed until after the skb COW */

        hdr->hop_limit--;

        IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
        return NF_HOOK(PF_INET6, NF_IP6_FORWARD, skb, skb->dev, dst->dev,
                       ip6_forward_finish);

error:
        IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
drop:
        kfree_skb(skb);
        return -EINVAL;
}

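/*
 * Copy per-packet metadata (packet type, priority, route, netfilter and
 * conntrack state) from the original skb to a freshly built fragment.
 */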
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        dst_release(to->dst);
        to->dst = dst_clone(from->dst);
        to->dev = from->dev;

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
#ifdef CONFIG_NETFILTER
        to->nfmark = from->nfmark;
        /* Connection association is same as pre-frag packet */
        to->nfct = from->nfct;
        nf_conntrack_get(to->nfct);
        to->nfctinfo = from->nfctinfo;
#ifdef CONFIG_BRIDGE_NETFILTER
        nf_bridge_put(to->nf_bridge);
        to->nf_bridge = from->nf_bridge;
        nf_bridge_get(to->nf_bridge);
#endif
#endif
}

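/*
 * Walk the extension header chain to find the offset at which a
 * fragment header has to be inserted: past the unfragmentable part
 * (hop-by-hop, routing, and any destination options header that
 * precedes a routing header). *nexthdr is left pointing at the
 * nexthdr byte that must be patched to NEXTHDR_FRAGMENT.
 */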
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
        u16 offset = sizeof(struct ipv6hdr);
        struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
        unsigned int packet_len = skb->tail - skb->nh.raw;
        int found_rhdr = 0;
        *nexthdr = &skb->nh.ipv6h->nexthdr;

        while (offset + 1 <= packet_len) {
                switch (**nexthdr) {
                case NEXTHDR_HOP:
                case NEXTHDR_ROUTING:
                case NEXTHDR_DEST:
                        if (**nexthdr == NEXTHDR_ROUTING)
                                found_rhdr = 1;
                        if (**nexthdr == NEXTHDR_DEST && found_rhdr)
                                return offset;
                        offset += ipv6_optlen(exthdr);
                        *nexthdr = &exthdr->nexthdr;
                        exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
                        break;
                default:
                        return offset;
                }
        }

        return offset;
}

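/*
 * Fragment a packet that exceeds the path MTU. The fast path reuses an
 * existing frag_list when every chunk already has fragment-sized
 * geometry and enough headroom; otherwise the slow path copies the
 * payload into freshly allocated skbs, one per fragment.
 */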
static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
        struct net_device *dev;
        struct sk_buff *frag;
        struct rt6_info *rt = (struct rt6_info*)skb->dst;
        struct ipv6hdr *tmp_hdr;
        struct frag_hdr *fh;
        unsigned int mtu, hlen, left, len;
        u32 frag_id = 0;
        int ptr, offset = 0, err = 0;
        u8 *prevhdr, nexthdr = 0;

        dev = rt->u.dst.dev;
        hlen = ip6_find_1stfragopt(skb, &prevhdr);
        nexthdr = *prevhdr;

        mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);

        if (skb_shinfo(skb)->frag_list) {
                int first_len = skb_pagelen(skb);

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    skb_cloned(skb))
                        goto slow_path;

                for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < hlen)
                                goto slow_path;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                sock_hold(skb->sk);
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                                skb->truesize -= frag->truesize;
                        }
                }

                err = 0;
                offset = 0;
                frag = skb_shinfo(skb)->frag_list;
                skb_shinfo(skb)->frag_list = NULL;
                /* BUILD HEADER */

                tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
                if (!tmp_hdr) {
                        IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                        return -ENOMEM;
                }

                *prevhdr = NEXTHDR_FRAGMENT;
                memcpy(tmp_hdr, skb->nh.raw, hlen);
                __skb_pull(skb, hlen);
                fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
                skb->nh.raw = __skb_push(skb, hlen);
                memcpy(skb->nh.raw, tmp_hdr, hlen);

                ipv6_select_ident(skb, fh);
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->frag_off = htons(IP6_MF);
                frag_id = fh->identification;

                first_len = skb_pagelen(skb);
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));

                for (;;) {
                        /* Prepare header of the next frame,
                         * before previous one goes down. */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                frag->h.raw = frag->data;
                                fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
                                frag->nh.raw = __skb_push(frag, hlen);
                                memcpy(frag->nh.raw, tmp_hdr, hlen);
                                offset += skb->len - hlen - sizeof(struct frag_hdr);
                                fh->nexthdr = nexthdr;
                                fh->reserved = 0;
                                fh->frag_off = htons(offset);
                                if (frag->next != NULL)
                                        fh->frag_off |= htons(IP6_MF);
                                fh->identification = frag_id;
                                frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
                                ip6_copy_metadata(frag, skb);
                        }

                        err = output(skb);
                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                kfree(tmp_hdr);

                if (err == 0) {
                        IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
                        return 0;
                }

                while (frag) {
                        skb = frag->next;
                        kfree_skb(frag);
                        frag = skb;
                }

                IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                return err;
        }

slow_path:
        left = skb->len - hlen;         /* Space per frame */
        ptr = hlen;                     /* Where to start from */

        /*
         *      Fragment the datagram.
         */

        *prevhdr = NEXTHDR_FRAGMENT;

        /*
         *      Keep copying data until we run out.
         */
        while (left > 0) {
                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end
                   then align the next start on an eight byte boundary */
                if (len < left) {
                        len &= ~7;
                }
                /*
                 *      Allocate buffer.
                 */

                if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
                        NETDEBUG(printk(KERN_INFO "IPv6: frag: no memory for new fragment!\n"));
                        IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                ip6_copy_metadata(frag, skb);
                skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
                skb_put(frag, len + hlen + sizeof(struct frag_hdr));
                frag->nh.raw = frag->data;
                fh = (struct frag_hdr*)(frag->data + hlen);
                frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */
                if (skb->sk)
                        skb_set_owner_w(frag, skb->sk);

                /*
                 *      Copy the packet header into the new buffer.
                 */
                memcpy(frag->nh.raw, skb->data, hlen);

                /*
                 *      Build fragment header.
                 */
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                if (!frag_id) {
                        ipv6_select_ident(skb, fh);
                        frag_id = fh->identification;
                } else
                        fh->identification = frag_id;

                /*
                 *      Copy a block of the IP datagram.
                 */
                if (skb_copy_bits(skb, ptr, frag->h.raw, len))
                        BUG();
                left -= len;

                fh->frag_off = htons(offset);
                if (left > 0)
                        fh->frag_off |= htons(IP6_MF);
                frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */

                IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);

                err = output(frag);
                if (err)
                        goto fail;
        }
        kfree_skb(skb);
        IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
        return err;

fail:
        kfree_skb(skb);
        IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
        return err;
}

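/*
 * Look up (or revalidate) the route for a flow. A route cached on the
 * socket is reused only while it still matches the destination and
 * outgoing interface; the source address is filled in from the route
 * when the caller left it unspecified.
 */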
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
        int err = 0;

        *dst = NULL;
        if (sk) {
                struct ipv6_pinfo *np = inet6_sk(sk);

                *dst = sk_dst_check(sk, np->dst_cookie);
                if (*dst) {
                        struct rt6_info *rt = (struct rt6_info*)*dst;

                        /* Yes, checking route validity in the not connected
                           case is not very simple. Take into account
                           that we do not support routing by source, TOS,
                           and MSG_DONTROUTE            --ANK (980726)

                           1. If route was host route, check that
                              cached destination is current.
                              If it is network route, we still may
                              check its validity using saved pointer
                              to the last used address: daddr_cache.
                              We do not want to save whole address now,
                              (because main consumer of this service
                               is tcp, which does not have this problem),
                              so that the last trick works only on connected
                              sockets.
                           2. oif also should be the same.
                         */

                        if (((rt->rt6i_dst.plen != 128 ||
                              !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
                             && (np->daddr_cache == NULL ||
                                 !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
                            || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
                                dst_release(*dst);
                                *dst = NULL;
                        }
                }
        }

        if (*dst == NULL)
                *dst = ip6_route_output(sk, fl);

        if ((err = (*dst)->error))
                goto out_err_release;

        if (ipv6_addr_any(&fl->fl6_src)) {
                err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);

                if (err)
                        goto out_err_release;
        }

        return 0;

out_err_release:
        dst_release(*dst);
        *dst = NULL;
        return err;
}

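/*
 * Queue data on the socket's write queue until the caller pushes or
 * flushes the pending frames. The first call sets up the cork state
 * (options, route, flow); later calls keep appending, growing the tail
 * skb or starting a new fragment-sized one as needed.
 */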
int ip6_append_data(struct sock *sk,
                    int getfrag(void *from, char *to, int offset, int len,
                                int odd, struct sk_buff *skb),
                    void *from, int length, int transhdrlen,
                    int hlimit, struct ipv6_txoptions *opt, struct flowi *fl,
                    struct rt6_info *rt, unsigned int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct sk_buff *skb;
        unsigned int maxfraglen, fragheaderlen;
        int exthdrlen;
        int hh_len;
        int mtu;
        int copy;
        int err;
        int offset = 0;
        int csummode = CHECKSUM_NONE;

        if (flags&MSG_PROBE)
                return 0;
        if (skb_queue_empty(&sk->sk_write_queue)) {
                /*
                 * setup for corking
                 */
                if (opt) {
                        if (np->cork.opt == NULL) {
                                np->cork.opt = kmalloc(opt->tot_len,
                                                       sk->sk_allocation);
                                if (unlikely(np->cork.opt == NULL))
                                        return -ENOBUFS;
                        } else if (np->cork.opt->tot_len < opt->tot_len) {
                                printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
                                return -EINVAL;
                        }
                        memcpy(np->cork.opt, opt, opt->tot_len);
                        inet->cork.flags |= IPCORK_OPT;
                        /* need source address above miyazawa */
                }
                dst_hold(&rt->u.dst);
                np->cork.rt = rt;
                inet->cork.fl = *fl;
                np->cork.hop_limit = hlimit;
                inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
                if (dst_allfrag(rt->u.dst.path))
                        inet->cork.flags |= IPCORK_ALLFRAG;
                inet->cork.length = 0;
                sk->sk_sndmsg_page = NULL;
                sk->sk_sndmsg_off = 0;
                exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
                length += exthdrlen;
                transhdrlen += exthdrlen;
        } else {
                rt = np->cork.rt;
                fl = &inet->cork.fl;
                if (inet->cork.flags & IPCORK_OPT)
                        opt = np->cork.opt;
                transhdrlen = 0;
                exthdrlen = 0;
                mtu = inet->cork.fragsize;
        }

        hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

        fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

        if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
                if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
                        ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
                        return -EMSGSIZE;
                }
        }

        /*
         * Let's try using as much space as possible.
         * Use MTU if total length of the message fits into the MTU.
         * Otherwise, we need to reserve fragment header and
         * fragment alignment (= 8-15 octets, in total).
         *
         * Note that we may need to "move" the data from the tail
         * of the buffer to the new fragment when we split
         * the message.
         *
         * FIXME: It may be fragmented into multiple chunks
         *        at once if non-fragmentable extension headers
         *        are too large.
         * --yoshfuji
         */

        inet->cork.length += length;

        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                goto alloc_new_skb;

        while (length > 0) {
                /* Check if the remaining data fits into current packet. */
                copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
                if (copy < length)
                        copy = maxfraglen - skb->len;

                if (copy <= 0) {
                        char *data;
                        unsigned int datalen;
                        unsigned int fraglen;
                        unsigned int fraggap;
                        unsigned int alloclen;
                        struct sk_buff *skb_prev;
alloc_new_skb:
                        skb_prev = skb;

                        /* There's no room in the current skb */
                        if (skb_prev)
                                fraggap = skb_prev->len - maxfraglen;
                        else
                                fraggap = 0;

                        /*
                         * If remaining data exceeds the mtu,
                         * we know we need more fragment(s).
                         */
                        datalen = length + fraggap;
                        if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
                                datalen = maxfraglen - fragheaderlen;

                        fraglen = datalen + fragheaderlen;
                        if ((flags & MSG_MORE) &&
                            !(rt->u.dst.dev->features&NETIF_F_SG))
                                alloclen = mtu;
                        else
                                alloclen = datalen + fragheaderlen;

                        /*
                         * The last fragment gets additional space at tail.
                         * Note: we overallocate on fragments with MSG_MORE
                         * because we have no idea if we're the last one.
                         */
                        if (datalen == length + fraggap)
                                alloclen += rt->u.dst.trailer_len;

                        /*
                         * We just reserve space for fragment header.
                         * Note: this may be overallocation if the message
                         * (without MSG_MORE) fits into the MTU.
                         */
                        alloclen += sizeof(struct frag_hdr);

                        if (transhdrlen) {
                                skb = sock_alloc_send_skb(sk,
                                                alloclen + hh_len,
                                                (flags & MSG_DONTWAIT), &err);
                        } else {
                                skb = NULL;
                                if (atomic_read(&sk->sk_wmem_alloc) <=
                                    2 * sk->sk_sndbuf)
                                        skb = sock_wmalloc(sk,
                                                           alloclen + hh_len, 1,
                                                           sk->sk_allocation);
                                if (unlikely(skb == NULL))
                                        err = -ENOBUFS;
                        }
                        if (skb == NULL)
                                goto error;
                        /*
                         *      Fill in the control structures
                         */
                        skb->ip_summed = csummode;
                        skb->csum = 0;
                        /* reserve for fragmentation */
                        skb_reserve(skb, hh_len+sizeof(struct frag_hdr));

                        /*
                         *      Find where to start putting bytes
                         */
                        data = skb_put(skb, fraglen);
                        skb->nh.raw = data + exthdrlen;
                        data += fragheaderlen;
                        skb->h.raw = data + exthdrlen;

                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(
                                        skb_prev, maxfraglen,
                                        data + transhdrlen, fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                data += fraggap;
                                skb_trim(skb_prev, maxfraglen);
                        }
                        copy = datalen - transhdrlen - fraggap;
                        if (copy < 0) {
                                err = -EINVAL;
                                kfree_skb(skb);
                                goto error;
                        } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
                                goto error;
                        }

                        offset += copy;
                        length -= datalen - fraggap;
                        transhdrlen = 0;
                        exthdrlen = 0;
                        csummode = CHECKSUM_NONE;

                        /*
                         * Put the packet on the pending queue
                         */
                        __skb_queue_tail(&sk->sk_write_queue, skb);
                        continue;
                }

                if (copy > length)
                        copy = length;

                if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
                        unsigned int off;

                        off = skb->len;
                        if (getfrag(from, skb_put(skb, copy),
                                                offset, copy, off, skb) < 0) {
                                __skb_trim(skb, off);
                                err = -EFAULT;
                                goto error;
                        }
                } else {
                        int i = skb_shinfo(skb)->nr_frags;
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
                        struct page *page = sk->sk_sndmsg_page;
                        int off = sk->sk_sndmsg_off;
                        unsigned int left;

                        if (page && (left = PAGE_SIZE - off) > 0) {
                                if (copy >= left)
                                        copy = left;
                                if (page != frag->page) {
                                        if (i == MAX_SKB_FRAGS) {
                                                err = -EMSGSIZE;
                                                goto error;
                                        }
                                        get_page(page);
                                        skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
                                        frag = &skb_shinfo(skb)->frags[i];
                                }
                        } else if (i < MAX_SKB_FRAGS) {
                                if (copy > PAGE_SIZE)
                                        copy = PAGE_SIZE;
                                page = alloc_pages(sk->sk_allocation, 0);
                                if (page == NULL) {
                                        err = -ENOMEM;
                                        goto error;
                                }
                                sk->sk_sndmsg_page = page;
                                sk->sk_sndmsg_off = 0;

                                skb_fill_page_desc(skb, i, page, 0, 0);
                                frag = &skb_shinfo(skb)->frags[i];
                                skb->truesize += PAGE_SIZE;
                                atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
                        } else {
                                err = -EMSGSIZE;
                                goto error;
                        }
                        if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
                                err = -EFAULT;
                                goto error;
                        }
                        sk->sk_sndmsg_off += copy;
                        frag->size += copy;
                        skb->len += copy;
                        skb->data_len += copy;
                }
                offset += copy;
                length -= copy;
        }
        return 0;
error:
        inet->cork.length -= length;
        IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
        return err;
}

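/*
 * Merge all queued skbs into one packet, build the IPv6 header and the
 * corked extension headers on top of it, send it through the
 * NF_IP6_LOCAL_OUT hook, and then drop the cork state.
 */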
int ip6_push_pending_frames(struct sock *sk)
{
        struct sk_buff *skb, *tmp_skb;
        struct sk_buff **tail_skb;
        struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct ipv6hdr *hdr;
        struct ipv6_txoptions *opt = np->cork.opt;
        struct rt6_info *rt = np->cork.rt;
        struct flowi *fl = &inet->cork.fl;
        unsigned char proto = fl->proto;
        int err = 0;

        if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
                goto out;
        tail_skb = &(skb_shinfo(skb)->frag_list);

        /* move skb->data to ip header from ext header */
        if (skb->data < skb->nh.raw)
                __skb_pull(skb, skb->nh.raw - skb->data);
        while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
                __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
                *tail_skb = tmp_skb;
                tail_skb = &(tmp_skb->next);
                skb->len += tmp_skb->len;
                skb->data_len += tmp_skb->len;
                skb->truesize += tmp_skb->truesize;
                __sock_put(tmp_skb->sk);
                tmp_skb->destructor = NULL;
                tmp_skb->sk = NULL;
        }

        ipv6_addr_copy(final_dst, &fl->fl6_dst);
        __skb_pull(skb, skb->h.raw - skb->nh.raw);
        if (opt && opt->opt_flen)
                ipv6_push_frag_opts(skb, opt, &proto);
        if (opt && opt->opt_nflen)
                ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

        skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));

        *(u32*)hdr = fl->fl6_flowlabel | htonl(0x60000000);

        if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
                hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
        else
                hdr->payload_len = 0;
        hdr->hop_limit = np->cork.hop_limit;
        hdr->nexthdr = proto;
        ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
        ipv6_addr_copy(&hdr->daddr, final_dst);

        skb->dst = dst_clone(&rt->u.dst);
        IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
        err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
        if (err) {
                if (err > 0)
                        err = np->recverr ? net_xmit_errno(err) : 0;
                if (err)
                        goto error;
        }

out:
        inet->cork.flags &= ~IPCORK_OPT;
        if (np->cork.opt) {
                kfree(np->cork.opt);
                np->cork.opt = NULL;
        }
        if (np->cork.rt) {
                dst_release(&np->cork.rt->u.dst);
                np->cork.rt = NULL;
                inet->cork.flags &= ~IPCORK_ALLFRAG;
        }
        memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
        return err;
error:
        goto out;
}

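/*
 * Throw away everything queued by ip6_append_data() and release the
 * cork state, e.g. after an error while building the message.
 */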
void ip6_flush_pending_frames(struct sock *sk)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct sk_buff *skb;

        while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
                IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
        }

        inet->cork.flags &= ~IPCORK_OPT;

        if (np->cork.opt) {
                kfree(np->cork.opt);
                np->cork.opt = NULL;
        }
        if (np->cork.rt) {
                dst_release(&np->cork.rt->u.dst);
                np->cork.rt = NULL;
                inet->cork.flags &= ~IPCORK_ALLFRAG;
        }
        memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}