2 * IPv6 output functions
3 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * $Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
10 * Based on linux/net/ipv4/ip_output.c
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
19 * extension headers are implemented.
20 * route changes now work.
21 * ip6_forward does not confuse sniffers.
24 * H. von Brand : Added missing #include <linux/string.h>
25 * Imran Patel : frag id should be in NBO
26 * Kazunori MIYAZAWA @USAGI
27 * : add ip6_append_data and related functions
31 #include <linux/config.h>
32 #include <linux/errno.h>
33 #include <linux/types.h>
34 #include <linux/string.h>
35 #include <linux/socket.h>
36 #include <linux/net.h>
37 #include <linux/netdevice.h>
38 #include <linux/if_arp.h>
39 #include <linux/in6.h>
40 #include <linux/tcp.h>
41 #include <linux/route.h>
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
57 #include <net/checksum.h>
59 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
61 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
63 static u32 ipv6_fragmentation_id = 1;
64 static DEFINE_SPINLOCK(ip6_id_lock);
66 spin_lock_bh(&ip6_id_lock);
67 fhdr->identification = htonl(ipv6_fragmentation_id);
68 if (++ipv6_fragmentation_id == 0)
69 ipv6_fragmentation_id = 1;
70 spin_unlock_bh(&ip6_id_lock);
73 static inline int ip6_output_finish(struct sk_buff *skb)
76 struct dst_entry *dst = skb->dst;
77 struct hh_cache *hh = dst->hh;
82 read_lock_bh(&hh->hh_lock);
83 hh_alen = HH_DATA_ALIGN(hh->hh_len);
84 memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
85 read_unlock_bh(&hh->hh_lock);
86 skb_push(skb, hh->hh_len);
87 return hh->hh_output(skb);
88 } else if (dst->neighbour)
89 return dst->neighbour->output(skb);
91 IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
97 /* dev_loopback_xmit for use with netfilter. */
98 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
100 newskb->mac.raw = newskb->data;
101 __skb_pull(newskb, newskb->nh.raw - newskb->data);
102 newskb->pkt_type = PACKET_LOOPBACK;
103 newskb->ip_summed = CHECKSUM_UNNECESSARY;
104 BUG_TRAP(newskb->dst);
111 static int ip6_output2(struct sk_buff *skb)
113 struct dst_entry *dst = skb->dst;
114 struct net_device *dev = dst->dev;
116 skb->protocol = htons(ETH_P_IPV6);
119 if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
120 struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
122 if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
123 ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
124 &skb->nh.ipv6h->saddr)) {
125 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
127 /* Do not check for IFF_ALLMULTI; multicast routing
128 is not supported in any case.
131 NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
133 ip6_dev_loopback_xmit);
135 if (skb->nh.ipv6h->hop_limit == 0) {
136 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
142 IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
145 return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb,NULL, skb->dev,ip6_output_finish);
148 int ip6_output(struct sk_buff *skb)
150 if (skb->len > dst_mtu(skb->dst) || dst_allfrag(skb->dst))
151 return ip6_fragment(skb, ip6_output2);
153 return ip6_output2(skb);
157 * xmit an sk_buff (used by TCP)
160 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
161 struct ipv6_txoptions *opt, int ipfragok)
163 struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
164 struct in6_addr *first_hop = &fl->fl6_dst;
165 struct dst_entry *dst = skb->dst;
167 u8 proto = fl->proto;
168 int seg_len = skb->len;
175 /* First: exthdrs may take lots of space (~8K for now)
176 MAX_HEADER is not enough.
178 head_room = opt->opt_nflen + opt->opt_flen;
179 seg_len += head_room;
180 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
182 if (skb_headroom(skb) < head_room) {
183 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
187 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
191 skb_set_owner_w(skb, sk);
194 ipv6_push_frag_opts(skb, opt, &proto);
196 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
199 hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));
202 * Fill in the IPv6 header
205 *(u32*)hdr = htonl(0x60000000) | fl->fl6_flowlabel;
208 hlimit = np->hop_limit;
210 hlimit = dst_metric(dst, RTAX_HOPLIMIT);
212 hlimit = ipv6_get_hoplimit(dst->dev);
214 hdr->payload_len = htons(seg_len);
215 hdr->nexthdr = proto;
216 hdr->hop_limit = hlimit;
218 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
219 ipv6_addr_copy(&hdr->daddr, first_hop);
222 if ((skb->len <= mtu) || ipfragok) {
223 IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
224 return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev,
229 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
231 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
232 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
238 * To avoid extra problems ND packets are send through this
239 * routine. It's code duplication but I really want to avoid
240 * extra checks since ipv6_build_header is used by TCP (which
241 * is for us performance critical)
244 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
245 struct in6_addr *saddr, struct in6_addr *daddr,
248 struct ipv6_pinfo *np = inet6_sk(sk);
252 skb->protocol = htons(ETH_P_IPV6);
255 totlen = len + sizeof(struct ipv6hdr);
257 hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
260 *(u32*)hdr = htonl(0x60000000);
262 hdr->payload_len = htons(len);
263 hdr->nexthdr = proto;
264 hdr->hop_limit = np->hop_limit;
266 ipv6_addr_copy(&hdr->saddr, saddr);
267 ipv6_addr_copy(&hdr->daddr, daddr);
272 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
274 struct ip6_ra_chain *ra;
275 struct sock *last = NULL;
277 read_lock(&ip6_ra_lock);
278 for (ra = ip6_ra_chain; ra; ra = ra->next) {
279 struct sock *sk = ra->sk;
280 if (sk && ra->sel == sel &&
281 (!sk->sk_bound_dev_if ||
282 sk->sk_bound_dev_if == skb->dev->ifindex)) {
284 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
286 rawv6_rcv(last, skb2);
293 rawv6_rcv(last, skb);
294 read_unlock(&ip6_ra_lock);
297 read_unlock(&ip6_ra_lock);
301 static inline int ip6_forward_finish(struct sk_buff *skb)
303 return dst_output(skb);
306 int ip6_forward(struct sk_buff *skb)
308 struct dst_entry *dst = skb->dst;
309 struct ipv6hdr *hdr = skb->nh.ipv6h;
310 struct inet6_skb_parm *opt = IP6CB(skb);
312 if (ipv6_devconf.forwarding == 0)
315 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
316 IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
320 skb->ip_summed = CHECKSUM_NONE;
323 * We DO NOT make any processing on
324 * RA packets, pushing them to user level AS IS
325 * without ane WARRANTY that application will be able
326 * to interpret them. The reason is that we
327 * cannot make anything clever here.
329 * We are not end-node, so that if packet contains
330 * AH/ESP, we cannot make anything.
331 * Defragmentation also would be mistake, RA packets
332 * cannot be fragmented, because there is no warranty
333 * that different fragments will go along one path. --ANK
336 u8 *ptr = skb->nh.raw + opt->ra;
337 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
342 * check and decrement ttl
344 if (hdr->hop_limit <= 1) {
345 /* Force OUTPUT device used as source address */
347 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
354 if (!xfrm6_route_forward(skb)) {
355 IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
360 /* IPv6 specs say nothing about it, but it is clear that we cannot
361 send redirects to source routed frames.
363 if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
364 struct in6_addr *target = NULL;
366 struct neighbour *n = dst->neighbour;
369 * incoming and outgoing devices are the same
373 rt = (struct rt6_info *) dst;
374 if ((rt->rt6i_flags & RTF_GATEWAY))
375 target = (struct in6_addr*)&n->primary_key;
377 target = &hdr->daddr;
379 /* Limit redirects both by destination (here)
380 and by source (inside ndisc_send_redirect)
382 if (xrlim_allow(dst, 1*HZ))
383 ndisc_send_redirect(skb, n, target);
384 } else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
385 |IPV6_ADDR_LINKLOCAL)) {
386 /* This check is security critical. */
390 if (skb->len > dst_mtu(dst)) {
391 /* Again, force OUTPUT device used as source address */
393 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
394 IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
395 IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
400 if (skb_cow(skb, dst->dev->hard_header_len)) {
401 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
407 /* Mangling hops number delayed to point after skb COW */
411 IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
412 return NF_HOOK(PF_INET6,NF_IP6_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish);
415 IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
421 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
423 to->pkt_type = from->pkt_type;
424 to->priority = from->priority;
425 to->protocol = from->protocol;
426 dst_release(to->dst);
427 to->dst = dst_clone(from->dst);
430 #ifdef CONFIG_NET_SCHED
431 to->tc_index = from->tc_index;
433 #ifdef CONFIG_NETFILTER
434 to->nfmark = from->nfmark;
435 /* Connection association is same as pre-frag packet */
436 to->nfct = from->nfct;
437 nf_conntrack_get(to->nfct);
438 to->nfctinfo = from->nfctinfo;
439 #ifdef CONFIG_BRIDGE_NETFILTER
440 nf_bridge_put(to->nf_bridge);
441 to->nf_bridge = from->nf_bridge;
442 nf_bridge_get(to->nf_bridge);
447 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
449 u16 offset = sizeof(struct ipv6hdr);
450 struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
451 unsigned int packet_len = skb->tail - skb->nh.raw;
453 *nexthdr = &skb->nh.ipv6h->nexthdr;
455 while (offset + 1 <= packet_len) {
460 case NEXTHDR_ROUTING:
462 if (**nexthdr == NEXTHDR_ROUTING) found_rhdr = 1;
463 if (**nexthdr == NEXTHDR_DEST && found_rhdr) return offset;
464 offset += ipv6_optlen(exthdr);
465 *nexthdr = &exthdr->nexthdr;
466 exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
476 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
478 struct net_device *dev;
479 struct sk_buff *frag;
480 struct rt6_info *rt = (struct rt6_info*)skb->dst;
481 struct ipv6hdr *tmp_hdr;
483 unsigned int mtu, hlen, left, len;
485 int ptr, offset = 0, err=0;
486 u8 *prevhdr, nexthdr = 0;
489 hlen = ip6_find_1stfragopt(skb, &prevhdr);
492 mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);
494 if (skb_shinfo(skb)->frag_list) {
495 int first_len = skb_pagelen(skb);
497 if (first_len - hlen > mtu ||
498 ((first_len - hlen) & 7) ||
502 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
503 /* Correct geometry. */
504 if (frag->len > mtu ||
505 ((frag->len & 7) && frag->next) ||
506 skb_headroom(frag) < hlen)
509 /* Partially cloned skb? */
510 if (skb_shared(frag))
517 frag->destructor = sock_wfree;
518 skb->truesize -= frag->truesize;
524 frag = skb_shinfo(skb)->frag_list;
525 skb_shinfo(skb)->frag_list = NULL;
528 tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
530 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
534 *prevhdr = NEXTHDR_FRAGMENT;
535 memcpy(tmp_hdr, skb->nh.raw, hlen);
536 __skb_pull(skb, hlen);
537 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
538 skb->nh.raw = __skb_push(skb, hlen);
539 memcpy(skb->nh.raw, tmp_hdr, hlen);
541 ipv6_select_ident(skb, fh);
542 fh->nexthdr = nexthdr;
544 fh->frag_off = htons(IP6_MF);
545 frag_id = fh->identification;
547 first_len = skb_pagelen(skb);
548 skb->data_len = first_len - skb_headlen(skb);
549 skb->len = first_len;
550 skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));
554 /* Prepare header of the next frame,
555 * before previous one went down. */
557 frag->ip_summed = CHECKSUM_NONE;
558 frag->h.raw = frag->data;
559 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
560 frag->nh.raw = __skb_push(frag, hlen);
561 memcpy(frag->nh.raw, tmp_hdr, hlen);
562 offset += skb->len - hlen - sizeof(struct frag_hdr);
563 fh->nexthdr = nexthdr;
565 fh->frag_off = htons(offset);
566 if (frag->next != NULL)
567 fh->frag_off |= htons(IP6_MF);
568 fh->identification = frag_id;
569 frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
570 ip6_copy_metadata(frag, skb);
586 IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
596 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
601 left = skb->len - hlen; /* Space per frame */
602 ptr = hlen; /* Where to start from */
605 * Fragment the datagram.
608 *prevhdr = NEXTHDR_FRAGMENT;
611 * Keep copying data until we run out.
615 /* IF: it doesn't fit, use 'mtu' - the data space left */
618 /* IF: we are not sending upto and including the packet end
619 then align the next start on an eight byte boundary */
627 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
628 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
629 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
635 * Set up data on packet
638 ip6_copy_metadata(frag, skb);
639 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
640 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
641 frag->nh.raw = frag->data;
642 fh = (struct frag_hdr*)(frag->data + hlen);
643 frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);
646 * Charge the memory for the fragment to any owner
650 skb_set_owner_w(frag, skb->sk);
653 * Copy the packet header into the new buffer.
655 memcpy(frag->nh.raw, skb->data, hlen);
658 * Build fragment header.
660 fh->nexthdr = nexthdr;
663 ipv6_select_ident(skb, fh);
664 frag_id = fh->identification;
666 fh->identification = frag_id;
669 * Copy a block of the IP datagram.
671 if (skb_copy_bits(skb, ptr, frag->h.raw, len))
675 fh->frag_off = htons(offset);
677 fh->frag_off |= htons(IP6_MF);
678 frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
684 * Put this fragment into the sending queue.
687 IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);
694 IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
699 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
703 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
709 struct ipv6_pinfo *np = inet6_sk(sk);
711 *dst = sk_dst_check(sk, np->dst_cookie);
713 struct rt6_info *rt = (struct rt6_info*)*dst;
715 /* Yes, checking route validity in not connected
716 case is not very simple. Take into account,
717 that we do not support routing by source, TOS,
718 and MSG_DONTROUTE --ANK (980726)
720 1. If route was host route, check that
721 cached destination is current.
722 If it is network route, we still may
723 check its validity using saved pointer
724 to the last used address: daddr_cache.
725 We do not want to save whole address now,
726 (because main consumer of this service
727 is tcp, which has not this problem),
728 so that the last trick works only on connected
730 2. oif also should be the same.
733 if (((rt->rt6i_dst.plen != 128 ||
734 !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
735 && (np->daddr_cache == NULL ||
736 !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
737 || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
745 *dst = ip6_route_output(sk, fl);
747 if ((err = (*dst)->error))
748 goto out_err_release;
750 if (ipv6_addr_any(&fl->fl6_src)) {
751 err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);
754 goto out_err_release;
765 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb),
766 void *from, int length, int transhdrlen,
767 int hlimit, struct ipv6_txoptions *opt, struct flowi *fl, struct rt6_info *rt,
770 struct inet_sock *inet = inet_sk(sk);
771 struct ipv6_pinfo *np = inet6_sk(sk);
773 unsigned int maxfraglen, fragheaderlen;
780 int csummode = CHECKSUM_NONE;
784 if (skb_queue_empty(&sk->sk_write_queue)) {
789 if (np->cork.opt == NULL) {
790 np->cork.opt = kmalloc(opt->tot_len,
792 if (unlikely(np->cork.opt == NULL))
794 } else if (np->cork.opt->tot_len < opt->tot_len) {
795 printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
798 memcpy(np->cork.opt, opt, opt->tot_len);
799 inet->cork.flags |= IPCORK_OPT;
800 /* need source address above miyazawa*/
802 dst_hold(&rt->u.dst);
805 np->cork.hop_limit = hlimit;
806 inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
807 if (dst_allfrag(rt->u.dst.path))
808 inet->cork.flags |= IPCORK_ALLFRAG;
809 inet->cork.length = 0;
810 sk->sk_sndmsg_page = NULL;
811 sk->sk_sndmsg_off = 0;
812 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
814 transhdrlen += exthdrlen;
818 if (inet->cork.flags & IPCORK_OPT)
822 mtu = inet->cork.fragsize;
825 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
827 fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
828 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
830 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
831 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
832 ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
838 * Let's try using as much space as possible.
839 * Use MTU if total length of the message fits into the MTU.
840 * Otherwise, we need to reserve fragment header and
841 * fragment alignment (= 8-15 octects, in total).
843 * Note that we may need to "move" the data from the tail of
844 * of the buffer to the new fragment when we split
847 * FIXME: It may be fragmented into multiple chunks
848 * at once if non-fragmentable extension headers
853 inet->cork.length += length;
855 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
859 /* Check if the remaining data fits into current packet. */
860 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
862 copy = maxfraglen - skb->len;
866 unsigned int datalen;
867 unsigned int fraglen;
868 unsigned int fraggap;
869 unsigned int alloclen;
870 struct sk_buff *skb_prev;
874 /* There's no room in the current skb */
876 fraggap = skb_prev->len - maxfraglen;
881 * If remaining data exceeds the mtu,
882 * we know we need more fragment(s).
884 datalen = length + fraggap;
885 if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
886 datalen = maxfraglen - fragheaderlen;
888 fraglen = datalen + fragheaderlen;
889 if ((flags & MSG_MORE) &&
890 !(rt->u.dst.dev->features&NETIF_F_SG))
893 alloclen = datalen + fragheaderlen;
896 * The last fragment gets additional space at tail.
897 * Note: we overallocate on fragments with MSG_MODE
898 * because we have no idea if we're the last one.
900 if (datalen == length + fraggap)
901 alloclen += rt->u.dst.trailer_len;
904 * We just reserve space for fragment header.
905 * Note: this may be overallocation if the message
906 * (without MSG_MORE) fits into the MTU.
908 alloclen += sizeof(struct frag_hdr);
911 skb = sock_alloc_send_skb(sk,
913 (flags & MSG_DONTWAIT), &err);
916 if (atomic_read(&sk->sk_wmem_alloc) <=
918 skb = sock_wmalloc(sk,
919 alloclen + hh_len, 1,
921 if (unlikely(skb == NULL))
927 * Fill in the control structures
929 skb->ip_summed = csummode;
931 /* reserve for fragmentation */
932 skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
935 * Find where to start putting bytes
937 data = skb_put(skb, fraglen);
938 skb->nh.raw = data + exthdrlen;
939 data += fragheaderlen;
940 skb->h.raw = data + exthdrlen;
943 skb->csum = skb_copy_and_csum_bits(
944 skb_prev, maxfraglen,
945 data + transhdrlen, fraggap, 0);
946 skb_prev->csum = csum_sub(skb_prev->csum,
949 skb_trim(skb_prev, maxfraglen);
951 copy = datalen - transhdrlen - fraggap;
956 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
963 length -= datalen - fraggap;
966 csummode = CHECKSUM_NONE;
969 * Put the packet on the pending queue
971 __skb_queue_tail(&sk->sk_write_queue, skb);
978 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
982 if (getfrag(from, skb_put(skb, copy),
983 offset, copy, off, skb) < 0) {
984 __skb_trim(skb, off);
989 int i = skb_shinfo(skb)->nr_frags;
990 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
991 struct page *page = sk->sk_sndmsg_page;
992 int off = sk->sk_sndmsg_off;
995 if (page && (left = PAGE_SIZE - off) > 0) {
998 if (page != frag->page) {
999 if (i == MAX_SKB_FRAGS) {
1004 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1005 frag = &skb_shinfo(skb)->frags[i];
1007 } else if(i < MAX_SKB_FRAGS) {
1008 if (copy > PAGE_SIZE)
1010 page = alloc_pages(sk->sk_allocation, 0);
1015 sk->sk_sndmsg_page = page;
1016 sk->sk_sndmsg_off = 0;
1018 skb_fill_page_desc(skb, i, page, 0, 0);
1019 frag = &skb_shinfo(skb)->frags[i];
1020 skb->truesize += PAGE_SIZE;
1021 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
1026 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1030 sk->sk_sndmsg_off += copy;
1033 skb->data_len += copy;
1040 inet->cork.length -= length;
1041 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1045 int ip6_push_pending_frames(struct sock *sk)
1047 struct sk_buff *skb, *tmp_skb;
1048 struct sk_buff **tail_skb;
1049 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1050 struct inet_sock *inet = inet_sk(sk);
1051 struct ipv6_pinfo *np = inet6_sk(sk);
1052 struct ipv6hdr *hdr;
1053 struct ipv6_txoptions *opt = np->cork.opt;
1054 struct rt6_info *rt = np->cork.rt;
1055 struct flowi *fl = &inet->cork.fl;
1056 unsigned char proto = fl->proto;
1059 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1061 tail_skb = &(skb_shinfo(skb)->frag_list);
1063 /* move skb->data to ip header from ext header */
1064 if (skb->data < skb->nh.raw)
1065 __skb_pull(skb, skb->nh.raw - skb->data);
1066 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1067 __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1068 *tail_skb = tmp_skb;
1069 tail_skb = &(tmp_skb->next);
1070 skb->len += tmp_skb->len;
1071 skb->data_len += tmp_skb->len;
1072 skb->truesize += tmp_skb->truesize;
1073 __sock_put(tmp_skb->sk);
1074 tmp_skb->destructor = NULL;
1078 ipv6_addr_copy(final_dst, &fl->fl6_dst);
1079 __skb_pull(skb, skb->h.raw - skb->nh.raw);
1080 if (opt && opt->opt_flen)
1081 ipv6_push_frag_opts(skb, opt, &proto);
1082 if (opt && opt->opt_nflen)
1083 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1085 skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));
1087 *(u32*)hdr = fl->fl6_flowlabel | htonl(0x60000000);
1089 if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
1090 hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
1092 hdr->payload_len = 0;
1093 hdr->hop_limit = np->cork.hop_limit;
1094 hdr->nexthdr = proto;
1095 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1096 ipv6_addr_copy(&hdr->daddr, final_dst);
1098 skb->dst = dst_clone(&rt->u.dst);
1099 IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
1100 err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
1103 err = np->recverr ? net_xmit_errno(err) : 0;
1109 inet->cork.flags &= ~IPCORK_OPT;
1111 kfree(np->cork.opt);
1112 np->cork.opt = NULL;
1115 dst_release(&np->cork.rt->u.dst);
1117 inet->cork.flags &= ~IPCORK_ALLFRAG;
1119 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1125 void ip6_flush_pending_frames(struct sock *sk)
1127 struct inet_sock *inet = inet_sk(sk);
1128 struct ipv6_pinfo *np = inet6_sk(sk);
1129 struct sk_buff *skb;
1131 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1132 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1136 inet->cork.flags &= ~IPCORK_OPT;
1139 kfree(np->cork.opt);
1140 np->cork.opt = NULL;
1143 dst_release(&np->cork.rt->u.dst);
1145 inet->cork.flags &= ~IPCORK_ALLFRAG;
1147 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));