net/unix/af_unix.c

   1 /*
   2  * NET4:        Implementation of BSD Unix domain sockets.
   3  *
   4  * Authors:     Alan Cox, <alan@lxorguk.ukuu.org.uk>
   5  *
   6  *              This program is free software; you can redistribute it and/or
   7  *              modify it under the terms of the GNU General Public License
   8  *              as published by the Free Software Foundation; either version
   9  *              2 of the License, or (at your option) any later version.
  10  *
  11  * Fixes:
  12  *              Linus Torvalds  :       Assorted bug cures.
  13  *              Niibe Yutaka    :       async I/O support.
  14  *              Carsten Paeth   :       PF_UNIX check, address fixes.
  15  *              Alan Cox        :       Limit size of allocated blocks.
  16  *              Alan Cox        :       Fixed the stupid socketpair bug.
  17  *              Alan Cox        :       BSD compatibility fine tuning.
  18  *              Alan Cox        :       Fixed a bug in connect when interrupted.
  19  *              Alan Cox        :       Sorted out a proper draft version of
  20  *                                      file descriptor passing hacked up from
  21  *                                      Mike Shaver's work.
  22  *              Marty Leisner   :       Fixes to fd passing
  23  *              Nick Nevin      :       recvmsg bugfix.
  24  *              Alan Cox        :       Started proper garbage collector
  25  *              Heiko EiBfeldt  :       Missing verify_area check
  26  *              Alan Cox        :       Started POSIXisms
  27  *              Andreas Schwab  :       Replace inode by dentry for proper
  28  *                                      reference counting
  29  *              Kirk Petersen   :       Made this a module
  30  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
  31  *                                      Lots of bug fixes.
  32  *           Alexey Kuznetsov   :       Repaired (I hope) bugs introduced
  33  *                                      by above two patches.
  34  *           Andrea Arcangeli   :       If possible we block in connect(2)
  35  *                                      if the max backlog of the listen socket
  36  *                                      has been reached. This won't break
  37  *                                      old apps and it will avoid a huge number
  38  *                                      of socks hashed (this for unix_gc()
  39  *                                      performance reasons).
  40  *                                      Security fix that limits the max
  41  *                                      number of socks to 2*max_files and
  42  *                                      the number of skb queueable in the
  43  *                                      dgram receiver.
  44  *              Artur Skawina   :       Hash function optimizations
  45  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
  46  *            Malcolm Beattie   :       Set peercred for socketpair
  47  *           Michal Ostrowski   :       Module initialization cleanup.
  48  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
  49  *                                      the core infrastructure is doing that
  50  *                                      for all net proto families now (2.5.69+)
  51  *
  52  *
  53  * Known differences from reference BSD that was tested:
  54  *
  55  *      [TO FIX]
  56  *      ECONNREFUSED is not returned from one end of a connected() socket to the
  57  *              other the moment one end closes.
  58  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
  59  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
  60  *      [NOT TO FIX]
  61  *      accept() returns a path name even if the connecting socket has closed
  62  *              in the meantime (BSD loses the path and gives up).
  63  *      accept() returns 0 length path for an unbound connector. BSD returns 16
  64  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
  65  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
  66  *      BSD af_unix apparently has connect forgetting to block properly.
  67  *              (need to check this with the POSIX spec in detail)
  68  *
  69  * Differences from 2.0.0-11-... (ANK)
  70  *      Bug fixes and improvements.
  71  *              - client shutdown killed server socket.
  72  *              - removed all useless cli/sti pairs.
  73  *
  74  *      Semantic changes/extensions.
  75  *              - generic control message passing.
  76  *              - SCM_CREDENTIALS control message.
  77  *              - "Abstract" (not FS based) socket bindings.
  78  *                Abstract names are sequences of bytes (not zero terminated)
  79  *                started by 0, so that this name space does not intersect
  80  *                with BSD names.
  81  */
  82
  83 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  84
  85 #include <linux/module.h>
  86 #include <linux/kernel.h>
  87 #include <linux/signal.h>
  88 #include <linux/sched.h>
  89 #include <linux/errno.h>
  90 #include <linux/string.h>
  91 #include <linux/stat.h>
  92 #include <linux/dcache.h>
  93 #include <linux/namei.h>
  94 #include <linux/socket.h>
  95 #include <linux/un.h>
  96 #include <linux/fcntl.h>
  97 #include <linux/termios.h>
  98 #include <linux/sockios.h>
  99 #include <linux/net.h>
 100 #include <linux/in.h>
 101 #include <linux/fs.h>
 102 #include <linux/slab.h>
 103 #include <asm/uaccess.h>
 104 #include <linux/skbuff.h>
 105 #include <linux/netdevice.h>
 106 #include <net/net_namespace.h>
 107 #include <net/sock.h>
 108 #include <net/tcp_states.h>
 109 #include <net/af_unix.h>
 110 #include <linux/proc_fs.h>
 111 #include <linux/seq_file.h>
 112 #include <net/scm.h>
 113 #include <linux/init.h>
 114 #include <linux/poll.h>
 115 #include <linux/rtnetlink.h>
 116 #include <linux/mount.h>
 117 #include <net/checksum.h>
 118 #include <linux/security.h>
 119 #include <linux/freezer.h>
 120
/*
 * Global hash table of AF_UNIX sockets with 2 * UNIX_HASH_SIZE buckets.
 * unix_sockets_unbound() hands out buckets from the upper half
 * (index >= UNIX_HASH_SIZE) for sockets that have no address yet.
 */
struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
EXPORT_SYMBOL_GPL(unix_socket_table);
/* Spinlock taken around every lookup/insert/remove on unix_socket_table. */
DEFINE_SPINLOCK(unix_table_lock);
EXPORT_SYMBOL_GPL(unix_table_lock);
/*
 * Number of AF_UNIX sockets currently accounted for: incremented in
 * unix_create1() and decremented in unix_sock_destructor().
 */
static atomic_long_t unix_nr_socks;
 126
 127
 128 static struct hlist_head *unix_sockets_unbound(void *addr)
 129 {
 130         unsigned long hash = (unsigned long)addr;
 131
 132         hash ^= hash >> 16;
 133         hash ^= hash >> 8;
 134         hash %= UNIX_HASH_SIZE;
 135         return &unix_socket_table[UNIX_HASH_SIZE + hash];
 136 }
 137
/*
 * True when the address hash of @sk is below UNIX_HASH_SIZE.
 * Dereferences unix_sk(sk)->addr unconditionally, so it must only be used
 * on a socket that already has an address.
 */
#define UNIX_ABSTRACT(sk)       (unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
 139
 140 #ifdef CONFIG_SECURITY_NETWORK
 141 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 142 {
 143         memcpy(UNIXSID(skb), &scm->secid, sizeof(u32));
 144 }
 145
 146 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 147 {
 148         scm->secid = *UNIXSID(skb);
 149 }
 150 #else
 151 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 152 { }
 153
 154 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 155 { }
 156 #endif /* CONFIG_SECURITY_NETWORK */
 157
 158 /*
 159  *  SMP locking strategy:
 160  *    hash table is protected with spinlock unix_table_lock
 161  *    each socket state is protected by separate spin lock.
 162  */
 163
 164 static inline unsigned int unix_hash_fold(__wsum n)
 165 {
 166         unsigned int hash = (__force unsigned int)csum_fold(n);
 167
 168         hash ^= hash>>8;
 169         return hash&(UNIX_HASH_SIZE-1);
 170 }
 171
/* Peer pointer of @sk; expands to an lvalue, so it can also be assigned. */
#define unix_peer(sk) (unix_sk(sk)->peer)
 173
 174 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
 175 {
 176         return unix_peer(osk) == sk;
 177 }
 178
 179 static inline int unix_may_send(struct sock *sk, struct sock *osk)
 180 {
 181         return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
 182 }
 183
 184 static inline int unix_recvq_full(struct sock const *sk)
 185 {
 186         return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
 187 }
 188
 189 struct sock *unix_peer_get(struct sock *s)
 190 {
 191         struct sock *peer;
 192
 193         unix_state_lock(s);
 194         peer = unix_peer(s);
 195         if (peer)
 196                 sock_hold(peer);
 197         unix_state_unlock(s);
 198         return peer;
 199 }
 200 EXPORT_SYMBOL_GPL(unix_peer_get);
 201
 202 static inline void unix_release_addr(struct unix_address *addr)
 203 {
 204         if (atomic_dec_and_test(&addr->refcnt))
 205                 kfree(addr);
 206 }
 207
 208 /*
 209  *      Check unix socket name:
 210  *              - should be not zero length.
 211  *              - if started by not zero, should be NULL terminated (FS object)
 212  *              - if started by zero, it is abstract name.
 213  */
 214
 215 static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
 216 {
 217         if (len <= sizeof(short) || len > sizeof(*sunaddr))
 218                 return -EINVAL;
 219         if (!sunaddr || sunaddr->sun_family != AF_UNIX)
 220                 return -EINVAL;
 221         if (sunaddr->sun_path[0]) {
 222                 /*
 223                  * This may look like an off by one error but it is a bit more
 224                  * subtle. 108 is the longest valid AF_UNIX path for a binding.
 225                  * sun_path[108] doesn't as such exist.  However in kernel space
 226                  * we are guaranteed that it is a valid memory location in our
 227                  * kernel address buffer.
 228                  */
 229                 ((char *)sunaddr)[len] = 0;
 230                 len = strlen(sunaddr->sun_path)+1+sizeof(short);
 231                 return len;
 232         }
 233
 234         *hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
 235         return len;
 236 }
 237
/*
 * Unlink @sk from its hash chain. Takes no lock itself; the callers in
 * this file invoke it with unix_table_lock held.
 */
static void __unix_remove_socket(struct sock *sk)
{
        sk_del_node_init(sk);
}
 242
/*
 * Add @sk to the hash chain @list. Takes no lock itself; the callers in
 * this file invoke it with unix_table_lock held.
 */
static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
        /* The socket must not already sit on a chain. */
        WARN_ON(!sk_unhashed(sk));
        sk_add_node(sk, list);
}
 248
/* Locked variant of __unix_remove_socket(): takes unix_table_lock itself. */
static inline void unix_remove_socket(struct sock *sk)
{
        spin_lock(&unix_table_lock);
        __unix_remove_socket(sk);
        spin_unlock(&unix_table_lock);
}
 255
/* Locked variant of __unix_insert_socket(): takes unix_table_lock itself. */
static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
        spin_lock(&unix_table_lock);
        __unix_insert_socket(list, sk);
        spin_unlock(&unix_table_lock);
}
 262
 263 static struct sock *__unix_find_socket_byname(struct net *net,
 264                                               struct sockaddr_un *sunname,
 265                                               int len, int type, unsigned int hash)
 266 {
 267         struct sock *s;
 268
 269         sk_for_each(s, &unix_socket_table[hash ^ type]) {
 270                 struct unix_sock *u = unix_sk(s);
 271
 272                 if (!net_eq(sock_net(s), net))
 273                         continue;
 274
 275                 if (u->addr->len == len &&
 276                     !memcmp(u->addr->name, sunname, len))
 277                         goto found;
 278         }
 279         s = NULL;
 280 found:
 281         return s;
 282 }
 283
 284 static inline struct sock *unix_find_socket_byname(struct net *net,
 285                                                    struct sockaddr_un *sunname,
 286                                                    int len, int type,
 287                                                    unsigned int hash)
 288 {
 289         struct sock *s;
 290
 291         spin_lock(&unix_table_lock);
 292         s = __unix_find_socket_byname(net, sunname, len, type, hash);
 293         if (s)
 294                 sock_hold(s);
 295         spin_unlock(&unix_table_lock);
 296         return s;
 297 }
 298
 299 static struct sock *unix_find_socket_byinode(struct inode *i)
 300 {
 301         struct sock *s;
 302
 303         spin_lock(&unix_table_lock);
 304         sk_for_each(s,
 305                     &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
 306                 struct dentry *dentry = unix_sk(s)->path.dentry;
 307
 308                 if (dentry && dentry->d_inode == i) {
 309                         sock_hold(s);
 310                         goto found;
 311                 }
 312         }
 313         s = NULL;
 314 found:
 315         spin_unlock(&unix_table_lock);
 316         return s;
 317 }
 318
 319 static inline int unix_writable(struct sock *sk)
 320 {
 321         return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
 322 }
 323
 324 static void unix_write_space(struct sock *sk)
 325 {
 326         struct socket_wq *wq;
 327
 328         rcu_read_lock();
 329         if (unix_writable(sk)) {
 330                 wq = rcu_dereference(sk->sk_wq);
 331                 if (wq_has_sleeper(wq))
 332                         wake_up_interruptible_sync_poll(&wq->wait,
 333                                 POLLOUT | POLLWRNORM | POLLWRBAND);
 334                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 335         }
 336         rcu_read_unlock();
 337 }
 338
 339 /* When dgram socket disconnects (or changes its peer), we clear its receive
 340  * queue of packets arrived from previous peer. First, it allows to do
 341  * flow control based only on wmem_alloc; second, sk connected to peer
 342  * may receive messages only from that peer. */
 343 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
 344 {
 345         if (!skb_queue_empty(&sk->sk_receive_queue)) {
 346                 skb_queue_purge(&sk->sk_receive_queue);
 347                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
 348
 349                 /* If one link of bidirectional dgram pipe is disconnected,
 350                  * we signal error. Messages are lost. Do not make this,
 351                  * when peer was not connected to us.
 352                  */
 353                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
 354                         other->sk_err = ECONNRESET;
 355                         other->sk_error_report(other);
 356                 }
 357         }
 358 }
 359
 360 static void unix_sock_destructor(struct sock *sk)
 361 {
 362         struct unix_sock *u = unix_sk(sk);
 363
 364         skb_queue_purge(&sk->sk_receive_queue);
 365
 366         WARN_ON(atomic_read(&sk->sk_wmem_alloc));
 367         WARN_ON(!sk_unhashed(sk));
 368         WARN_ON(sk->sk_socket);
 369         if (!sock_flag(sk, SOCK_DEAD)) {
 370                 pr_info("Attempt to release alive unix socket: %p\n", sk);
 371                 return;
 372         }
 373
 374         if (u->addr)
 375                 unix_release_addr(u->addr);
 376
 377         atomic_long_dec(&unix_nr_socks);
 378         local_bh_disable();
 379         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 380         local_bh_enable();
 381 #ifdef UNIX_REFCNT_DEBUG
 382         pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
 383                 atomic_long_read(&unix_nr_socks));
 384 #endif
 385 }
 386
/*
 * Tear down @sk: unhash it, mark it closed, notify a connected peer, drop
 * everything queued for reception, release the bound path and finally
 * drop a reference on the socket.
 *
 * @embrion is non-zero when the socket being released is one that was
 * still queued on a listening socket (see the recursive call below); in
 * that case the peer always gets ECONNRESET.
 */
static void unix_release_sock(struct sock *sk, int embrion)
{
        struct unix_sock *u = unix_sk(sk);
        struct path path;
        struct sock *skpair;
        struct sk_buff *skb;
        int state;

        unix_remove_socket(sk);

        /* Clear state */
        unix_state_lock(sk);
        sock_orphan(sk);
        sk->sk_shutdown = SHUTDOWN_MASK;
        /* Take over the path so it can be put after the lock is dropped. */
        path         = u->path;
        u->path.dentry = NULL;
        u->path.mnt = NULL;
        /* Remember the old state: TCP_LISTEN matters for the flush below. */
        state = sk->sk_state;
        sk->sk_state = TCP_CLOSE;
        unix_state_unlock(sk);

        wake_up_interruptible_all(&u->peer_wait);

        skpair = unix_peer(sk);

        if (skpair != NULL) {
                if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
                        unix_state_lock(skpair);
                        /* No more writes */
                        skpair->sk_shutdown = SHUTDOWN_MASK;
                        /* Unread data or an embryo socket: report a reset. */
                        if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
                                skpair->sk_err = ECONNRESET;
                        unix_state_unlock(skpair);
                        skpair->sk_state_change(skpair);
                        sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
                }
                sock_put(skpair); /* It may now die */
                unix_peer(sk) = NULL;
        }

        /* Try to flush out this socket. Throw out buffers at least */

        while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
                /*
                 * On a listening socket each queued skb stands for a
                 * pending connection; release its socket as an embryo.
                 */
                if (state == TCP_LISTEN)
                        unix_release_sock(skb->sk, 1);
                /* passed fds are erased in the kfree_skb hook        */
                kfree_skb(skb);
        }

        if (path.dentry)
                path_put(&path);

        sock_put(sk);

        /* ---- Socket is dead now and most probably destroyed ---- */

        /*
         * Fixme: BSD difference: In BSD all sockets connected to us get
         *        ECONNRESET and we die on the spot. In Linux we behave
         *        like files and pipes do and wait for the last
         *        dereference.
         *
         * Can't we simply set sock->err?
         *
         *        What the above comment does talk about? --ANK(980817)
         */

        if (unix_tot_inflight)
                unix_gc();              /* Garbage collect fds */
}
 457
 458 static void init_peercred(struct sock *sk)
 459 {
 460         put_pid(sk->sk_peer_pid);
 461         if (sk->sk_peer_cred)
 462                 put_cred(sk->sk_peer_cred);
 463         sk->sk_peer_pid  = get_pid(task_tgid(current));
 464         sk->sk_peer_cred = get_current_cred();
 465 }
 466
 467 static void copy_peercred(struct sock *sk, struct sock *peersk)
 468 {
 469         put_pid(sk->sk_peer_pid);
 470         if (sk->sk_peer_cred)
 471                 put_cred(sk->sk_peer_cred);
 472         sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
 473         sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
 474 }
 475
 476 static int unix_listen(struct socket *sock, int backlog)
 477 {
 478         int err;
 479         struct sock *sk = sock->sk;
 480         struct unix_sock *u = unix_sk(sk);
 481         struct pid *old_pid = NULL;
 482
 483         err = -EOPNOTSUPP;
 484         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
 485                 goto out;       /* Only stream/seqpacket sockets accept */
 486         err = -EINVAL;
 487         if (!u->addr)
 488                 goto out;       /* No listens on an unbound socket */
 489         unix_state_lock(sk);
 490         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
 491                 goto out_unlock;
 492         if (backlog > sk->sk_max_ack_backlog)
 493                 wake_up_interruptible_all(&u->peer_wait);
 494         sk->sk_max_ack_backlog  = backlog;
 495         sk->sk_state            = TCP_LISTEN;
 496         /* set credentials so connect can copy them */
 497         init_peercred(sk);
 498         err = 0;
 499
 500 out_unlock:
 501         unix_state_unlock(sk);
 502         put_pid(old_pid);
 503 out:
 504         return err;
 505 }
 506
/*
 * Forward declarations of the socket operations referenced by the
 * proto_ops tables below.
 */
static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
                               int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, int);
static int unix_getname(struct socket *, struct sockaddr *, int *, int);
static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
static unsigned int unix_dgram_poll(struct file *, struct socket *,
                                    poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
                              int, int);
static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
                                  int);
 528
 529 static int unix_set_peek_off(struct sock *sk, int val)
 530 {
 531         struct unix_sock *u = unix_sk(sk);
 532
 533         if (mutex_lock_interruptible(&u->readlock))
 534                 return -EINTR;
 535
 536         sk->sk_peek_off = val;
 537         mutex_unlock(&u->readlock);
 538
 539         return 0;
 540 }
 541
 542
/*
 * Operations for SOCK_STREAM sockets: connection oriented (connect,
 * accept and listen are implemented), stream send/receive and unix_poll.
 * Socket options, mmap and sendpage use the generic sock_no_* stubs.
 */
static const struct proto_ops unix_stream_ops = {
        .family =       PF_UNIX,
        .owner =        THIS_MODULE,
        .release =      unix_release,
        .bind =         unix_bind,
        .connect =      unix_stream_connect,
        .socketpair =   unix_socketpair,
        .accept =       unix_accept,
        .getname =      unix_getname,
        .poll =         unix_poll,
        .ioctl =        unix_ioctl,
        .listen =       unix_listen,
        .shutdown =     unix_shutdown,
        .setsockopt =   sock_no_setsockopt,
        .getsockopt =   sock_no_getsockopt,
        .sendmsg =      unix_stream_sendmsg,
        .recvmsg =      unix_stream_recvmsg,
        .mmap =         sock_no_mmap,
        .sendpage =     sock_no_sendpage,
        .set_peek_off = unix_set_peek_off,
};
 564
/*
 * Operations for SOCK_DGRAM sockets (unix_create() also maps SOCK_RAW
 * onto these): accept and listen are the sock_no_* stubs, connect and
 * send/receive are the datagram variants, polling uses unix_dgram_poll.
 */
static const struct proto_ops unix_dgram_ops = {
        .family =       PF_UNIX,
        .owner =        THIS_MODULE,
        .release =      unix_release,
        .bind =         unix_bind,
        .connect =      unix_dgram_connect,
        .socketpair =   unix_socketpair,
        .accept =       sock_no_accept,
        .getname =      unix_getname,
        .poll =         unix_dgram_poll,
        .ioctl =        unix_ioctl,
        .listen =       sock_no_listen,
        .shutdown =     unix_shutdown,
        .setsockopt =   sock_no_setsockopt,
        .getsockopt =   sock_no_getsockopt,
        .sendmsg =      unix_dgram_sendmsg,
        .recvmsg =      unix_dgram_recvmsg,
        .mmap =         sock_no_mmap,
        .sendpage =     sock_no_sendpage,
        .set_peek_off = unix_set_peek_off,
};
 586
/*
 * Operations for SOCK_SEQPACKET sockets: connection handling is shared
 * with the stream table (unix_stream_connect, unix_accept, unix_listen),
 * polling is shared with the datagram table (unix_dgram_poll), and
 * send/receive use the seqpacket-specific entry points.
 */
static const struct proto_ops unix_seqpacket_ops = {
        .family =       PF_UNIX,
        .owner =        THIS_MODULE,
        .release =      unix_release,
        .bind =         unix_bind,
        .connect =      unix_stream_connect,
        .socketpair =   unix_socketpair,
        .accept =       unix_accept,
        .getname =      unix_getname,
        .poll =         unix_dgram_poll,
        .ioctl =        unix_ioctl,
        .listen =       unix_listen,
        .shutdown =     unix_shutdown,
        .setsockopt =   sock_no_setsockopt,
        .getsockopt =   sock_no_getsockopt,
        .sendmsg =      unix_seqpacket_sendmsg,
        .recvmsg =      unix_seqpacket_recvmsg,
        .mmap =         sock_no_mmap,
        .sendpage =     sock_no_sendpage,
        .set_peek_off = unix_set_peek_off,
};
 608
/*
 * Protocol descriptor passed to sk_alloc() in unix_create1(); obj_size
 * makes every allocated sock large enough to hold a struct unix_sock.
 */
static struct proto unix_proto = {
        .name                   = "UNIX",
        .owner                  = THIS_MODULE,
        .obj_size               = sizeof(struct unix_sock),
};
 614
/*
 * AF_UNIX sockets do not interact with hardware, hence they
 * don't trigger interrupts - so it's safe for them to have
 * bh-unsafe locking for their sk_receive_queue.lock. Split off
 * this special lock-class by reinitializing the spinlock key
 * (done with lockdep_set_class() in unix_create1()):
 */
static struct lock_class_key af_unix_sk_receive_queue_lock_key;
 622
 623 static struct sock *unix_create1(struct net *net, struct socket *sock)
 624 {
 625         struct sock *sk = NULL;
 626         struct unix_sock *u;
 627
 628         atomic_long_inc(&unix_nr_socks);
 629         if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
 630                 goto out;
 631
 632         sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
 633         if (!sk)
 634                 goto out;
 635
 636         sock_init_data(sock, sk);
 637         lockdep_set_class(&sk->sk_receive_queue.lock,
 638                                 &af_unix_sk_receive_queue_lock_key);
 639
 640         sk->sk_write_space      = unix_write_space;
 641         sk->sk_max_ack_backlog  = net->unx.sysctl_max_dgram_qlen;
 642         sk->sk_destruct         = unix_sock_destructor;
 643         u         = unix_sk(sk);
 644         u->path.dentry = NULL;
 645         u->path.mnt = NULL;
 646         spin_lock_init(&u->lock);
 647         atomic_long_set(&u->inflight, 0);
 648         INIT_LIST_HEAD(&u->link);
 649         mutex_init(&u->readlock); /* single task reading lock */
 650         init_waitqueue_head(&u->peer_wait);
 651         unix_insert_socket(unix_sockets_unbound(sk), sk);
 652 out:
 653         if (sk == NULL)
 654                 atomic_long_dec(&unix_nr_socks);
 655         else {
 656                 local_bh_disable();
 657                 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 658                 local_bh_enable();
 659         }
 660         return sk;
 661 }
 662
 663 static int unix_create(struct net *net, struct socket *sock, int protocol,
 664                        int kern)
 665 {
 666         if (protocol && protocol != PF_UNIX)
 667                 return -EPROTONOSUPPORT;
 668
 669         sock->state = SS_UNCONNECTED;
 670
 671         switch (sock->type) {
 672         case SOCK_STREAM:
 673                 sock->ops = &unix_stream_ops;
 674                 break;
 675                 /*
 676                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
 677                  *      nothing uses it.
 678                  */
 679         case SOCK_RAW:
 680                 sock->type = SOCK_DGRAM;
 681         case SOCK_DGRAM:
 682                 sock->ops = &unix_dgram_ops;
 683                 break;
 684         case SOCK_SEQPACKET:
 685                 sock->ops = &unix_seqpacket_ops;
 686                 break;
 687         default:
 688                 return -ESOCKTNOSUPPORT;
 689         }
 690
 691         return unix_create1(net, sock) ? 0 : -ENOMEM;
 692 }
 693
 694 static int unix_release(struct socket *sock)
 695 {
 696         struct sock *sk = sock->sk;
 697
 698         if (!sk)
 699                 return 0;
 700
 701         unix_release_sock(sk, 0);
 702         sock->sk = NULL;
 703
 704         return 0;
 705 }
 706
/*
 * Give an address-less socket an automatically chosen name.
 *
 * The name is built as a zero byte followed by five hex digits taken from
 * a shared counter, and the socket is rehashed under that name once a
 * free one is found.
 *
 * Returns 0 on success or if the socket already has an address, the error
 * from mutex_lock_interruptible() if taking readlock was interrupted,
 * -ENOMEM on allocation failure and -ENOSPC when no free name was found.
 */
static int unix_autobind(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct net *net = sock_net(sk);
        struct unix_sock *u = unix_sk(sk);
        /* Shared across all callers; advanced under unix_table_lock. */
        static u32 ordernum = 1;
        struct unix_address *addr;
        int err;
        unsigned int retries = 0;

        err = mutex_lock_interruptible(&u->readlock);
        if (err)
                return err;

        /* Already has an address: nothing to do. */
        err = 0;
        if (u->addr)
                goto out;

        err = -ENOMEM;
        /* kzalloc() leaves sun_path[0] zero; the digits go after it. */
        addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
        if (!addr)
                goto out;

        addr->name->sun_family = AF_UNIX;
        atomic_set(&addr->refcnt, 1);

retry:
        /* Length = hex digits + leading zero byte + family field. */
        addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
        addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));

        spin_lock(&unix_table_lock);
        /* Advance the counter, wrapping within 20 bits (5 hex digits). */
        ordernum = (ordernum+1)&0xFFFFF;

        if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
                                      addr->hash)) {
                spin_unlock(&unix_table_lock);
                /*
                 * __unix_find_socket_byname() may take long time if many names
                 * are already in use.
                 */
                cond_resched();
                /* Give up if all names seems to be in use. */
                if (retries++ == 0xFFFFF) {
                        err = -ENOSPC;
                        kfree(addr);
                        goto out;
                }
                goto retry;
        }
        /* Stored hash includes the type, matching the lookup's hash ^ type. */
        addr->hash ^= sk->sk_type;

        /* Move the socket from its current chain to the named one. */
        __unix_remove_socket(sk);
        u->addr = addr;
        __unix_insert_socket(&unix_socket_table[addr->hash], sk);
        spin_unlock(&unix_table_lock);
        err = 0;

out:    mutex_unlock(&u->readlock);
        return err;
}
 767
 768 static struct sock *unix_find_other(struct net *net,
 769                                     struct sockaddr_un *sunname, int len,
 770                                     int type, unsigned int hash, int *error)
 771 {
 772         struct sock *u;
 773         struct path path;
 774         int err = 0;
 775
 776         if (sunname->sun_path[0]) {
 777                 struct inode *inode;
 778                 err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
 779                 if (err)
 780                         goto fail;
 781                 inode = path.dentry->d_inode;
 782                 err = inode_permission(inode, MAY_WRITE);
 783                 if (err)
 784                         goto put_fail;
 785
 786                 err = -ECONNREFUSED;
 787                 if (!S_ISSOCK(inode->i_mode))
 788                         goto put_fail;
 789                 u = unix_find_socket_byinode(inode);
 790                 if (!u)
 791                         goto put_fail;
 792
 793                 if (u->sk_type == type)
 794                         touch_atime(&path);
 795
 796                 path_put(&path);
 797
 798                 err = -EPROTOTYPE;
 799                 if (u->sk_type != type) {
 800                         sock_put(u);
 801                         goto fail;
 802                 }
 803         } else {
 804                 err = -ECONNREFUSED;
 805                 u = unix_find_socket_byname(net, sunname, len, type, hash);
 806                 if (u) {
 807                         struct dentry *dentry;
 808                         dentry = unix_sk(u)->path.dentry;
 809                         if (dentry)
 810                                 touch_atime(&unix_sk(u)->path);
 811                 } else
 812                         goto fail;
 813         }
 814         return u;
 815
 816 put_fail:
 817         path_put(&path);
 818 fail:
 819         *error = err;
 820         return NULL;
 821 }
 822
 823 static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
 824 {
 825         struct dentry *dentry;
 826         struct path path;
 827         int err = 0;
 828         /*
 829          * Get the parent directory, calculate the hash for last
 830          * component.
 831          */
 832         dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
 833         err = PTR_ERR(dentry);
 834         if (IS_ERR(dentry))
 835                 return err;
 836
 837         /*
 838          * All right, let's create it.
 839          */
 840         err = security_path_mknod(&path, dentry, mode, 0);
 841         if (!err) {
 842                 err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
 843                 if (!err) {
 844                         res->mnt = mntget(path.mnt);
 845                         res->dentry = dget(dentry);
 846                 }
 847         }
 848         done_path_create(&path, dentry);
 849         return err;
 850 }
 851
     /*
      * Bind @sock to the AF_UNIX address in @uaddr (@addr_len bytes).
      *
      * An address that consists of the family field only (addr_len ==
      * sizeof(short)) is handed to unix_autobind().  Otherwise the name is
      * checked by unix_mkname().  A name whose sun_path starts with a
      * non-zero byte gets a socket node created through unix_mknod(); any
      * other name is checked for uniqueness with
      * __unix_find_socket_byname().  On success the socket is removed from
      * its current hash chain and re-inserted with the new address attached.
      *
      * Returns 0 on success, -EINVAL for a wrong address family or an
      * already bound socket, -ENOMEM if the address copy cannot be
      * allocated, -EADDRINUSE if the name is taken, or the error reported
      * by unix_autobind(), unix_mkname(), mutex_lock_interruptible() or
      * unix_mknod().
      */
 852 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 853 {
 854         struct sock *sk = sock->sk;
 855         struct net *net = sock_net(sk);
 856         struct unix_sock *u = unix_sk(sk);
 857         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
 858         char *sun_path = sunaddr->sun_path;
 859         int err;
 860         unsigned int hash;
 861         struct unix_address *addr;
 862         struct hlist_head *list;
 863
 864         err = -EINVAL;
 865         if (sunaddr->sun_family != AF_UNIX)
 866                 goto out;
 867
             /* Only the family was supplied: let the kernel pick a name. */
 868         if (addr_len == sizeof(short)) {
 869                 err = unix_autobind(sock);
 870                 goto out;
 871         }
 872
 873         err = unix_mkname(sunaddr, addr_len, &hash);
 874         if (err < 0)
 875                 goto out;
             /* A non-negative result is used as the address length from here on. */
 876         addr_len = err;
 877
 878         err = mutex_lock_interruptible(&u->readlock);
 879         if (err)
 880                 goto out;
 881
             /* A socket that already has an address cannot be bound again. */
 882         err = -EINVAL;
 883         if (u->addr)
 884                 goto out_up;
 885
 886         err = -ENOMEM;
 887         addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
 888         if (!addr)
 889                 goto out_up;
 890
 891         memcpy(addr->name, sunaddr, addr_len);
 892         addr->len = addr_len;
 893         addr->hash = hash ^ sk->sk_type;
 894         atomic_set(&addr->refcnt, 1);
 895
 896         if (sun_path[0]) {
 897                 struct path path;
 898                 umode_t mode = S_IFSOCK |
 899                        (SOCK_INODE(sock)->i_mode & ~current_umask());
 900                 err = unix_mknod(sun_path, mode, &path);
 901                 if (err) {
                             /* An existing node means the name is taken. */
 902                         if (err == -EEXIST)
 903                                 err = -EADDRINUSE;
 904                         unix_release_addr(addr);
 905                         goto out_up;
 906                 }
                     /*
                      * The hash chain is chosen from the inode number; addr->hash
                      * is overwritten with UNIX_HASH_SIZE (presumably a marker for
                      * filesystem-bound addresses -- confirm against the lookup
                      * helpers).
                      */
 907                 addr->hash = UNIX_HASH_SIZE;
 908                 hash = path.dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1);
 909                 spin_lock(&unix_table_lock);
 910                 u->path = path;
 911                 list = &unix_socket_table[hash];
 912         } else {
 913                 spin_lock(&unix_table_lock);
 914                 err = -EADDRINUSE;
 915                 if (__unix_find_socket_byname(net, sunaddr, addr_len,
 916                                               sk->sk_type, hash)) {
 917                         unix_release_addr(addr);
 918                         goto out_unlock;
 919                 }
 920
 921                 list = &unix_socket_table[addr->hash];
 922         }
 923
             /* Both branches arrive here holding unix_table_lock. */
 924         err = 0;
 925         __unix_remove_socket(sk);
 926         u->addr = addr;
 927         __unix_insert_socket(list, sk);
 928
 929 out_unlock:
 930         spin_unlock(&unix_table_lock);
 931 out_up:
 932         mutex_unlock(&u->readlock);
 933 out:
 934         return err;
 935 }
 936
 937 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
 938 {
 939         if (unlikely(sk1 == sk2) || !sk2) {
 940                 unix_state_lock(sk1);
 941                 return;
 942         }
 943         if (sk1 < sk2) {
 944                 unix_state_lock(sk1);
 945                 unix_state_lock_nested(sk2);
 946         } else {
 947                 unix_state_lock(sk2);
 948                 unix_state_lock_nested(sk1);
 949         }
 950 }
 951
 952 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
 953 {
 954         if (unlikely(sk1 == sk2) || !sk2) {
 955                 unix_state_unlock(sk1);
 956                 return;
 957         }
 958         unix_state_unlock(sk1);
 959         unix_state_unlock(sk2);
 960 }
 961
     /*
      * Set, replace or clear the default peer of a datagram socket.
      *
      * With addr->sa_family == AF_UNSPEC the peer is cleared.  Otherwise
      * the destination is looked up with unix_find_other(), both sockets
      * are locked with unix_state_double_lock(), and the connection is
      * allowed only if unix_may_send() and security_unix_may_send() agree.
      * A previous peer that differs from the new one is notified through
      * unix_dgram_disconnected(), and the reference held on it is dropped.
      *
      * On success the reference obtained from unix_find_other() is kept in
      * unix_peer(sk); on the locked error path it is released with
      * sock_put().  @flags is not used by this function.
      *
      * Returns 0 on success, -EPERM if unix_may_send() refuses, or the
      * error from unix_mkname(), unix_autobind(), unix_find_other() or
      * security_unix_may_send().
      */
 962 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
 963                               int alen, int flags)
 964 {
 965         struct sock *sk = sock->sk;
 966         struct net *net = sock_net(sk);
 967         struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
 968         struct sock *other;
 969         unsigned int hash;
 970         int err;
 971
 972         if (addr->sa_family != AF_UNSPEC) {
 973                 err = unix_mkname(sunaddr, alen, &hash);
 974                 if (err < 0)
 975                         goto out;
 976                 alen = err;
 977
                     /* With SOCK_PASSCRED set, an unbound socket is autobound first. */
 978                 if (test_bit(SOCK_PASSCRED, &sock->flags) &&
 979                     !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
 980                         goto out;
 981
 982 restart:
 983                 other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
 984                 if (!other)
 985                         goto out;
 986
 987                 unix_state_double_lock(sk, other);
 988
 989                 /* Apparently VFS overslept socket death. Retry. */
 990                 if (sock_flag(other, SOCK_DEAD)) {
 991                         unix_state_double_unlock(sk, other);
 992                         sock_put(other);
 993                         goto restart;
 994                 }
 995
 996                 err = -EPERM;
 997                 if (!unix_may_send(sk, other))
 998                         goto out_unlock;
 999
1000                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1001                 if (err)
1002                         goto out_unlock;
1003
1004         } else {
1005                 /*
1006                  *      1003.1g breaking connected state with AF_UNSPEC
1007                  */
1008                 other = NULL;
                     /* With other == NULL only sk itself gets locked. */
1009                 unix_state_double_lock(sk, other);
1010         }
1011
1012         /*
1013          * If it was connected, reconnect.
1014          */
1015         if (unix_peer(sk)) {
1016                 struct sock *old_peer = unix_peer(sk);
1017                 unix_peer(sk) = other;
1018                 unix_state_double_unlock(sk, other);
1019
                     /* The old peer is notified and released after the locks are dropped. */
1020                 if (other != old_peer)
1021                         unix_dgram_disconnected(sk, old_peer);
1022                 sock_put(old_peer);
1023         } else {
1024                 unix_peer(sk) = other;
1025                 unix_state_double_unlock(sk, other);
1026         }
1027         return 0;
1028
1029 out_unlock:
1030         unix_state_double_unlock(sk, other);
1031         sock_put(other);
1032 out:
1033         return err;
1034 }
1035
1036 static long unix_wait_for_peer(struct sock *other, long timeo)
1037 {
1038         struct unix_sock *u = unix_sk(other);
1039         int sched;
1040         DEFINE_WAIT(wait);
1041
1042         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1043
1044         sched = !sock_flag(other, SOCK_DEAD) &&
1045                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
1046                 unix_recvq_full(other);
1047
1048         unix_state_unlock(other);
1049
1050         if (sched)
1051                 timeo = schedule_timeout(timeo);
1052
1053         finish_wait(&u->peer_wait, &wait);
1054         return timeo;
1055 }
1056
     /*
      * Connect a stream/seqpacket socket to a listening socket.
      *
      * A fresh sock (newsk) that will become the server-side end and a
      * one-byte skb owned by it are allocated up front.  The listener is
      * then looked up and locked; if its receive queue is full the caller
      * waits (or gets -EAGAIN when the send timeout is zero) and the lookup
      * is restarted.  Once both state locks are held and the security hook
      * agrees, newsk is wired to sk as its peer, inherits the listener's
      * address and path, sk is marked connected, and the skb is queued on
      * the listener's receive queue so that unix_accept() can pick newsk up
      * from skb->sk.
      *
      * Returns 0 on success.  Errors: whatever unix_mkname(),
      * unix_autobind(), unix_find_other() or security_unix_stream_connect()
      * report; -ENOMEM if newsk or the skb cannot be allocated;
      * -ECONNREFUSED if the target is not listening or has shut down
      * receiving; -EAGAIN if the backlog is full and the timeout is zero;
      * the sock_intr_errno() value if a signal arrives while waiting;
      * -EISCONN if sk is already established; -EINVAL for any other state
      * of sk.
      */
1057 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1058                                int addr_len, int flags)
1059 {
1060         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1061         struct sock *sk = sock->sk;
1062         struct net *net = sock_net(sk);
1063         struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1064         struct sock *newsk = NULL;
1065         struct sock *other = NULL;
1066         struct sk_buff *skb = NULL;
1067         unsigned int hash;
1068         int st;
1069         int err;
1070         long timeo;
1071
1072         err = unix_mkname(sunaddr, addr_len, &hash);
1073         if (err < 0)
1074                 goto out;
1075         addr_len = err;
1076
1077         if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
1078             (err = unix_autobind(sock)) != 0)
1079                 goto out;
1080
1081         timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1082
1083         /* First of all allocate resources.
1084            If we will make it after state is locked,
1085            we will have to recheck all again in any case.
1086          */
1087
1088         err = -ENOMEM;
1089
1090         /* create new sock for complete connection */
1091         newsk = unix_create1(sock_net(sk), NULL);
1092         if (newsk == NULL)
1093                 goto out;
1094
1095         /* Allocate skb for sending to listening sock */
1096         skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1097         if (skb == NULL)
1098                 goto out;
1099
1100 restart:
1101         /*  Find listening sock. */
1102         other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
1103         if (!other)
1104                 goto out;
1105
1106         /* Latch state of peer */
1107         unix_state_lock(other);
1108
1109         /* Apparently VFS overslept socket death. Retry. */
1110         if (sock_flag(other, SOCK_DEAD)) {
1111                 unix_state_unlock(other);
1112                 sock_put(other);
1113                 goto restart;
1114         }
1115
1116         err = -ECONNREFUSED;
1117         if (other->sk_state != TCP_LISTEN)
1118                 goto out_unlock;
1119         if (other->sk_shutdown & RCV_SHUTDOWN)
1120                 goto out_unlock;
1121
1122         if (unix_recvq_full(other)) {
1123                 err = -EAGAIN;
1124                 if (!timeo)
1125                         goto out_unlock;
1126
                     /* unix_wait_for_peer() releases other's state lock. */
1127                 timeo = unix_wait_for_peer(other, timeo);
1128
1129                 err = sock_intr_errno(timeo);
                     /* The lock is already dropped, hence "out" and not "out_unlock". */
1130                 if (signal_pending(current))
1131                         goto out;
1132                 sock_put(other);
1133                 goto restart;
1134         }
1135
1136         /* Latch our state.
1137
1138            It is tricky place. We need to grab our state lock and cannot
1139            drop lock on peer. It is dangerous because deadlock is
1140            possible. Connect to self case and simultaneous
1141            attempt to connect are eliminated by checking socket
1142            state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1143            check this before attempt to grab lock.
1144
1145            Well, and we have to recheck the state after socket locked.
1146          */
1147         st = sk->sk_state;
1148
1149         switch (st) {
1150         case TCP_CLOSE:
1151                 /* This is ok... continue with connect */
1152                 break;
1153         case TCP_ESTABLISHED:
1154                 /* Socket is already connected */
1155                 err = -EISCONN;
1156                 goto out_unlock;
1157         default:
1158                 err = -EINVAL;
1159                 goto out_unlock;
1160         }
1161
1162         unix_state_lock_nested(sk);
1163
             /* sk changed state while we were taking its lock: start over. */
1164         if (sk->sk_state != st) {
1165                 unix_state_unlock(sk);
1166                 unix_state_unlock(other);
1167                 sock_put(other);
1168                 goto restart;
1169         }
1170
1171         err = security_unix_stream_connect(sk, other, newsk);
1172         if (err) {
1173                 unix_state_unlock(sk);
1174                 goto out_unlock;
1175         }
1176
1177         /* The way is open! Fastly set all the necessary fields... */
1178
             /* newsk keeps a reference on sk as its peer. */
1179         sock_hold(sk);
1180         unix_peer(newsk)        = sk;
1181         newsk->sk_state         = TCP_ESTABLISHED;
1182         newsk->sk_type          = sk->sk_type;
1183         init_peercred(newsk);
1184         newu = unix_sk(newsk);
1185         RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1186         otheru = unix_sk(other);
1187
1188         /* copy address information from listening to new sock*/
1189         if (otheru->addr) {
1190                 atomic_inc(&otheru->addr->refcnt);
1191                 newu->addr = otheru->addr;
1192         }
1193         if (otheru->path.dentry) {
1194                 path_get(&otheru->path);
1195                 newu->path = otheru->path;
1196         }
1197
1198         /* Set credentials */
1199         copy_peercred(sk, other);
1200
1201         sock->state     = SS_CONNECTED;
1202         sk->sk_state    = TCP_ESTABLISHED;
             /* sk in turn keeps a reference on newsk as its peer. */
1203         sock_hold(newsk);
1204
1205         smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
1206         unix_peer(sk)   = newsk;
1207
1208         unix_state_unlock(sk);
1209
1210         /* take ten and send info to listening sock */
1211         spin_lock(&other->sk_receive_queue.lock);
1212         __skb_queue_tail(&other->sk_receive_queue, skb);
1213         spin_unlock(&other->sk_receive_queue.lock);
1214         unix_state_unlock(other);
1215         other->sk_data_ready(other);
1216         sock_put(other);
1217         return 0;
1218
1219 out_unlock:
1220         if (other)
1221                 unix_state_unlock(other);
1222
1223 out:
             /* Undo the up-front allocations and drop the lookup reference. */
1224         kfree_skb(skb);
1225         if (newsk)
1226                 unix_release_sock(newsk, 0);
1227         if (other)
1228                 sock_put(other);
1229         return err;
1230 }
1231
1232 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1233 {
1234         struct sock *ska = socka->sk, *skb = sockb->sk;
1235
1236         /* Join our sockets back to back */
1237         sock_hold(ska);
1238         sock_hold(skb);
1239         unix_peer(ska) = skb;
1240         unix_peer(skb) = ska;
1241         init_peercred(ska);
1242         init_peercred(skb);
1243
1244         if (ska->sk_type != SOCK_DGRAM) {
1245                 ska->sk_state = TCP_ESTABLISHED;
1246                 skb->sk_state = TCP_ESTABLISHED;
1247                 socka->state  = SS_CONNECTED;
1248                 sockb->state  = SS_CONNECTED;
1249         }
1250         return 0;
1251 }
1252
1253 static void unix_sock_inherit_flags(const struct socket *old,
1254                                     struct socket *new)
1255 {
1256         if (test_bit(SOCK_PASSCRED, &old->flags))
1257                 set_bit(SOCK_PASSCRED, &new->flags);
1258         if (test_bit(SOCK_PASSSEC, &old->flags))
1259                 set_bit(SOCK_PASSSEC, &new->flags);
1260 }
1261
1262 static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1263 {
1264         struct sock *sk = sock->sk;
1265         struct sock *tsk;
1266         struct sk_buff *skb;
1267         int err;
1268
1269         err = -EOPNOTSUPP;
1270         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1271                 goto out;
1272
1273         err = -EINVAL;
1274         if (sk->sk_state != TCP_LISTEN)
1275                 goto out;
1276
1277         /* If socket state is TCP_LISTEN it cannot change (for now...),
1278          * so that no locks are necessary.
1279          */
1280
1281         skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1282         if (!skb) {
1283                 /* This means receive shutdown. */
1284                 if (err == 0)
1285                         err = -EINVAL;
1286                 goto out;
1287         }
1288
1289         tsk = skb->sk;
1290         skb_free_datagram(sk, skb);
1291         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1292
1293         /* attach accepted sock to socket */
1294         unix_state_lock(tsk);
1295         newsock->state = SS_CONNECTED;
1296         unix_sock_inherit_flags(sock, newsock);
1297         sock_graft(tsk, newsock);
1298         unix_state_unlock(tsk);
1299         return 0;
1300
1301 out:
1302         return err;
1303 }
1304
1305
1306 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1307 {
1308         struct sock *sk = sock->sk;
1309         struct unix_sock *u;
1310         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1311         int err = 0;
1312
1313         if (peer) {
1314                 sk = unix_peer_get(sk);
1315
1316                 err = -ENOTCONN;
1317                 if (!sk)
1318                         goto out;
1319                 err = 0;
1320         } else {
1321                 sock_hold(sk);
1322         }
1323
1324         u = unix_sk(sk);
1325         unix_state_lock(sk);
1326         if (!u->addr) {
1327                 sunaddr->sun_family = AF_UNIX;
1328                 sunaddr->sun_path[0] = 0;
1329                 *uaddr_len = sizeof(short);
1330         } else {
1331                 struct unix_address *addr = u->addr;
1332
1333                 *uaddr_len = addr->len;
1334                 memcpy(sunaddr, addr->name, *uaddr_len);
1335         }
1336         unix_state_unlock(sk);
1337         sock_put(sk);
1338 out:
1339         return err;
1340 }
1341
1342 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1343 {
1344         int i;
1345
1346         scm->fp = UNIXCB(skb).fp;
1347         UNIXCB(skb).fp = NULL;
1348
1349         for (i = scm->fp->count-1; i >= 0; i--)
1350                 unix_notinflight(scm->fp->fp[i]);
1351 }
1352
1353 static void unix_destruct_scm(struct sk_buff *skb)
1354 {
1355         struct scm_cookie scm;
1356         memset(&scm, 0, sizeof(scm));
1357         scm.pid  = UNIXCB(skb).pid;
1358         if (UNIXCB(skb).fp)
1359                 unix_detach_fds(&scm, skb);
1360
1361         /* Alas, it calls VFS */
1362         /* So fscking what? fput() had been SMP-safe since the last Summer */
1363         scm_destroy(&scm);
1364         sock_wfree(skb);
1365 }
1366
1367 #define MAX_RECURSION_LEVEL 4
1368
1369 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1370 {
1371         int i;
1372         unsigned char max_level = 0;
1373         int unix_sock_count = 0;
1374
1375         for (i = scm->fp->count - 1; i >= 0; i--) {
1376                 struct sock *sk = unix_get_socket(scm->fp->fp[i]);
1377
1378                 if (sk) {
1379                         unix_sock_count++;
1380                         max_level = max(max_level,
1381                                         unix_sk(sk)->recursion_level);
1382                 }
1383         }
1384         if (unlikely(max_level > MAX_RECURSION_LEVEL))
1385                 return -ETOOMANYREFS;
1386
1387         /*
1388          * Need to duplicate file references for the sake of garbage
1389          * collection.  Otherwise a socket in the fps might become a
1390          * candidate for GC while the skb is not yet queued.
1391          */
1392         UNIXCB(skb).fp = scm_fp_dup(scm->fp);
1393         if (!UNIXCB(skb).fp)
1394                 return -ENOMEM;
1395
1396         if (unix_sock_count) {
1397                 for (i = scm->fp->count - 1; i >= 0; i--)
1398                         unix_inflight(scm->fp->fp[i]);
1399         }
1400         return max_level;
1401 }
1402
1403 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1404 {
1405         int err = 0;
1406
1407         UNIXCB(skb).pid  = get_pid(scm->pid);
1408         UNIXCB(skb).uid = scm->creds.uid;
1409         UNIXCB(skb).gid = scm->creds.gid;
1410         UNIXCB(skb).fp = NULL;
1411         if (scm->fp && send_fds)
1412                 err = unix_attach_fds(scm, skb);
1413
1414         skb->destructor = unix_destruct_scm;
1415         return err;
1416 }
1417
1418 /*
1419  * Some apps rely on write() giving SCM_CREDENTIALS
1420  * We include credentials if source or destination socket
1421  * asserted SOCK_PASSCRED.
1422  */
1423 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1424                             const struct sock *other)
1425 {
1426         if (UNIXCB(skb).pid)
1427                 return;
1428         if (test_bit(SOCK_PASSCRED, &sock->flags) ||
1429             !other->sk_socket ||
1430             test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) {
1431                 UNIXCB(skb).pid  = get_pid(task_tgid(current));
1432                 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1433         }
1434 }
1435
1436 /*
1437  *      Send AF_UNIX data.
1438  */
1439
     /*
      * Send one datagram of @len bytes described by @msg.
      *
      * The destination is the address in msg->msg_name when
      * msg->msg_namelen is non-zero, otherwise the connected peer.  The
      * whole message goes into a single skb (with a paged part once @len
      * exceeds SKB_MAX_ALLOC), together with the ancillary data collected
      * by scm_send().  Delivery is retried from "restart" when the receiver
      * turned out to be dead or its queue was full.
      *
      * Returns @len on success, and also when the receiver's socket filter
      * drops the packet.  Errors: -EOPNOTSUPP for MSG_OOB, -ENOTCONN if no
      * address is given and there is no peer, -EMSGSIZE if @len exceeds
      * sk_sndbuf - 32, -EPERM if unix_may_send() refuses, -ECONNREFUSED if
      * the connected peer is dead, -ECONNRESET if a retry is needed but no
      * address was supplied, -EPIPE if the receiver has shut down
      * receiving, -EAGAIN if the receive queue is full and the timeout is
      * zero, the sock_intr_errno() value on a signal, or whatever
      * scm_send(), unix_mkname(), unix_autobind(), sock_alloc_send_pskb(),
      * unix_scm_to_skb(), skb_copy_datagram_from_iter(), unix_find_other()
      * or security_unix_may_send() report.
      */
1440 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1441                               size_t len)
1442 {
1443         struct sock *sk = sock->sk;
1444         struct net *net = sock_net(sk);
1445         struct unix_sock *u = unix_sk(sk);
1446         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1447         struct sock *other = NULL;
1448         int namelen = 0; /* fake GCC */
1449         int err;
1450         unsigned int hash;
1451         struct sk_buff *skb;
1452         long timeo;
1453         struct scm_cookie scm;
1454         int max_level;
1455         int data_len = 0;
1456
1457         wait_for_unix_gc();
1458         err = scm_send(sock, msg, &scm, false);
1459         if (err < 0)
1460                 return err;
1461
1462         err = -EOPNOTSUPP;
1463         if (msg->msg_flags&MSG_OOB)
1464                 goto out;
1465
1466         if (msg->msg_namelen) {
1467                 err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1468                 if (err < 0)
1469                         goto out;
1470                 namelen = err;
1471         } else {
                     /* No address given: fall back to the connected peer. */
1472                 sunaddr = NULL;
1473                 err = -ENOTCONN;
1474                 other = unix_peer_get(sk);
1475                 if (!other)
1476                         goto out;
1477         }
1478
1479         if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
1480             && (err = unix_autobind(sock)) != 0)
1481                 goto out;
1482
1483         err = -EMSGSIZE;
1484         if (len > sk->sk_sndbuf - 32)
1485                 goto out;
1486
             /* Anything beyond SKB_MAX_ALLOC is placed in page fragments. */
1487         if (len > SKB_MAX_ALLOC) {
1488                 data_len = min_t(size_t,
1489                                  len - SKB_MAX_ALLOC,
1490                                  MAX_SKB_FRAGS * PAGE_SIZE);
1491                 data_len = PAGE_ALIGN(data_len);
1492
1493                 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1494         }
1495
1496         skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1497                                    msg->msg_flags & MSG_DONTWAIT, &err,
1498                                    PAGE_ALLOC_COSTLY_ORDER);
1499         if (skb == NULL)
1500                 goto out;
1501
1502         err = unix_scm_to_skb(&scm, skb, true);
1503         if (err < 0)
1504                 goto out_free;
             /* One level deeper than the deepest unix socket being passed. */
1505         max_level = err + 1;
1506         unix_get_secdata(&scm, skb);
1507
1508         skb_put(skb, len - data_len);
1509         skb->data_len = data_len;
1510         skb->len = len;
1511         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1512         if (err)
1513                 goto out_free;
1514
1515         timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1516
1517 restart:
1518         if (!other) {
1519                 err = -ECONNRESET;
1520                 if (sunaddr == NULL)
1521                         goto out_free;
1522
1523                 other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
1524                                         hash, &err);
1525                 if (other == NULL)
1526                         goto out_free;
1527         }
1528
1529         if (sk_filter(other, skb) < 0) {
1530                 /* Toss the packet but do not return any error to the sender */
1531                 err = len;
1532                 goto out_free;
1533         }
1534
1535         unix_state_lock(other);
1536         err = -EPERM;
1537         if (!unix_may_send(sk, other))
1538                 goto out_unlock;
1539
1540         if (sock_flag(other, SOCK_DEAD)) {
1541                 /*
1542                  *      Check with 1003.1g - what should
1543                  *      datagram error
1544                  */
1545                 unix_state_unlock(other);
1546                 sock_put(other);
1547
1548                 err = 0;
1549                 unix_state_lock(sk);
1550                 if (unix_peer(sk) == other) {
                             /*
                              * The dead socket was our connected peer: disconnect
                              * and drop the reference unix_peer(sk) was holding.
                              */
1551                         unix_peer(sk) = NULL;
1552                         unix_state_unlock(sk);
1553
1554                         unix_dgram_disconnected(sk, other);
1555                         sock_put(other);
1556                         err = -ECONNREFUSED;
1557                 } else {
1558                         unix_state_unlock(sk);
1559                 }
1560
                     /* Not our peer: look the address up again. */
1561                 other = NULL;
1562                 if (err)
1563                         goto out_free;
1564                 goto restart;
1565         }
1566
1567         err = -EPIPE;
1568         if (other->sk_shutdown & RCV_SHUTDOWN)
1569                 goto out_unlock;
1570
1571         if (sk->sk_type != SOCK_SEQPACKET) {
1572                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1573                 if (err)
1574                         goto out_unlock;
1575         }
1576
1577         if (unix_peer(other) != sk && unix_recvq_full(other)) {
1578                 if (!timeo) {
1579                         err = -EAGAIN;
1580                         goto out_unlock;
1581                 }
1582
                     /* unix_wait_for_peer() releases other's state lock. */
1583                 timeo = unix_wait_for_peer(other, timeo);
1584
1585                 err = sock_intr_errno(timeo);
1586                 if (signal_pending(current))
1587                         goto out_free;
1588
1589                 goto restart;
1590         }
1591
1592         if (sock_flag(other, SOCK_RCVTSTAMP))
1593                 __net_timestamp(skb);
1594         maybe_add_creds(skb, sock, other);
1595         skb_queue_tail(&other->sk_receive_queue, skb);
1596         if (max_level > unix_sk(other)->recursion_level)
1597                 unix_sk(other)->recursion_level = max_level;
1598         unix_state_unlock(other);
1599         other->sk_data_ready(other);
1600         sock_put(other);
1601         scm_destroy(&scm);
1602         return len;
1603
1604 out_unlock:
1605         unix_state_unlock(other);
1606 out_free:
1607         kfree_skb(skb);
1608 out:
1609         if (other)
1610                 sock_put(other);
1611         scm_destroy(&scm);
1612         return err;
1613 }
1614
1615 /* We use paged skbs for stream sockets, and limit occupancy to 32768
1616  * bytes, and a minimum of a full page.
1617  */
1618 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
1619
     /*
      * Send @len bytes from @msg over a connected stream socket.
      *
      * The data is cut into chunks, each carried by its own skb and queued
      * directly on the peer's receive queue.  A chunk is at most half the
      * send buffer minus 64 bytes, and at most SKB_MAX_HEAD(0) +
      * UNIX_SKB_FRAGS_SZ.  Passed file descriptors travel with the first
      * skb only.
      *
      * Returns the number of bytes queued if that is non-zero, even when a
      * later chunk failed.  Otherwise returns an error: -EOPNOTSUPP for
      * MSG_OOB; -EISCONN or -EOPNOTSUPP if a destination address was
      * supplied (depending on whether the socket is established);
      * -ENOTCONN without a peer; -EPIPE if sending is shut down locally or
      * the peer is dead or has shut down receiving (SIGPIPE is raised
      * unless MSG_NOSIGNAL is set); or whatever scm_send(),
      * sock_alloc_send_pskb(), unix_scm_to_skb() or
      * skb_copy_datagram_from_iter() report.
      */
1620 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
1621                                size_t len)
1622 {
1623         struct sock *sk = sock->sk;
1624         struct sock *other = NULL;
1625         int err, size;
1626         struct sk_buff *skb;
1627         int sent = 0;
1628         struct scm_cookie scm;
1629         bool fds_sent = false;
1630         int max_level;
1631         int data_len;
1632
1633         wait_for_unix_gc();
1634         err = scm_send(sock, msg, &scm, false);
1635         if (err < 0)
1636                 return err;
1637
1638         err = -EOPNOTSUPP;
1639         if (msg->msg_flags&MSG_OOB)
1640                 goto out_err;
1641
             /* A destination address is never accepted on a stream socket. */
1642         if (msg->msg_namelen) {
1643                 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1644                 goto out_err;
1645         } else {
1646                 err = -ENOTCONN;
                     /* Note: read without taking an extra reference on the peer. */
1647                 other = unix_peer(sk);
1648                 if (!other)
1649                         goto out_err;
1650         }
1651
1652         if (sk->sk_shutdown & SEND_SHUTDOWN)
1653                 goto pipe_err;
1654
1655         while (sent < len) {
1656                 size = len - sent;
1657
1658                 /* Keep two messages in the pipe so it schedules better */
1659                 size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
1660
1661                 /* allow fallback to order-0 allocations */
1662                 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
1663
                     /* Whatever does not fit in the linear head goes into pages. */
1664                 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
1665
1666                 data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
1667
1668                 skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
1669                                            msg->msg_flags & MSG_DONTWAIT, &err,
1670                                            get_order(UNIX_SKB_FRAGS_SZ));
1671                 if (!skb)
1672                         goto out_err;
1673
1674                 /* Only send the fds in the first buffer */
1675                 err = unix_scm_to_skb(&scm, skb, !fds_sent);
1676                 if (err < 0) {
1677                         kfree_skb(skb);
1678                         goto out_err;
1679                 }
1680                 max_level = err + 1;
1681                 fds_sent = true;
1682
1683                 skb_put(skb, size - data_len);
1684                 skb->data_len = data_len;
1685                 skb->len = size;
1686                 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
1687                 if (err) {
1688                         kfree_skb(skb);
1689                         goto out_err;
1690                 }
1691
1692                 unix_state_lock(other);
1693
1694                 if (sock_flag(other, SOCK_DEAD) ||
1695                     (other->sk_shutdown & RCV_SHUTDOWN))
1696                         goto pipe_err_free;
1697
1698                 maybe_add_creds(skb, sock, other);
1699                 skb_queue_tail(&other->sk_receive_queue, skb);
1700                 if (max_level > unix_sk(other)->recursion_level)
1701                         unix_sk(other)->recursion_level = max_level;
1702                 unix_state_unlock(other);
1703                 other->sk_data_ready(other);
1704                 sent += size;
1705         }
1706
1707         scm_destroy(&scm);
1708
1709         return sent;
1710
1711 pipe_err_free:
1712         unix_state_unlock(other);
1713         kfree_skb(skb);
1714 pipe_err:
             /* SIGPIPE is raised only if nothing at all was sent. */
1715         if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
1716                 send_sig(SIGPIPE, current, 0);
1717         err = -EPIPE;
1718 out_err:
1719         scm_destroy(&scm);
             /* A partial send reports the byte count instead of the error. */
1720         return sent ? : err;
1721 }
1722
1723 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
1724                                   size_t len)
1725 {
1726         int err;
1727         struct sock *sk = sock->sk;
1728
1729         err = sock_error(sk);
1730         if (err)
1731                 return err;
1732
1733         if (sk->sk_state != TCP_ESTABLISHED)
1734                 return -ENOTCONN;
1735
1736         if (msg->msg_namelen)
1737                 msg->msg_namelen = 0;
1738
1739         return unix_dgram_sendmsg(sock, msg, len);
1740 }
1741
1742 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
1743                                   size_t size, int flags)
1744 {
1745         struct sock *sk = sock->sk;
1746
1747         if (sk->sk_state != TCP_ESTABLISHED)
1748                 return -ENOTCONN;
1749
1750         return unix_dgram_recvmsg(sock, msg, size, flags);
1751 }
1752
1753 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
1754 {
1755         struct unix_sock *u = unix_sk(sk);
1756
1757         if (u->addr) {
1758                 msg->msg_namelen = u->addr->len;
1759                 memcpy(msg->msg_name, u->addr->name, u->addr->len);
1760         }
1761 }
1762
1763 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
1764                               size_t size, int flags)
1765 {
1766         struct scm_cookie scm;
1767         struct sock *sk = sock->sk;
1768         struct unix_sock *u = unix_sk(sk);
1769         int noblock = flags & MSG_DONTWAIT;
1770         struct sk_buff *skb;
1771         int err;
1772         int peeked, skip;
1773
1774         err = -EOPNOTSUPP;
1775         if (flags&MSG_OOB)
1776                 goto out;
1777
1778         err = mutex_lock_interruptible(&u->readlock);
1779         if (unlikely(err)) {
1780                 /* recvmsg() in non blocking mode is supposed to return -EAGAIN
1781                  * sk_rcvtimeo is not honored by mutex_lock_interruptible()
1782                  */
1783                 err = noblock ? -EAGAIN : -ERESTARTSYS;
1784                 goto out;
1785         }
1786
1787         skip = sk_peek_offset(sk, flags);
1788
1789         skb = __skb_recv_datagram(sk, flags, &peeked, &skip, &err);
1790         if (!skb) {
1791                 unix_state_lock(sk);
1792                 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
1793                 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
1794                     (sk->sk_shutdown & RCV_SHUTDOWN))
1795                         err = 0;
1796                 unix_state_unlock(sk);
1797                 goto out_unlock;
1798         }
1799
1800         wake_up_interruptible_sync_poll(&u->peer_wait,
1801                                         POLLOUT | POLLWRNORM | POLLWRBAND);
1802
1803         if (msg->msg_name)
1804                 unix_copy_addr(msg, skb->sk);
1805
1806         if (size > skb->len - skip)
1807                 size = skb->len - skip;
1808         else if (size < skb->len - skip)
1809                 msg->msg_flags |= MSG_TRUNC;
1810
1811         err = skb_copy_datagram_msg(skb, skip, msg, size);
1812         if (err)
1813                 goto out_free;
1814
1815         if (sock_flag(sk, SOCK_RCVTSTAMP))
1816                 __sock_recv_timestamp(msg, sk, skb);
1817
1818         memset(&scm, 0, sizeof(scm));
1819
1820         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
1821         unix_set_secdata(&scm, skb);
1822
1823         if (!(flags & MSG_PEEK)) {
1824                 if (UNIXCB(skb).fp)
1825                         unix_detach_fds(&scm, skb);
1826
1827                 sk_peek_offset_bwd(sk, skb->len);
1828         } else {
1829                 /* It is questionable: on PEEK we could:
1830                    - do not return fds - good, but too simple 8)
1831                    - return fds, and do not return them on read (old strategy,
1832                      apparently wrong)
1833                    - clone fds (I chose it for now, it is the most universal
1834                      solution)
1835
1836                    POSIX 1003.1g does not actually define this clearly
1837                    at all. POSIX 1003.1g doesn't define a lot of things
1838                    clearly however!
1839
1840                 */
1841
1842                 sk_peek_offset_fwd(sk, size);
1843
1844                 if (UNIXCB(skb).fp)
1845                         scm.fp = scm_fp_dup(UNIXCB(skb).fp);
1846         }
1847         err = (flags & MSG_TRUNC) ? skb->len - skip : size;
1848
1849         scm_recv(sock, msg, &scm, flags);
1850
1851 out_free:
1852         skb_free_datagram(sk, skb);
1853 out_unlock:
1854         mutex_unlock(&u->readlock);
1855 out:
1856         return err;
1857 }
1858
1859 /*
1860  *      Sleep until more data has arrived. But check for races..
1861  */
1862 static long unix_stream_data_wait(struct sock *sk, long timeo,
1863                                   struct sk_buff *last)
1864 {
1865         DEFINE_WAIT(wait);
1866
1867         unix_state_lock(sk);
1868
1869         for (;;) {
1870                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1871
1872                 if (skb_peek_tail(&sk->sk_receive_queue) != last ||
1873                     sk->sk_err ||
1874                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
1875                     signal_pending(current) ||
1876                     !timeo)
1877                         break;
1878
1879                 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1880                 unix_state_unlock(sk);
1881                 timeo = freezable_schedule_timeout(timeo);
1882                 unix_state_lock(sk);
1883                 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1884         }
1885
1886         finish_wait(sk_sleep(sk), &wait);
1887         unix_state_unlock(sk);
1888         return timeo;
1889 }
1890
1891 static unsigned int unix_skb_len(const struct sk_buff *skb)
1892 {
1893         return skb->len - UNIXCB(skb).consumed;
1894 }
1895
1896 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
1897                                size_t size, int flags)
1898 {
1899         struct scm_cookie scm;
1900         struct sock *sk = sock->sk;
1901         struct unix_sock *u = unix_sk(sk);
1902         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1903         int copied = 0;
1904         int noblock = flags & MSG_DONTWAIT;
1905         int check_creds = 0;
1906         int target;
1907         int err = 0;
1908         long timeo;
1909         int skip;
1910
1911         err = -EINVAL;
1912         if (sk->sk_state != TCP_ESTABLISHED)
1913                 goto out;
1914
1915         err = -EOPNOTSUPP;
1916         if (flags&MSG_OOB)
1917                 goto out;
1918
1919         target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
1920         timeo = sock_rcvtimeo(sk, noblock);
1921
1922         /* Lock the socket to prevent queue disordering
1923          * while sleeps in memcpy_tomsg
1924          */
1925
1926         memset(&scm, 0, sizeof(scm));
1927
1928         err = mutex_lock_interruptible(&u->readlock);
1929         if (unlikely(err)) {
1930                 /* recvmsg() in non blocking mode is supposed to return -EAGAIN
1931                  * sk_rcvtimeo is not honored by mutex_lock_interruptible()
1932                  */
1933                 err = noblock ? -EAGAIN : -ERESTARTSYS;
1934                 goto out;
1935         }
1936
1937         do {
1938                 int chunk;
1939                 struct sk_buff *skb, *last;
1940
1941                 unix_state_lock(sk);
1942                 last = skb = skb_peek(&sk->sk_receive_queue);
1943 again:
1944                 if (skb == NULL) {
1945                         unix_sk(sk)->recursion_level = 0;
1946                         if (copied >= target)
1947                                 goto unlock;
1948
1949                         /*
1950                          *      POSIX 1003.1g mandates this order.
1951                          */
1952
1953                         err = sock_error(sk);
1954                         if (err)
1955                                 goto unlock;
1956                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1957                                 goto unlock;
1958
1959                         unix_state_unlock(sk);
1960                         err = -EAGAIN;
1961                         if (!timeo)
1962                                 break;
1963                         mutex_unlock(&u->readlock);
1964
1965                         timeo = unix_stream_data_wait(sk, timeo, last);
1966
1967                         if (signal_pending(current)
1968                             ||  mutex_lock_interruptible(&u->readlock)) {
1969                                 err = sock_intr_errno(timeo);
1970                                 goto out;
1971                         }
1972
1973                         continue;
1974  unlock:
1975                         unix_state_unlock(sk);
1976                         break;
1977                 }
1978
1979                 skip = sk_peek_offset(sk, flags);
1980                 while (skip >= unix_skb_len(skb)) {
1981                         skip -= unix_skb_len(skb);
1982                         last = skb;
1983                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
1984                         if (!skb)
1985                                 goto again;
1986                 }
1987
1988                 unix_state_unlock(sk);
1989
1990                 if (check_creds) {
1991                         /* Never glue messages from different writers */
1992                         if ((UNIXCB(skb).pid  != scm.pid) ||
1993                             !uid_eq(UNIXCB(skb).uid, scm.creds.uid) ||
1994                             !gid_eq(UNIXCB(skb).gid, scm.creds.gid))
1995                                 break;
1996                 } else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
1997                         /* Copy credentials */
1998                         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
1999                         check_creds = 1;
2000                 }
2001
2002                 /* Copy address just once */
2003                 if (sunaddr) {
2004                         unix_copy_addr(msg, skb->sk);
2005                         sunaddr = NULL;
2006                 }
2007
2008                 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2009                 if (skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2010                                           msg, chunk)) {
2011                         if (copied == 0)
2012                                 copied = -EFAULT;
2013                         break;
2014                 }
2015                 copied += chunk;
2016                 size -= chunk;
2017
2018                 /* Mark read part of skb as used */
2019                 if (!(flags & MSG_PEEK)) {
2020                         UNIXCB(skb).consumed += chunk;
2021
2022                         sk_peek_offset_bwd(sk, chunk);
2023
2024                         if (UNIXCB(skb).fp)
2025                                 unix_detach_fds(&scm, skb);
2026
2027                         if (unix_skb_len(skb))
2028                                 break;
2029
2030                         skb_unlink(skb, &sk->sk_receive_queue);
2031                         consume_skb(skb);
2032
2033                         if (scm.fp)
2034                                 break;
2035                 } else {
2036                         /* It is questionable, see note in unix_dgram_recvmsg.
2037                          */
2038                         if (UNIXCB(skb).fp)
2039                                 scm.fp = scm_fp_dup(UNIXCB(skb).fp);
2040
2041                         sk_peek_offset_fwd(sk, chunk);
2042
2043                         break;
2044                 }
2045         } while (size);
2046
2047         mutex_unlock(&u->readlock);
2048         scm_recv(sock, msg, &scm, flags);
2049 out:
2050         return copied ? : err;
2051 }
2052
2053 static int unix_shutdown(struct socket *sock, int mode)
2054 {
2055         struct sock *sk = sock->sk;
2056         struct sock *other;
2057
2058         if (mode < SHUT_RD || mode > SHUT_RDWR)
2059                 return -EINVAL;
2060         /* This maps:
2061          * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2062          * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2063          * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2064          */
2065         ++mode;
2066
2067         unix_state_lock(sk);
2068         sk->sk_shutdown |= mode;
2069         other = unix_peer(sk);
2070         if (other)
2071                 sock_hold(other);
2072         unix_state_unlock(sk);
2073         sk->sk_state_change(sk);
2074
2075         if (other &&
2076                 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2077
2078                 int peer_mode = 0;
2079
2080                 if (mode&RCV_SHUTDOWN)
2081                         peer_mode |= SEND_SHUTDOWN;
2082                 if (mode&SEND_SHUTDOWN)
2083                         peer_mode |= RCV_SHUTDOWN;
2084                 unix_state_lock(other);
2085                 other->sk_shutdown |= peer_mode;
2086                 unix_state_unlock(other);
2087                 other->sk_state_change(other);
2088                 if (peer_mode == SHUTDOWN_MASK)
2089                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2090                 else if (peer_mode & RCV_SHUTDOWN)
2091                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2092         }
2093         if (other)
2094                 sock_put(other);
2095
2096         return 0;
2097 }
2098
2099 long unix_inq_len(struct sock *sk)
2100 {
2101         struct sk_buff *skb;
2102         long amount = 0;
2103
2104         if (sk->sk_state == TCP_LISTEN)
2105                 return -EINVAL;
2106
2107         spin_lock(&sk->sk_receive_queue.lock);
2108         if (sk->sk_type == SOCK_STREAM ||
2109             sk->sk_type == SOCK_SEQPACKET) {
2110                 skb_queue_walk(&sk->sk_receive_queue, skb)
2111                         amount += unix_skb_len(skb);
2112         } else {
2113                 skb = skb_peek(&sk->sk_receive_queue);
2114                 if (skb)
2115                         amount = skb->len;
2116         }
2117         spin_unlock(&sk->sk_receive_queue.lock);
2118
2119         return amount;
2120 }
2121 EXPORT_SYMBOL_GPL(unix_inq_len);
2122
/* Value reported for SIOCOUTQ: whatever sk_wmem_alloc_get() returns for @sk. */
long unix_outq_len(struct sock *sk)
{
	long queued = sk_wmem_alloc_get(sk);

	return queued;
}
2127 EXPORT_SYMBOL_GPL(unix_outq_len);
2128
2129 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2130 {
2131         struct sock *sk = sock->sk;
2132         long amount = 0;
2133         int err;
2134
2135         switch (cmd) {
2136         case SIOCOUTQ:
2137                 amount = unix_outq_len(sk);
2138                 err = put_user(amount, (int __user *)arg);
2139                 break;
2140         case SIOCINQ:
2141                 amount = unix_inq_len(sk);
2142                 if (amount < 0)
2143                         err = amount;
2144                 else
2145                         err = put_user(amount, (int __user *)arg);
2146                 break;
2147         default:
2148                 err = -ENOIOCTLCMD;
2149                 break;
2150         }
2151         return err;
2152 }
2153
2154 static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait)
2155 {
2156         struct sock *sk = sock->sk;
2157         unsigned int mask;
2158
2159         sock_poll_wait(file, sk_sleep(sk), wait);
2160         mask = 0;
2161
2162         /* exceptional events? */
2163         if (sk->sk_err)
2164                 mask |= POLLERR;
2165         if (sk->sk_shutdown == SHUTDOWN_MASK)
2166                 mask |= POLLHUP;
2167         if (sk->sk_shutdown & RCV_SHUTDOWN)
2168                 mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2169
2170         /* readable? */
2171         if (!skb_queue_empty(&sk->sk_receive_queue))
2172                 mask |= POLLIN | POLLRDNORM;
2173
2174         /* Connection-based need to check for termination and startup */
2175         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
2176             sk->sk_state == TCP_CLOSE)
2177                 mask |= POLLHUP;
2178
2179         /*
2180          * we set writable also when the other side has shut down the
2181          * connection. This prevents stuck sockets.
2182          */
2183         if (unix_writable(sk))
2184                 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2185
2186         return mask;
2187 }
2188
2189 static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
2190                                     poll_table *wait)
2191 {
2192         struct sock *sk = sock->sk, *other;
2193         unsigned int mask, writable;
2194
2195         sock_poll_wait(file, sk_sleep(sk), wait);
2196         mask = 0;
2197
2198         /* exceptional events? */
2199         if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
2200                 mask |= POLLERR |
2201                         (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
2202
2203         if (sk->sk_shutdown & RCV_SHUTDOWN)
2204                 mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2205         if (sk->sk_shutdown == SHUTDOWN_MASK)
2206                 mask |= POLLHUP;
2207
2208         /* readable? */
2209         if (!skb_queue_empty(&sk->sk_receive_queue))
2210                 mask |= POLLIN | POLLRDNORM;
2211
2212         /* Connection-based need to check for termination and startup */
2213         if (sk->sk_type == SOCK_SEQPACKET) {
2214                 if (sk->sk_state == TCP_CLOSE)
2215                         mask |= POLLHUP;
2216                 /* connection hasn't started yet? */
2217                 if (sk->sk_state == TCP_SYN_SENT)
2218                         return mask;
2219         }
2220
2221         /* No write status requested, avoid expensive OUT tests. */
2222         if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
2223                 return mask;
2224
2225         writable = unix_writable(sk);
2226         other = unix_peer_get(sk);
2227         if (other) {
2228                 if (unix_peer(other) != sk) {
2229                         sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
2230                         if (unix_recvq_full(other))
2231                                 writable = 0;
2232                 }
2233                 sock_put(other);
2234         }
2235
2236         if (writable)
2237                 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2238         else
2239                 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
2240
2241         return mask;
2242 }
2243
2244 #ifdef CONFIG_PROC_FS
2245
/*
 * A seq_file position packs two values into one long: the low BUCKET_SPACE
 * bits hold an offset within a hash bucket and the bits above them hold the
 * bucket index.
 */
#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

/* Extract the bucket index, extract the offset, or combine the two. */
#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) (((b) << BUCKET_SPACE) | (o))
2251
2252 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
2253 {
2254         unsigned long offset = get_offset(*pos);
2255         unsigned long bucket = get_bucket(*pos);
2256         struct sock *sk;
2257         unsigned long count = 0;
2258
2259         for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
2260                 if (sock_net(sk) != seq_file_net(seq))
2261                         continue;
2262                 if (++count == offset)
2263                         break;
2264         }
2265
2266         return sk;
2267 }
2268
2269 static struct sock *unix_next_socket(struct seq_file *seq,
2270                                      struct sock *sk,
2271                                      loff_t *pos)
2272 {
2273         unsigned long bucket;
2274
2275         while (sk > (struct sock *)SEQ_START_TOKEN) {
2276                 sk = sk_next(sk);
2277                 if (!sk)
2278                         goto next_bucket;
2279                 if (sock_net(sk) == seq_file_net(seq))
2280                         return sk;
2281         }
2282
2283         do {
2284                 sk = unix_from_bucket(seq, pos);
2285                 if (sk)
2286                         return sk;
2287
2288 next_bucket:
2289                 bucket = get_bucket(*pos) + 1;
2290                 *pos = set_bucket_offset(bucket, 1);
2291         } while (bucket < ARRAY_SIZE(unix_socket_table));
2292
2293         return NULL;
2294 }
2295
2296 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
2297         __acquires(unix_table_lock)
2298 {
2299         spin_lock(&unix_table_lock);
2300
2301         if (!*pos)
2302                 return SEQ_START_TOKEN;
2303
2304         if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
2305                 return NULL;
2306
2307         return unix_next_socket(seq, NULL, pos);
2308 }
2309
2310 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2311 {
2312         ++*pos;
2313         return unix_next_socket(seq, v, pos);
2314 }
2315
2316 static void unix_seq_stop(struct seq_file *seq, void *v)
2317         __releases(unix_table_lock)
2318 {
2319         spin_unlock(&unix_table_lock);
2320 }
2321
2322 static int unix_seq_show(struct seq_file *seq, void *v)
2323 {
2324
2325         if (v == SEQ_START_TOKEN)
2326                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
2327                          "Inode Path\n");
2328         else {
2329                 struct sock *s = v;
2330                 struct unix_sock *u = unix_sk(s);
2331                 unix_state_lock(s);
2332
2333                 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
2334                         s,
2335                         atomic_read(&s->sk_refcnt),
2336                         0,
2337                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
2338                         s->sk_type,
2339                         s->sk_socket ?
2340                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
2341                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
2342                         sock_i_ino(s));
2343
2344                 if (u->addr) {
2345                         int i, len;
2346                         seq_putc(seq, ' ');
2347
2348                         i = 0;
2349                         len = u->addr->len - sizeof(short);
2350                         if (!UNIX_ABSTRACT(s))
2351                                 len--;
2352                         else {
2353                                 seq_putc(seq, '@');
2354                                 i++;
2355                         }
2356                         for ( ; i < len; i++)
2357                                 seq_putc(seq, u->addr->name->sun_path[i]);
2358                 }
2359                 unix_state_unlock(s);
2360                 seq_putc(seq, '\n');
2361         }
2362
2363         return 0;
2364 }
2365
2366 static const struct seq_operations unix_seq_ops = {
2367         .start  = unix_seq_start,
2368         .next   = unix_seq_next,
2369         .stop   = unix_seq_stop,
2370         .show   = unix_seq_show,
2371 };
2372
2373 static int unix_seq_open(struct inode *inode, struct file *file)
2374 {
2375         return seq_open_net(inode, file, &unix_seq_ops,
2376                             sizeof(struct seq_net_private));
2377 }
2378
2379 static const struct file_operations unix_seq_fops = {
2380         .owner          = THIS_MODULE,
2381         .open           = unix_seq_open,
2382         .read           = seq_read,
2383         .llseek         = seq_lseek,
2384         .release        = seq_release_net,
2385 };
2386
2387 #endif
2388
2389 static const struct net_proto_family unix_family_ops = {
2390         .family = PF_UNIX,
2391         .create = unix_create,
2392         .owner  = THIS_MODULE,
2393 };
2394
2395
2396 static int __net_init unix_net_init(struct net *net)
2397 {
2398         int error = -ENOMEM;
2399
2400         net->unx.sysctl_max_dgram_qlen = 10;
2401         if (unix_sysctl_register(net))
2402                 goto out;
2403
2404 #ifdef CONFIG_PROC_FS
2405         if (!proc_create("unix", 0, net->proc_net, &unix_seq_fops)) {
2406                 unix_sysctl_unregister(net);
2407                 goto out;
2408         }
2409 #endif
2410         error = 0;
2411 out:
2412         return error;
2413 }
2414
2415 static void __net_exit unix_net_exit(struct net *net)
2416 {
2417         unix_sysctl_unregister(net);
2418         remove_proc_entry("unix", net->proc_net);
2419 }
2420
2421 static struct pernet_operations unix_net_ops = {
2422         .init = unix_net_init,
2423         .exit = unix_net_exit,
2424 };
2425
2426 static int __init af_unix_init(void)
2427 {
2428         int rc = -1;
2429
2430         BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));
2431
2432         rc = proto_register(&unix_proto, 1);
2433         if (rc != 0) {
2434                 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
2435                 goto out;
2436         }
2437
2438         sock_register(&unix_family_ops);
2439         register_pernet_subsys(&unix_net_ops);
2440 out:
2441         return rc;
2442 }
2443
2444 static void __exit af_unix_exit(void)
2445 {
2446         sock_unregister(PF_UNIX);
2447         proto_unregister(&unix_proto);
2448         unregister_pernet_subsys(&unix_net_ops);
2449 }
2450
2451 /* Earlier than device_initcall() so that other drivers invoking
2452    request_module() don't end up in a loop when modprobe tries
2453    to use a UNIX socket. But later than subsys_initcall() because
2454    we depend on stuff initialised there */
2455 fs_initcall(af_unix_init);
2456 module_exit(af_unix_exit);
2457
2458 MODULE_LICENSE("GPL");
2459 MODULE_ALIAS_NETPROTO(PF_UNIX);