net: Fix for dst_negative_advice
[karo-tx-linux.git] / net / core / sock.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Generic socket support routines. Memory allocators, socket lock/release
7  *              handler for protocols to use and generic option handler.
8  *
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Florian La Roche, <flla@stud.uni-sb.de>
13  *              Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *              Alan Cox        :       Numerous verify_area() problems
17  *              Alan Cox        :       Connecting on a connecting socket
18  *                                      now returns an error for tcp.
19  *              Alan Cox        :       sock->protocol is set correctly,
20  *                                      and is not sometimes left as 0.
21  *              Alan Cox        :       connect handles icmp errors on a
22  *                                      connect properly. Unfortunately there
23  *                                      is a restart syscall nasty there. I
24  *                                      can't match BSD without hacking the C
25  *                                      library. Ideas urgently sought!
26  *              Alan Cox        :       Disallow bind() to addresses that are
27  *                                      not ours - especially broadcast ones!!
28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
30  *                                      instead they leave that for the DESTROY timer.
31  *              Alan Cox        :       Clean up error flag in accept
32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
33  *                                      was buggy. Put a remove_sock() in the handler
34  *                                      for memory when we hit 0. Also altered the timer
35  *                                      code. The ACK stuff can wait and needs major
36  *                                      TCP layer surgery.
37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
38  *                                      and fixed timer/inet_bh race.
39  *              Alan Cox        :       Added zapped flag for TCP
40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
47  *      Pauline Middelink       :       identd support
48  *              Alan Cox        :       Fixed connect() taking signals I think.
49  *              Alan Cox        :       SO_LINGER supported
50  *              Alan Cox        :       Error reporting fixes
51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
52  *              Alan Cox        :       inet sockets don't set sk->type!
53  *              Alan Cox        :       Split socket option code
54  *              Alan Cox        :       Callbacks
55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
56  *              Alex            :       Removed restriction on inet fioctl
57  *              Alan Cox        :       Splitting INET from NET core
58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
60  *              Alan Cox        :       Split IP from generic code
61  *              Alan Cox        :       New kfree_skbmem()
62  *              Alan Cox        :       Make SO_DEBUG superuser only.
63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
64  *                                      (compatibility fix)
65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
66  *              Alan Cox        :       Allocator for a socket is settable.
67  *              Alan Cox        :       SO_ERROR includes soft errors.
68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
69  *              Alan Cox        :       Generic socket allocation to make hooks
70  *                                      easier (suggested by Craig Metz).
71  *              Michael Pall    :       SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
79  *              Andi Kleen      :       Fix write_space callback
80  *              Chris Evans     :       Security fixes - signedness again
81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  *
85  *
86  *              This program is free software; you can redistribute it and/or
87  *              modify it under the terms of the GNU General Public License
88  *              as published by the Free Software Foundation; either version
89  *              2 of the License, or (at your option) any later version.
90  */
91
92 #include <linux/capability.h>
93 #include <linux/errno.h>
94 #include <linux/types.h>
95 #include <linux/socket.h>
96 #include <linux/in.h>
97 #include <linux/kernel.h>
98 #include <linux/module.h>
99 #include <linux/proc_fs.h>
100 #include <linux/seq_file.h>
101 #include <linux/sched.h>
102 #include <linux/timer.h>
103 #include <linux/string.h>
104 #include <linux/sockios.h>
105 #include <linux/net.h>
106 #include <linux/mm.h>
107 #include <linux/slab.h>
108 #include <linux/interrupt.h>
109 #include <linux/poll.h>
110 #include <linux/tcp.h>
111 #include <linux/init.h>
112 #include <linux/highmem.h>
113
114 #include <asm/uaccess.h>
115 #include <asm/system.h>
116
117 #include <linux/netdevice.h>
118 #include <net/protocol.h>
119 #include <linux/skbuff.h>
120 #include <net/net_namespace.h>
121 #include <net/request_sock.h>
122 #include <net/sock.h>
123 #include <linux/net_tstamp.h>
124 #include <net/xfrm.h>
125 #include <linux/ipsec.h>
126
127 #include <linux/filter.h>
128
129 #ifdef CONFIG_INET
130 #include <net/tcp.h>
131 #endif
132
133 /*
134  * Each address family might have different locking rules, so we have
135  * one slock key per address family:
136  */
137 static struct lock_class_key af_family_keys[AF_MAX];
138 static struct lock_class_key af_family_slock_keys[AF_MAX];
139
140 /*
141  * Make lock validator output more readable. (we pre-construct these
142  * strings build-time, so that runtime initialization of socket
143  * locks is fast):
144  */
145 static const char *const af_family_key_strings[AF_MAX+1] = {
146   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
147   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
148   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
149   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
150   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
151   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
152   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
153   "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
154   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
155   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
156   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
157   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
158   "sk_lock-AF_IEEE802154",
159   "sk_lock-AF_MAX"
160 };
161 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
162   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
163   "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
164   "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
165   "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
166   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
167   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
168   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
169   "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
170   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
171   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
172   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
173   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
174   "slock-AF_IEEE802154",
175   "slock-AF_MAX"
176 };
177 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
178   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
179   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
180   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
181   "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
182   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
183   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
184   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
185   "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
186   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
187   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
188   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
189   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
190   "clock-AF_IEEE802154",
191   "clock-AF_MAX"
192 };
193
194 /*
195  * sk_callback_lock locking rules are per-address-family,
196  * so split the lock classes by using a per-AF key:
197  */
198 static struct lock_class_key af_callback_keys[AF_MAX];
199
200 /* Take into consideration the size of the struct sk_buff overhead in the
201  * determination of these values, since that is non-constant across
202  * platforms.  This makes socket queueing behavior and performance
203  * not depend upon such differences.
204  */
205 #define _SK_MEM_PACKETS         256
206 #define _SK_MEM_OVERHEAD        (sizeof(struct sk_buff) + 256)
207 #define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
208 #define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
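/*
 * Illustration only (not from the source): assuming sizeof(struct sk_buff)
 * is on the order of 200 bytes, _SK_MEM_OVERHEAD comes to roughly 456 bytes
 * and the limits above work out to about 256 * 456 ~= 114 KB of buffering
 * per socket.
 */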
209
210 /* Run time adjustable parameters. */
211 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
212 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
213 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
214 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
215
216 /* Maximal space eaten by iovec or ancillary data plus some space */
217 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
218 EXPORT_SYMBOL(sysctl_optmem_max);
219
220 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
221 {
222         struct timeval tv;
223
224         if (optlen < sizeof(tv))
225                 return -EINVAL;
226         if (copy_from_user(&tv, optval, sizeof(tv)))
227                 return -EFAULT;
228         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
229                 return -EDOM;
230
231         if (tv.tv_sec < 0) {
232                 static int warned __read_mostly;
233
234                 *timeo_p = 0;
235                 if (warned < 10 && net_ratelimit()) {
236                         warned++;
237                         printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
238                                "tries to set negative timeout\n",
239                                 current->comm, task_pid_nr(current));
240                 }
241                 return 0;
242         }
243         *timeo_p = MAX_SCHEDULE_TIMEOUT;
244         if (tv.tv_sec == 0 && tv.tv_usec == 0)
245                 return 0;
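        /*
         * Worked example (illustration only, assuming HZ == 100):
         * tv = { .tv_sec = 1, .tv_usec = 500000 } gives
         * 1 * 100 + (500000 + 9999) / 10000 = 150 jiffies, i.e. 1.5 s,
         * with the microseconds rounded up to the next tick.
         */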
246         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
247                 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
248         return 0;
249 }
250
251 static void sock_warn_obsolete_bsdism(const char *name)
252 {
253         static int warned;
254         static char warncomm[TASK_COMM_LEN];
255         if (strcmp(warncomm, current->comm) && warned < 5) {
256                 strcpy(warncomm,  current->comm);
257                 printk(KERN_WARNING "process `%s' is using obsolete "
258                        "%s SO_BSDCOMPAT\n", warncomm, name);
259                 warned++;
260         }
261 }
262
263 static void sock_disable_timestamp(struct sock *sk, int flag)
264 {
265         if (sock_flag(sk, flag)) {
266                 sock_reset_flag(sk, flag);
267                 if (!sock_flag(sk, SOCK_TIMESTAMP) &&
268                     !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) {
269                         net_disable_timestamp();
270                 }
271         }
272 }
273
274
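/*
 * Queue an skb on sk->sk_receive_queue: the packet is dropped if it would
 * push sk_rmem_alloc past sk_rcvbuf, if the socket filter rejects it, or if
 * receive-memory accounting fails; otherwise it is charged to the socket,
 * stamped with the current drop count and the reader is woken through
 * sk_data_ready().
 */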
275 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
276 {
277         int err;
278         int skb_len;
279         unsigned long flags;
280         struct sk_buff_head *list = &sk->sk_receive_queue;
281
282         /* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
283            number of warnings when compiling with -W --ANK
284          */
285         if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
286             (unsigned)sk->sk_rcvbuf) {
287                 atomic_inc(&sk->sk_drops);
288                 return -ENOMEM;
289         }
290
291         err = sk_filter(sk, skb);
292         if (err)
293                 return err;
294
295         if (!sk_rmem_schedule(sk, skb->truesize)) {
296                 atomic_inc(&sk->sk_drops);
297                 return -ENOBUFS;
298         }
299
300         skb->dev = NULL;
301         skb_set_owner_r(skb, sk);
302
303         /* Cache the SKB length before we tack it onto the receive
304          * queue.  Once it is added it no longer belongs to us and
305          * may be freed by other threads of control pulling packets
306          * from the queue.
307          */
308         skb_len = skb->len;
309
310         spin_lock_irqsave(&list->lock, flags);
311         skb->dropcount = atomic_read(&sk->sk_drops);
312         __skb_queue_tail(list, skb);
313         spin_unlock_irqrestore(&list->lock, flags);
314
315         if (!sock_flag(sk, SOCK_DEAD))
316                 sk->sk_data_ready(sk, skb_len);
317         return 0;
318 }
319 EXPORT_SYMBOL(sock_queue_rcv_skb);
320
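/*
 * Deliver an skb to a socket from softirq context: after the socket filter,
 * the packet is either processed immediately via sk_backlog_rcv() (with
 * trylock/unlock lockdep annotations) or, if the socket is currently owned
 * by a user context, parked on the backlog queue.  The caller's reference
 * on the socket is dropped before returning.
 */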
321 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
322 {
323         int rc = NET_RX_SUCCESS;
324
325         if (sk_filter(sk, skb))
326                 goto discard_and_relse;
327
328         skb->dev = NULL;
329
330         if (nested)
331                 bh_lock_sock_nested(sk);
332         else
333                 bh_lock_sock(sk);
334         if (!sock_owned_by_user(sk)) {
335                 /*
336                  * trylock + unlock semantics:
337                  */
338                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
339
340                 rc = sk_backlog_rcv(sk, skb);
341
342                 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
343         } else
344                 sk_add_backlog(sk, skb);
345         bh_unlock_sock(sk);
346 out:
347         sock_put(sk);
348         return rc;
349 discard_and_relse:
350         kfree_skb(skb);
351         goto out;
352 }
353 EXPORT_SYMBOL(sk_receive_skb);
354
355 void sk_reset_txq(struct sock *sk)
356 {
357         sk_tx_queue_clear(sk);
358 }
359 EXPORT_SYMBOL(sk_reset_txq);
360
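/*
 * Return the socket's cached route, or NULL after clearing the cache when
 * the entry has become obsolete according to dst->ops->check().  This __
 * variant touches sk_dst_cache directly and relies on the caller for
 * serialization (typically the socket lock); sk_dst_check() below goes
 * through sk_dst_get()/sk_dst_reset() instead.
 */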
361 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
362 {
363         struct dst_entry *dst = sk->sk_dst_cache;
364
365         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
366                 sk_tx_queue_clear(sk);
367                 sk->sk_dst_cache = NULL;
368                 dst_release(dst);
369                 return NULL;
370         }
371
372         return dst;
373 }
374 EXPORT_SYMBOL(__sk_dst_check);
375
376 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
377 {
378         struct dst_entry *dst = sk_dst_get(sk);
379
380         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
381                 sk_dst_reset(sk);
382                 dst_release(dst);
383                 return NULL;
384         }
385
386         return dst;
387 }
388 EXPORT_SYMBOL(sk_dst_check);
389
390 static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
391 {
392         int ret = -ENOPROTOOPT;
393 #ifdef CONFIG_NETDEVICES
394         struct net *net = sock_net(sk);
395         char devname[IFNAMSIZ];
396         int index;
397
398         /* Sorry... */
399         ret = -EPERM;
400         if (!capable(CAP_NET_RAW))
401                 goto out;
402
403         ret = -EINVAL;
404         if (optlen < 0)
405                 goto out;
406
407         /* Bind this socket to a particular device like "eth0",
408          * as specified in the passed interface name. If the
409          * name is "" or the option length is zero the socket
410          * is not bound.
411          */
412         if (optlen > IFNAMSIZ - 1)
413                 optlen = IFNAMSIZ - 1;
414         memset(devname, 0, sizeof(devname));
415
416         ret = -EFAULT;
417         if (copy_from_user(devname, optval, optlen))
418                 goto out;
419
420         if (devname[0] == '\0') {
421                 index = 0;
422         } else {
423                 struct net_device *dev = dev_get_by_name(net, devname);
424
425                 ret = -ENODEV;
426                 if (!dev)
427                         goto out;
428
429                 index = dev->ifindex;
430                 dev_put(dev);
431         }
432
433         lock_sock(sk);
434         sk->sk_bound_dev_if = index;
435         sk_dst_reset(sk);
436         release_sock(sk);
437
438         ret = 0;
439
440 out:
441 #endif
442
443         return ret;
444 }
445
446 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
447 {
448         if (valbool)
449                 sock_set_flag(sk, bit);
450         else
451                 sock_reset_flag(sk, bit);
452 }
453
454 /*
455  *      This is meant for all protocols to use and covers goings on
456  *      at the socket level. Everything here is generic.
457  */
458
459 int sock_setsockopt(struct socket *sock, int level, int optname,
460                     char __user *optval, unsigned int optlen)
461 {
462         struct sock *sk = sock->sk;
463         int val;
464         int valbool;
465         struct linger ling;
466         int ret = 0;
467
468         /*
469          *      Options without arguments
470          */
471
472         if (optname == SO_BINDTODEVICE)
473                 return sock_bindtodevice(sk, optval, optlen);
474
475         if (optlen < sizeof(int))
476                 return -EINVAL;
477
478         if (get_user(val, (int __user *)optval))
479                 return -EFAULT;
480
481         valbool = val ? 1 : 0;
482
483         lock_sock(sk);
484
485         switch (optname) {
486         case SO_DEBUG:
487                 if (val && !capable(CAP_NET_ADMIN))
488                         ret = -EACCES;
489                 else
490                         sock_valbool_flag(sk, SOCK_DBG, valbool);
491                 break;
492         case SO_REUSEADDR:
493                 sk->sk_reuse = valbool;
494                 break;
495         case SO_TYPE:
496         case SO_PROTOCOL:
497         case SO_DOMAIN:
498         case SO_ERROR:
499                 ret = -ENOPROTOOPT;
500                 break;
501         case SO_DONTROUTE:
502                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
503                 break;
504         case SO_BROADCAST:
505                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
506                 break;
507         case SO_SNDBUF:
508                 /* Don't error on this; BSD doesn't, and if you think
509                    about it this is right. Otherwise apps have to
510                    play 'guess the biggest size' games. RCVBUF/SNDBUF
511                    are treated in BSD as hints */
512
513                 if (val > sysctl_wmem_max)
514                         val = sysctl_wmem_max;
515 set_sndbuf:
516                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
517                 if ((val * 2) < SOCK_MIN_SNDBUF)
518                         sk->sk_sndbuf = SOCK_MIN_SNDBUF;
519                 else
520                         sk->sk_sndbuf = val * 2;
521
522                 /*
523                  *      Wake up sending tasks if we
524                  *      upped the value.
525                  */
526                 sk->sk_write_space(sk);
527                 break;
528
529         case SO_SNDBUFFORCE:
530                 if (!capable(CAP_NET_ADMIN)) {
531                         ret = -EPERM;
532                         break;
533                 }
534                 goto set_sndbuf;
535
536         case SO_RCVBUF:
537                 /* Don't error on this; BSD doesn't, and if you think
538                    about it this is right. Otherwise apps have to
539                    play 'guess the biggest size' games. RCVBUF/SNDBUF
540                    are treated in BSD as hints */
541
542                 if (val > sysctl_rmem_max)
543                         val = sysctl_rmem_max;
544 set_rcvbuf:
545                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
546                 /*
547                  * We double it on the way in to account for
548                  * "struct sk_buff" etc. overhead.   Applications
549                  * assume that the SO_RCVBUF setting they make will
550                  * allow that much actual data to be received on that
551                  * socket.
552                  *
553                  * Applications are unaware that "struct sk_buff" and
554                  * other overheads allocate from the receive buffer
555                  * during socket buffer allocation.
556                  *
557                  * And after considering the possible alternatives,
558                  * returning the value we actually used in getsockopt
559                  * is the most desirable behavior.
560                  */
561                 if ((val * 2) < SOCK_MIN_RCVBUF)
562                         sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
563                 else
564                         sk->sk_rcvbuf = val * 2;
565                 break;
566
567         case SO_RCVBUFFORCE:
568                 if (!capable(CAP_NET_ADMIN)) {
569                         ret = -EPERM;
570                         break;
571                 }
572                 goto set_rcvbuf;
573
574         case SO_KEEPALIVE:
575 #ifdef CONFIG_INET
576                 if (sk->sk_protocol == IPPROTO_TCP)
577                         tcp_set_keepalive(sk, valbool);
578 #endif
579                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
580                 break;
581
582         case SO_OOBINLINE:
583                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
584                 break;
585
586         case SO_NO_CHECK:
587                 sk->sk_no_check = valbool;
588                 break;
589
590         case SO_PRIORITY:
591                 if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
592                         sk->sk_priority = val;
593                 else
594                         ret = -EPERM;
595                 break;
596
597         case SO_LINGER:
598                 if (optlen < sizeof(ling)) {
599                         ret = -EINVAL;  /* 1003.1g */
600                         break;
601                 }
602                 if (copy_from_user(&ling, optval, sizeof(ling))) {
603                         ret = -EFAULT;
604                         break;
605                 }
606                 if (!ling.l_onoff)
607                         sock_reset_flag(sk, SOCK_LINGER);
608                 else {
609 #if (BITS_PER_LONG == 32)
610                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
611                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
612                         else
613 #endif
614                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
615                         sock_set_flag(sk, SOCK_LINGER);
616                 }
617                 break;
618
619         case SO_BSDCOMPAT:
620                 sock_warn_obsolete_bsdism("setsockopt");
621                 break;
622
623         case SO_PASSCRED:
624                 if (valbool)
625                         set_bit(SOCK_PASSCRED, &sock->flags);
626                 else
627                         clear_bit(SOCK_PASSCRED, &sock->flags);
628                 break;
629
630         case SO_TIMESTAMP:
631         case SO_TIMESTAMPNS:
632                 if (valbool)  {
633                         if (optname == SO_TIMESTAMP)
634                                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
635                         else
636                                 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
637                         sock_set_flag(sk, SOCK_RCVTSTAMP);
638                         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
639                 } else {
640                         sock_reset_flag(sk, SOCK_RCVTSTAMP);
641                         sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
642                 }
643                 break;
644
645         case SO_TIMESTAMPING:
646                 if (val & ~SOF_TIMESTAMPING_MASK) {
647                         ret = -EINVAL;
648                         break;
649                 }
650                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
651                                   val & SOF_TIMESTAMPING_TX_HARDWARE);
652                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
653                                   val & SOF_TIMESTAMPING_TX_SOFTWARE);
654                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
655                                   val & SOF_TIMESTAMPING_RX_HARDWARE);
656                 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
657                         sock_enable_timestamp(sk,
658                                               SOCK_TIMESTAMPING_RX_SOFTWARE);
659                 else
660                         sock_disable_timestamp(sk,
661                                                SOCK_TIMESTAMPING_RX_SOFTWARE);
662                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
663                                   val & SOF_TIMESTAMPING_SOFTWARE);
664                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
665                                   val & SOF_TIMESTAMPING_SYS_HARDWARE);
666                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
667                                   val & SOF_TIMESTAMPING_RAW_HARDWARE);
668                 break;
669
670         case SO_RCVLOWAT:
671                 if (val < 0)
672                         val = INT_MAX;
673                 sk->sk_rcvlowat = val ? : 1;
674                 break;
675
676         case SO_RCVTIMEO:
677                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
678                 break;
679
680         case SO_SNDTIMEO:
681                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
682                 break;
683
684         case SO_ATTACH_FILTER:
685                 ret = -EINVAL;
686                 if (optlen == sizeof(struct sock_fprog)) {
687                         struct sock_fprog fprog;
688
689                         ret = -EFAULT;
690                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
691                                 break;
692
693                         ret = sk_attach_filter(&fprog, sk);
694                 }
695                 break;
696
697         case SO_DETACH_FILTER:
698                 ret = sk_detach_filter(sk);
699                 break;
700
701         case SO_PASSSEC:
702                 if (valbool)
703                         set_bit(SOCK_PASSSEC, &sock->flags);
704                 else
705                         clear_bit(SOCK_PASSSEC, &sock->flags);
706                 break;
707         case SO_MARK:
708                 if (!capable(CAP_NET_ADMIN))
709                         ret = -EPERM;
710                 else
711                         sk->sk_mark = val;
712                 break;
713
714                 /* We implement the SO_SNDLOWAT etc to
715                    not be settable (1003.1g 5.3) */
716         case SO_RXQ_OVFL:
717                 if (valbool)
718                         sock_set_flag(sk, SOCK_RXQ_OVFL);
719                 else
720                         sock_reset_flag(sk, SOCK_RXQ_OVFL);
721                 break;
722         default:
723                 ret = -ENOPROTOOPT;
724                 break;
725         }
726         release_sock(sk);
727         return ret;
728 }
729 EXPORT_SYMBOL(sock_setsockopt);
730
731
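/*
 * Generic SO_* read path: the requested value is built in a small union
 * (int, struct linger or struct timeval), at most min(len, lv) bytes are
 * copied back to user space, and the length actually used is reported
 * through optlen.
 */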
732 int sock_getsockopt(struct socket *sock, int level, int optname,
733                     char __user *optval, int __user *optlen)
734 {
735         struct sock *sk = sock->sk;
736
737         union {
738                 int val;
739                 struct linger ling;
740                 struct timeval tm;
741         } v;
742
743         unsigned int lv = sizeof(int);
744         int len;
745
746         if (get_user(len, optlen))
747                 return -EFAULT;
748         if (len < 0)
749                 return -EINVAL;
750
751         memset(&v, 0, sizeof(v));
752
753         switch (optname) {
754         case SO_DEBUG:
755                 v.val = sock_flag(sk, SOCK_DBG);
756                 break;
757
758         case SO_DONTROUTE:
759                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
760                 break;
761
762         case SO_BROADCAST:
763                 v.val = !!sock_flag(sk, SOCK_BROADCAST);
764                 break;
765
766         case SO_SNDBUF:
767                 v.val = sk->sk_sndbuf;
768                 break;
769
770         case SO_RCVBUF:
771                 v.val = sk->sk_rcvbuf;
772                 break;
773
774         case SO_REUSEADDR:
775                 v.val = sk->sk_reuse;
776                 break;
777
778         case SO_KEEPALIVE:
779                 v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
780                 break;
781
782         case SO_TYPE:
783                 v.val = sk->sk_type;
784                 break;
785
786         case SO_PROTOCOL:
787                 v.val = sk->sk_protocol;
788                 break;
789
790         case SO_DOMAIN:
791                 v.val = sk->sk_family;
792                 break;
793
794         case SO_ERROR:
795                 v.val = -sock_error(sk);
796                 if (v.val == 0)
797                         v.val = xchg(&sk->sk_err_soft, 0);
798                 break;
799
800         case SO_OOBINLINE:
801                 v.val = !!sock_flag(sk, SOCK_URGINLINE);
802                 break;
803
804         case SO_NO_CHECK:
805                 v.val = sk->sk_no_check;
806                 break;
807
808         case SO_PRIORITY:
809                 v.val = sk->sk_priority;
810                 break;
811
812         case SO_LINGER:
813                 lv              = sizeof(v.ling);
814                 v.ling.l_onoff  = !!sock_flag(sk, SOCK_LINGER);
815                 v.ling.l_linger = sk->sk_lingertime / HZ;
816                 break;
817
818         case SO_BSDCOMPAT:
819                 sock_warn_obsolete_bsdism("getsockopt");
820                 break;
821
822         case SO_TIMESTAMP:
823                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
824                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
825                 break;
826
827         case SO_TIMESTAMPNS:
828                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
829                 break;
830
831         case SO_TIMESTAMPING:
832                 v.val = 0;
833                 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
834                         v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
835                 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
836                         v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
837                 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
838                         v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
839                 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
840                         v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
841                 if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
842                         v.val |= SOF_TIMESTAMPING_SOFTWARE;
843                 if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
844                         v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
845                 if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
846                         v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
847                 break;
848
849         case SO_RCVTIMEO:
850                 lv = sizeof(struct timeval);
851                 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
852                         v.tm.tv_sec = 0;
853                         v.tm.tv_usec = 0;
854                 } else {
855                         v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
856                         v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
857                 }
858                 break;
859
860         case SO_SNDTIMEO:
861                 lv = sizeof(struct timeval);
862                 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
863                         v.tm.tv_sec = 0;
864                         v.tm.tv_usec = 0;
865                 } else {
866                         v.tm.tv_sec = sk->sk_sndtimeo / HZ;
867                         v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
868                 }
869                 break;
870
871         case SO_RCVLOWAT:
872                 v.val = sk->sk_rcvlowat;
873                 break;
874
875         case SO_SNDLOWAT:
876                 v.val = 1;
877                 break;
878
879         case SO_PASSCRED:
880                 v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
881                 break;
882
883         case SO_PEERCRED:
884                 if (len > sizeof(sk->sk_peercred))
885                         len = sizeof(sk->sk_peercred);
886                 if (copy_to_user(optval, &sk->sk_peercred, len))
887                         return -EFAULT;
888                 goto lenout;
889
890         case SO_PEERNAME:
891         {
892                 char address[128];
893
894                 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
895                         return -ENOTCONN;
896                 if (lv < len)
897                         return -EINVAL;
898                 if (copy_to_user(optval, address, len))
899                         return -EFAULT;
900                 goto lenout;
901         }
902
903         /* Dubious BSD thing... Probably nobody even uses it, but
904          * the UNIX standard wants it for whatever reason... -DaveM
905          */
906         case SO_ACCEPTCONN:
907                 v.val = sk->sk_state == TCP_LISTEN;
908                 break;
909
910         case SO_PASSSEC:
911                 v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
912                 break;
913
914         case SO_PEERSEC:
915                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
916
917         case SO_MARK:
918                 v.val = sk->sk_mark;
919                 break;
920
921         case SO_RXQ_OVFL:
922                 v.val = !!sock_flag(sk, SOCK_RXQ_OVFL);
923                 break;
924
925         default:
926                 return -ENOPROTOOPT;
927         }
928
929         if (len > lv)
930                 len = lv;
931         if (copy_to_user(optval, &v, len))
932                 return -EFAULT;
933 lenout:
934         if (put_user(len, optlen))
935                 return -EFAULT;
936         return 0;
937 }
938
939 /*
940  * Initialize an sk_lock.
941  *
942  * (We also register the sk_lock with the lock validator.)
943  */
944 static inline void sock_lock_init(struct sock *sk)
945 {
946         sock_lock_init_class_and_name(sk,
947                         af_family_slock_key_strings[sk->sk_family],
948                         af_family_slock_keys + sk->sk_family,
949                         af_family_key_strings[sk->sk_family],
950                         af_family_keys + sk->sk_family);
951 }
952
953 /*
954  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
955  * even temporarily, because of RCU lookups. sk_node should also be left as is.
956  */
957 static void sock_copy(struct sock *nsk, const struct sock *osk)
958 {
959 #ifdef CONFIG_SECURITY_NETWORK
960         void *sptr = nsk->sk_security;
961 #endif
962         BUILD_BUG_ON(offsetof(struct sock, sk_copy_start) !=
963                      sizeof(osk->sk_node) + sizeof(osk->sk_refcnt) +
964                      sizeof(osk->sk_tx_queue_mapping));
965         memcpy(&nsk->sk_copy_start, &osk->sk_copy_start,
966                osk->sk_prot->obj_size - offsetof(struct sock, sk_copy_start));
967 #ifdef CONFIG_SECURITY_NETWORK
968         nsk->sk_security = sptr;
969         security_sk_clone(osk, nsk);
970 #endif
971 }
972
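/*
 * Allocate a struct sock from the protocol's slab cache when one is
 * registered, falling back to kmalloc() otherwise.  __GFP_ZERO is handled
 * by hand so that sk_node.next survives for caches created with
 * SLAB_DESTROY_BY_RCU.  On success the LSM security state is allocated and
 * a reference is taken on the owning module.
 */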
973 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
974                 int family)
975 {
976         struct sock *sk;
977         struct kmem_cache *slab;
978
979         slab = prot->slab;
980         if (slab != NULL) {
981                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
982                 if (!sk)
983                         return sk;
984                 if (priority & __GFP_ZERO) {
985                         /*
986                          * caches using SLAB_DESTROY_BY_RCU should leave
987                          * sk_node.next unmodified. Special care is taken
988                          * when initializing the object to zero.
989                          */
990                         if (offsetof(struct sock, sk_node.next) != 0)
991                                 memset(sk, 0, offsetof(struct sock, sk_node.next));
992                         memset(&sk->sk_node.pprev, 0,
993                                prot->obj_size - offsetof(struct sock,
994                                                          sk_node.pprev));
995                 }
996         }
997         else
998                 sk = kmalloc(prot->obj_size, priority);
999
1000         if (sk != NULL) {
1001                 kmemcheck_annotate_bitfield(sk, flags);
1002
1003                 if (security_sk_alloc(sk, family, priority))
1004                         goto out_free;
1005
1006                 if (!try_module_get(prot->owner))
1007                         goto out_free_sec;
1008                 sk_tx_queue_clear(sk);
1009         }
1010
1011         return sk;
1012
1013 out_free_sec:
1014         security_sk_free(sk);
1015 out_free:
1016         if (slab != NULL)
1017                 kmem_cache_free(slab, sk);
1018         else
1019                 kfree(sk);
1020         return NULL;
1021 }
1022
1023 static void sk_prot_free(struct proto *prot, struct sock *sk)
1024 {
1025         struct kmem_cache *slab;
1026         struct module *owner;
1027
1028         owner = prot->owner;
1029         slab = prot->slab;
1030
1031         security_sk_free(sk);
1032         if (slab != NULL)
1033                 kmem_cache_free(slab, sk);
1034         else
1035                 kfree(sk);
1036         module_put(owner);
1037 }
1038
1039 /**
1040  *      sk_alloc - All socket objects are allocated here
1041  *      @net: the applicable net namespace
1042  *      @family: protocol family
1043  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1044  *      @prot: struct proto associated with this new sock instance
1045  */
1046 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1047                       struct proto *prot)
1048 {
1049         struct sock *sk;
1050
1051         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1052         if (sk) {
1053                 sk->sk_family = family;
1054                 /*
1055                  * See comment in struct sock definition to understand
1056                  * why we need sk_prot_creator -acme
1057                  */
1058                 sk->sk_prot = sk->sk_prot_creator = prot;
1059                 sock_lock_init(sk);
1060                 sock_net_set(sk, get_net(net));
1061                 atomic_set(&sk->sk_wmem_alloc, 1);
1062         }
1063
1064         return sk;
1065 }
1066 EXPORT_SYMBOL(sk_alloc);
1067
1068 static void __sk_free(struct sock *sk)
1069 {
1070         struct sk_filter *filter;
1071
1072         if (sk->sk_destruct)
1073                 sk->sk_destruct(sk);
1074
1075         filter = rcu_dereference(sk->sk_filter);
1076         if (filter) {
1077                 sk_filter_uncharge(sk, filter);
1078                 rcu_assign_pointer(sk->sk_filter, NULL);
1079         }
1080
1081         sock_disable_timestamp(sk, SOCK_TIMESTAMP);
1082         sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);
1083
1084         if (atomic_read(&sk->sk_omem_alloc))
1085                 printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
1086                        __func__, atomic_read(&sk->sk_omem_alloc));
1087
1088         put_net(sock_net(sk));
1089         sk_prot_free(sk->sk_prot_creator, sk);
1090 }
1091
1092 void sk_free(struct sock *sk)
1093 {
1094         /*
1095          * We subtract one from sk_wmem_alloc and can tell whether
1096          * some packets are still in some tx queue.
1097          * If non-zero, sock_wfree() will call __sk_free(sk) later
1098          */
1099         if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1100                 __sk_free(sk);
1101 }
1102 EXPORT_SYMBOL(sk_free);
1103
1104 /*
1105  * Last sock_put should drop the reference to sk->sk_net. It has already
1106  * been dropped in sk_change_net. Taking a reference to the stopping namespace
1107  * is not an option.
1108  * Take a reference to the socket to remove it from the hash _alive_ and after that
1109  * destroy it in the context of init_net.
1110  */
1111 void sk_release_kernel(struct sock *sk)
1112 {
1113         if (sk == NULL || sk->sk_socket == NULL)
1114                 return;
1115
1116         sock_hold(sk);
1117         sock_release(sk->sk_socket);
1118         release_net(sock_net(sk));
1119         sock_net_set(sk, get_net(&init_net));
1120         sock_put(sk);
1121 }
1122 EXPORT_SYMBOL(sk_release_kernel);
1123
1124 struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
1125 {
1126         struct sock *newsk;
1127
1128         newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1129         if (newsk != NULL) {
1130                 struct sk_filter *filter;
1131
1132                 sock_copy(newsk, sk);
1133
1134                 /* SANITY */
1135                 get_net(sock_net(newsk));
1136                 sk_node_init(&newsk->sk_node);
1137                 sock_lock_init(newsk);
1138                 bh_lock_sock(newsk);
1139                 newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1140
1141                 atomic_set(&newsk->sk_rmem_alloc, 0);
1142                 /*
1143                  * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1144                  */
1145                 atomic_set(&newsk->sk_wmem_alloc, 1);
1146                 atomic_set(&newsk->sk_omem_alloc, 0);
1147                 skb_queue_head_init(&newsk->sk_receive_queue);
1148                 skb_queue_head_init(&newsk->sk_write_queue);
1149 #ifdef CONFIG_NET_DMA
1150                 skb_queue_head_init(&newsk->sk_async_wait_queue);
1151 #endif
1152
1153                 rwlock_init(&newsk->sk_dst_lock);
1154                 rwlock_init(&newsk->sk_callback_lock);
1155                 lockdep_set_class_and_name(&newsk->sk_callback_lock,
1156                                 af_callback_keys + newsk->sk_family,
1157                                 af_family_clock_key_strings[newsk->sk_family]);
1158
1159                 newsk->sk_dst_cache     = NULL;
1160                 newsk->sk_wmem_queued   = 0;
1161                 newsk->sk_forward_alloc = 0;
1162                 newsk->sk_send_head     = NULL;
1163                 newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1164
1165                 sock_reset_flag(newsk, SOCK_DONE);
1166                 skb_queue_head_init(&newsk->sk_error_queue);
1167
1168                 filter = newsk->sk_filter;
1169                 if (filter != NULL)
1170                         sk_filter_charge(newsk, filter);
1171
1172                 if (unlikely(xfrm_sk_clone_policy(newsk))) {
1173                         /* It is still raw copy of parent, so invalidate
1174                          * destructor and make plain sk_free() */
1175                         newsk->sk_destruct = NULL;
1176                         sk_free(newsk);
1177                         newsk = NULL;
1178                         goto out;
1179                 }
1180
1181                 newsk->sk_err      = 0;
1182                 newsk->sk_priority = 0;
1183                 /*
1184                  * Before updating sk_refcnt, we must commit prior changes to memory
1185                  * (Documentation/RCU/rculist_nulls.txt for details)
1186                  */
1187                 smp_wmb();
1188                 atomic_set(&newsk->sk_refcnt, 2);
1189
1190                 /*
1191                  * Increment the counter in the same struct proto as the master
1192                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1193                  * is the same as sk->sk_prot->socks, as this field was copied
1194                  * with memcpy).
1195                  *
1196                  * This _changes_ the previous behaviour, where
1197                  * tcp_create_openreq_child always was incrementing the
1198                  * equivalent to tcp_prot->socks (inet_sock_nr), so this has
1199                  * to be taken into account in all callers. -acme
1200                  */
1201                 sk_refcnt_debug_inc(newsk);
1202                 sk_set_socket(newsk, NULL);
1203                 newsk->sk_sleep  = NULL;
1204
1205                 if (newsk->sk_prot->sockets_allocated)
1206                         percpu_counter_inc(newsk->sk_prot->sockets_allocated);
1207         }
1208 out:
1209         return newsk;
1210 }
1211 EXPORT_SYMBOL_GPL(sk_clone);
1212
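/*
 * Install dst as the socket's cached route and derive the socket's offload
 * capabilities (GSO, scatter-gather, checksum offload, gso_max_size) from
 * the output device's feature flags.
 */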
1213 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1214 {
1215         __sk_dst_set(sk, dst);
1216         sk->sk_route_caps = dst->dev->features;
1217         if (sk->sk_route_caps & NETIF_F_GSO)
1218                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1219         if (sk_can_gso(sk)) {
1220                 if (dst->header_len) {
1221                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1222                 } else {
1223                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1224                         sk->sk_gso_max_size = dst->dev->gso_max_size;
1225                 }
1226         }
1227 }
1228 EXPORT_SYMBOL_GPL(sk_setup_caps);
1229
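/*
 * Scale the default socket buffer limits with available memory.  With 4 KB
 * pages the thresholds below correspond to roughly 16 MB and 512 MB of RAM
 * (illustrative figures; the page size is architecture dependent).
 */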
1230 void __init sk_init(void)
1231 {
1232         if (totalram_pages <= 4096) {
1233                 sysctl_wmem_max = 32767;
1234                 sysctl_rmem_max = 32767;
1235                 sysctl_wmem_default = 32767;
1236                 sysctl_rmem_default = 32767;
1237         } else if (totalram_pages >= 131072) {
1238                 sysctl_wmem_max = 131071;
1239                 sysctl_rmem_max = 131071;
1240         }
1241 }
1242
1243 /*
1244  *      Simple resource managers for sockets.
1245  */
1246
1247
1248 /*
1249  * Write buffer destructor automatically called from kfree_skb.
1250  */
1251 void sock_wfree(struct sk_buff *skb)
1252 {
1253         struct sock *sk = skb->sk;
1254         unsigned int len = skb->truesize;
1255
1256         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1257                 /*
1258                  * Keep a reference on sk_wmem_alloc, this will be released
1259                  * after sk_write_space() call
1260                  */
1261                 atomic_sub(len - 1, &sk->sk_wmem_alloc);
1262                 sk->sk_write_space(sk);
1263                 len = 1;
1264         }
1265         /*
1266          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1267          * could not do because of in-flight packets
1268          */
1269         if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1270                 __sk_free(sk);
1271 }
1272 EXPORT_SYMBOL(sock_wfree);
1273
1274 /*
1275  * Read buffer destructor automatically called from kfree_skb.
1276  */
1277 void sock_rfree(struct sk_buff *skb)
1278 {
1279         struct sock *sk = skb->sk;
1280
1281         atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
1282         sk_mem_uncharge(skb->sk, skb->truesize);
1283 }
1284 EXPORT_SYMBOL(sock_rfree);
1285
1286
1287 int sock_i_uid(struct sock *sk)
1288 {
1289         int uid;
1290
1291         read_lock(&sk->sk_callback_lock);
1292         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1293         read_unlock(&sk->sk_callback_lock);
1294         return uid;
1295 }
1296 EXPORT_SYMBOL(sock_i_uid);
1297
1298 unsigned long sock_i_ino(struct sock *sk)
1299 {
1300         unsigned long ino;
1301
1302         read_lock(&sk->sk_callback_lock);
1303         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1304         read_unlock(&sk->sk_callback_lock);
1305         return ino;
1306 }
1307 EXPORT_SYMBOL(sock_i_ino);
1308
1309 /*
1310  * Allocate a skb from the socket's send buffer.
1311  */
1312 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1313                              gfp_t priority)
1314 {
1315         if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1316                 struct sk_buff *skb = alloc_skb(size, priority);
1317                 if (skb) {
1318                         skb_set_owner_w(skb, sk);
1319                         return skb;
1320                 }
1321         }
1322         return NULL;
1323 }
1324 EXPORT_SYMBOL(sock_wmalloc);
1325
1326 /*
1327  * Allocate a skb from the socket's receive buffer.
1328  */
1329 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1330                              gfp_t priority)
1331 {
1332         if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1333                 struct sk_buff *skb = alloc_skb(size, priority);
1334                 if (skb) {
1335                         skb_set_owner_r(skb, sk);
1336                         return skb;
1337                 }
1338         }
1339         return NULL;
1340 }
1341
1342 /*
1343  * Allocate a memory block from the socket's option memory buffer.
1344  */
1345 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1346 {
1347         if ((unsigned)size <= sysctl_optmem_max &&
1348             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1349                 void *mem;
1350                 /* First do the add, to avoid the race if kmalloc
1351                  * might sleep.
1352                  */
1353                 atomic_add(size, &sk->sk_omem_alloc);
1354                 mem = kmalloc(size, priority);
1355                 if (mem)
1356                         return mem;
1357                 atomic_sub(size, &sk->sk_omem_alloc);
1358         }
1359         return NULL;
1360 }
1361 EXPORT_SYMBOL(sock_kmalloc);
1362
1363 /*
1364  * Free an option memory block.
1365  */
1366 void sock_kfree_s(struct sock *sk, void *mem, int size)
1367 {
1368         kfree(mem);
1369         atomic_sub(size, &sk->sk_omem_alloc);
1370 }
1371 EXPORT_SYMBOL(sock_kfree_s);
1372
1373 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1374    I think these locks should be removed for datagram sockets.
1375  */
1376 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1377 {
1378         DEFINE_WAIT(wait);
1379
1380         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1381         for (;;) {
1382                 if (!timeo)
1383                         break;
1384                 if (signal_pending(current))
1385                         break;
1386                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1387                 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1388                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1389                         break;
1390                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1391                         break;
1392                 if (sk->sk_err)
1393                         break;
1394                 timeo = schedule_timeout(timeo);
1395         }
1396         finish_wait(sk->sk_sleep, &wait);
1397         return timeo;
1398 }
1399
1400
1401 /*
1402  *      Generic send/receive buffer handlers
1403  */
1404
1405 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1406                                      unsigned long data_len, int noblock,
1407                                      int *errcode)
1408 {
1409         struct sk_buff *skb;
1410         gfp_t gfp_mask;
1411         long timeo;
1412         int err;
1413
1414         gfp_mask = sk->sk_allocation;
1415         if (gfp_mask & __GFP_WAIT)
1416                 gfp_mask |= __GFP_REPEAT;
1417
1418         timeo = sock_sndtimeo(sk, noblock);
1419         while (1) {
1420                 err = sock_error(sk);
1421                 if (err != 0)
1422                         goto failure;
1423
1424                 err = -EPIPE;
1425                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1426                         goto failure;
1427
1428                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1429                         skb = alloc_skb(header_len, gfp_mask);
1430                         if (skb) {
1431                                 int npages;
1432                                 int i;
1433
1434                                 /* No pages, we're done... */
1435                                 if (!data_len)
1436                                         break;
1437
1438                                 npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1439                                 skb->truesize += data_len;
1440                                 skb_shinfo(skb)->nr_frags = npages;
1441                                 for (i = 0; i < npages; i++) {
1442                                         struct page *page;
1443                                         skb_frag_t *frag;
1444
1445                                         page = alloc_pages(sk->sk_allocation, 0);
1446                                         if (!page) {
1447                                                 err = -ENOBUFS;
1448                                                 skb_shinfo(skb)->nr_frags = i;
1449                                                 kfree_skb(skb);
1450                                                 goto failure;
1451                                         }
1452
1453                                         frag = &skb_shinfo(skb)->frags[i];
1454                                         frag->page = page;
1455                                         frag->page_offset = 0;
1456                                         frag->size = (data_len >= PAGE_SIZE ?
1457                                                       PAGE_SIZE :
1458                                                       data_len);
1459                                         data_len -= PAGE_SIZE;
1460                                 }
1461
1462                                 /* Full success... */
1463                                 break;
1464                         }
1465                         err = -ENOBUFS;
1466                         goto failure;
1467                 }
1468                 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1469                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1470                 err = -EAGAIN;
1471                 if (!timeo)
1472                         goto failure;
1473                 if (signal_pending(current))
1474                         goto interrupted;
1475                 timeo = sock_wait_for_wmem(sk, timeo);
1476         }
1477
1478         skb_set_owner_w(skb, sk);
1479         return skb;
1480
1481 interrupted:
1482         err = sock_intr_errno(timeo);
1483 failure:
1484         *errcode = err;
1485         return NULL;
1486 }
1487 EXPORT_SYMBOL(sock_alloc_send_pskb);
1488
1489 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1490                                     int noblock, int *errcode)
1491 {
1492         return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1493 }
1494 EXPORT_SYMBOL(sock_alloc_send_skb);
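
/*
 * A minimal sketch (hypothetical datagram protocol) of the intended use of
 * sock_alloc_send_skb(): size the skb for headers plus payload and let the
 * helper handle sndbuf limits, blocking and error reporting via errcode.
 * DUMMY_HEADER_LEN_SKETCH and the function name are illustrative only.
 */
#define DUMMY_HEADER_LEN_SKETCH	16	/* assumed protocol header room */

static int dummy_sendmsg_sketch(struct sock *sk, struct msghdr *msg, size_t len)
{
	struct sk_buff *skb;
	int err;

	skb = sock_alloc_send_skb(sk, len + DUMMY_HEADER_LEN_SKETCH,
				  msg->msg_flags & MSG_DONTWAIT, &err);
	if (!skb)
		return err;		/* -EAGAIN, -EPIPE, -ENOBUFS, -EINTR, ... */

	skb_reserve(skb, DUMMY_HEADER_LEN_SKETCH);
	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
	if (err) {
		kfree_skb(skb);
		return -EFAULT;
	}
	/* build the protocol header and hand the skb to the transmit path here */
	return len;
}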
1495
1496 static void __lock_sock(struct sock *sk)
1497 {
1498         DEFINE_WAIT(wait);
1499
1500         for (;;) {
1501                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1502                                         TASK_UNINTERRUPTIBLE);
1503                 spin_unlock_bh(&sk->sk_lock.slock);
1504                 schedule();
1505                 spin_lock_bh(&sk->sk_lock.slock);
1506                 if (!sock_owned_by_user(sk))
1507                         break;
1508         }
1509         finish_wait(&sk->sk_lock.wq, &wait);
1510 }
1511
1512 static void __release_sock(struct sock *sk)
1513 {
1514         struct sk_buff *skb = sk->sk_backlog.head;
1515
1516         do {
1517                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1518                 bh_unlock_sock(sk);
1519
1520                 do {
1521                         struct sk_buff *next = skb->next;
1522
1523                         skb->next = NULL;
1524                         sk_backlog_rcv(sk, skb);
1525
1526                         /*
1527                          * We are in process context here with softirqs
1528                          * disabled, so use cond_resched_softirq() to preempt.
1529                          * This is safe to do because we've taken the backlog
1530                          * queue private:
1531                          */
1532                         cond_resched_softirq();
1533
1534                         skb = next;
1535                 } while (skb != NULL);
1536
1537                 bh_lock_sock(sk);
1538         } while ((skb = sk->sk_backlog.head) != NULL);
1539 }
1540
1541 /**
1542  * sk_wait_data - wait for data to arrive at sk_receive_queue
1543  * @sk:    sock to wait on
1544  * @timeo: how long to wait
1545  *
1546  * Now socket state, including sk->sk_err, is changed only under the lock,
1547  * hence we may omit checks after joining the wait queue.
1548  * We check the receive queue before schedule() only as an optimization;
1549  * it is very likely that release_sock() added new data.
1550  */
1551 int sk_wait_data(struct sock *sk, long *timeo)
1552 {
1553         int rc;
1554         DEFINE_WAIT(wait);
1555
1556         prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1557         set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1558         rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1559         clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1560         finish_wait(sk->sk_sleep, &wait);
1561         return rc;
1562 }
1563 EXPORT_SYMBOL(sk_wait_data);
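
/*
 * A minimal sketch (hypothetical protocol) of the pattern sk_wait_data() is
 * meant for: a blocking receive path holds the socket lock, checks the
 * receive queue, and sleeps until data arrives, the timeout expires, a
 * signal is pending or an error is reported.  The dummy_* name and the
 * error handling policy are illustrative only.
 */
static struct sk_buff *dummy_wait_for_skb_sketch(struct sock *sk, int noblock,
						 int *err)
{
	struct sk_buff *skb;
	long timeo = sock_rcvtimeo(sk, noblock);

	*err = 0;
	lock_sock(sk);
	while ((skb = skb_dequeue(&sk->sk_receive_queue)) == NULL) {
		*err = sock_error(sk);
		if (*err)
			break;
		*err = -EAGAIN;
		if (!timeo)
			break;
		*err = sock_intr_errno(timeo);
		if (signal_pending(current))
			break;
		/* drops and retakes the socket lock internally via sk_wait_event() */
		sk_wait_data(sk, &timeo);
		*err = 0;
	}
	release_sock(sk);
	return skb;
}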
1564
1565 /**
1566  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1567  *      @sk: socket
1568  *      @size: memory size to allocate
1569  *      @kind: allocation type
1570  *
1571  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1572  *      rmem allocation. This function assumes that protocols which have
1573  *      memory_pressure use sk_wmem_queued for write buffer accounting.
1574  */
1575 int __sk_mem_schedule(struct sock *sk, int size, int kind)
1576 {
1577         struct proto *prot = sk->sk_prot;
1578         int amt = sk_mem_pages(size);
1579         int allocated;
1580
1581         sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1582         allocated = atomic_add_return(amt, prot->memory_allocated);
1583
1584         /* Under limit. */
1585         if (allocated <= prot->sysctl_mem[0]) {
1586                 if (prot->memory_pressure && *prot->memory_pressure)
1587                         *prot->memory_pressure = 0;
1588                 return 1;
1589         }
1590
1591         /* Under pressure. */
1592         if (allocated > prot->sysctl_mem[1])
1593                 if (prot->enter_memory_pressure)
1594                         prot->enter_memory_pressure(sk);
1595
1596         /* Over hard limit. */
1597         if (allocated > prot->sysctl_mem[2])
1598                 goto suppress_allocation;
1599
1600         /* guarantee minimum buffer size under pressure */
1601         if (kind == SK_MEM_RECV) {
1602                 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1603                         return 1;
1604         } else { /* SK_MEM_SEND */
1605                 if (sk->sk_type == SOCK_STREAM) {
1606                         if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1607                                 return 1;
1608                 } else if (atomic_read(&sk->sk_wmem_alloc) <
1609                            prot->sysctl_wmem[0])
1610                                 return 1;
1611         }
1612
1613         if (prot->memory_pressure) {
1614                 int alloc;
1615
1616                 if (!*prot->memory_pressure)
1617                         return 1;
1618                 alloc = percpu_counter_read_positive(prot->sockets_allocated);
1619                 if (prot->sysctl_mem[2] > alloc *
1620                     sk_mem_pages(sk->sk_wmem_queued +
1621                                  atomic_read(&sk->sk_rmem_alloc) +
1622                                  sk->sk_forward_alloc))
1623                         return 1;
1624         }
1625
1626 suppress_allocation:
1627
1628         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1629                 sk_stream_moderate_sndbuf(sk);
1630
1631                 /* Fail only if the socket is still _under_ its sndbuf.
1632                  * In this case the caller cannot block, so we have to fail.
1633                  */
1634                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1635                         return 1;
1636         }
1637
1638         /* Alas. Undo changes. */
1639         sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1640         atomic_sub(amt, prot->memory_allocated);
1641         return 0;
1642 }
1643 EXPORT_SYMBOL(__sk_mem_schedule);
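
/*
 * A minimal sketch of the receive-side use of this accounting through the
 * sk_rmem_schedule()/sk_mem_charge() wrappers (include/net/sock.h): charge
 * the skb before queueing it, refusing it if __sk_mem_schedule() says no.
 * The dummy_* name is hypothetical; real code also runs sk_filter() etc.
 */
static int dummy_queue_rcv_skb_sketch(struct sock *sk, struct sk_buff *skb)
{
	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
	    (unsigned)sk->sk_rcvbuf)
		return -ENOMEM;

	if (!sk_rmem_schedule(sk, skb->truesize))
		return -ENOBUFS;

	skb_set_owner_r(skb, sk);	/* accounts rmem and calls sk_mem_charge() */
	skb_queue_tail(&sk->sk_receive_queue, skb);
	sk->sk_data_ready(sk, skb->len);
	return 0;
}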
1644
1645 /**
1646  *      __sk_mem_reclaim - reclaim memory_allocated
1647  *      @sk: socket
1648  */
1649 void __sk_mem_reclaim(struct sock *sk)
1650 {
1651         struct proto *prot = sk->sk_prot;
1652
1653         atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
1654                    prot->memory_allocated);
1655         sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1656
1657         if (prot->memory_pressure && *prot->memory_pressure &&
1658             (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
1659                 *prot->memory_pressure = 0;
1660 }
1661 EXPORT_SYMBOL(__sk_mem_reclaim);
1662
1663
1664 /*
1665  * Set of default routines for initialising struct proto_ops when
1666  * the protocol does not support a particular function. In certain
1667  * cases where it makes no sense for a protocol to have a "do nothing"
1668  * function, some default processing is provided.
1669  */
1670
1671 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1672 {
1673         return -EOPNOTSUPP;
1674 }
1675 EXPORT_SYMBOL(sock_no_bind);
1676
1677 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1678                     int len, int flags)
1679 {
1680         return -EOPNOTSUPP;
1681 }
1682 EXPORT_SYMBOL(sock_no_connect);
1683
1684 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1685 {
1686         return -EOPNOTSUPP;
1687 }
1688 EXPORT_SYMBOL(sock_no_socketpair);
1689
1690 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1691 {
1692         return -EOPNOTSUPP;
1693 }
1694 EXPORT_SYMBOL(sock_no_accept);
1695
1696 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1697                     int *len, int peer)
1698 {
1699         return -EOPNOTSUPP;
1700 }
1701 EXPORT_SYMBOL(sock_no_getname);
1702
1703 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
1704 {
1705         return 0;
1706 }
1707 EXPORT_SYMBOL(sock_no_poll);
1708
1709 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1710 {
1711         return -EOPNOTSUPP;
1712 }
1713 EXPORT_SYMBOL(sock_no_ioctl);
1714
1715 int sock_no_listen(struct socket *sock, int backlog)
1716 {
1717         return -EOPNOTSUPP;
1718 }
1719 EXPORT_SYMBOL(sock_no_listen);
1720
1721 int sock_no_shutdown(struct socket *sock, int how)
1722 {
1723         return -EOPNOTSUPP;
1724 }
1725 EXPORT_SYMBOL(sock_no_shutdown);
1726
1727 int sock_no_setsockopt(struct socket *sock, int level, int optname,
1728                     char __user *optval, unsigned int optlen)
1729 {
1730         return -EOPNOTSUPP;
1731 }
1732 EXPORT_SYMBOL(sock_no_setsockopt);
1733
1734 int sock_no_getsockopt(struct socket *sock, int level, int optname,
1735                     char __user *optval, int __user *optlen)
1736 {
1737         return -EOPNOTSUPP;
1738 }
1739 EXPORT_SYMBOL(sock_no_getsockopt);
1740
1741 int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1742                     size_t len)
1743 {
1744         return -EOPNOTSUPP;
1745 }
1746 EXPORT_SYMBOL(sock_no_sendmsg);
1747
1748 int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1749                     size_t len, int flags)
1750 {
1751         return -EOPNOTSUPP;
1752 }
1753 EXPORT_SYMBOL(sock_no_recvmsg);
1754
1755 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1756 {
1757         /* Mirror missing mmap method error code */
1758         return -ENODEV;
1759 }
1760 EXPORT_SYMBOL(sock_no_mmap);
1761
1762 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1763 {
1764         ssize_t res;
1765         struct msghdr msg = {.msg_flags = flags};
1766         struct kvec iov;
1767         char *kaddr = kmap(page);
1768         iov.iov_base = kaddr + offset;
1769         iov.iov_len = size;
1770         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1771         kunmap(page);
1772         return res;
1773 }
1774 EXPORT_SYMBOL(sock_no_sendpage);
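
/*
 * A minimal sketch of how the sock_no_*() stubs above are typically used:
 * a datagram-style protocol fills the proto_ops slots it does not support
 * with the matching stub rather than leaving them NULL.  The name and the
 * PF_UNSPEC family are illustrative; protocol-specific handlers (release,
 * bind, getname, sendmsg, recvmsg, ...) are omitted.
 */
static const struct proto_ops dummy_dgram_ops_sketch = {
	.family		= PF_UNSPEC,
	.owner		= THIS_MODULE,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.poll		= datagram_poll,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.setsockopt	= sock_no_setsockopt,
	.getsockopt	= sock_no_getsockopt,
	.mmap		= sock_no_mmap,
	.sendpage	= sock_no_sendpage,
};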
1775
1776 /*
1777  *      Default Socket Callbacks
1778  */
1779
1780 static void sock_def_wakeup(struct sock *sk)
1781 {
1782         read_lock(&sk->sk_callback_lock);
1783         if (sk_has_sleeper(sk))
1784                 wake_up_interruptible_all(sk->sk_sleep);
1785         read_unlock(&sk->sk_callback_lock);
1786 }
1787
1788 static void sock_def_error_report(struct sock *sk)
1789 {
1790         read_lock(&sk->sk_callback_lock);
1791         if (sk_has_sleeper(sk))
1792                 wake_up_interruptible_poll(sk->sk_sleep, POLLERR);
1793         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
1794         read_unlock(&sk->sk_callback_lock);
1795 }
1796
1797 static void sock_def_readable(struct sock *sk, int len)
1798 {
1799         read_lock(&sk->sk_callback_lock);
1800         if (sk_has_sleeper(sk))
1801                 wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN |
1802                                                 POLLRDNORM | POLLRDBAND);
1803         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
1804         read_unlock(&sk->sk_callback_lock);
1805 }
1806
1807 static void sock_def_write_space(struct sock *sk)
1808 {
1809         read_lock(&sk->sk_callback_lock);
1810
1811         /* Do not wake up a writer until he can make "significant"
1812          * progress.  --DaveM
1813          */
1814         if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1815                 if (sk_has_sleeper(sk))
1816                         wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT |
1817                                                 POLLWRNORM | POLLWRBAND);
1818
1819                 /* Should agree with poll, otherwise some programs break */
1820                 if (sock_writeable(sk))
1821                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1822         }
1823
1824         read_unlock(&sk->sk_callback_lock);
1825 }
1826
1827 static void sock_def_destruct(struct sock *sk)
1828 {
1829         kfree(sk->sk_protinfo);
1830 }
1831
1832 void sk_send_sigurg(struct sock *sk)
1833 {
1834         if (sk->sk_socket && sk->sk_socket->file)
1835                 if (send_sigurg(&sk->sk_socket->file->f_owner))
1836                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1837 }
1838 EXPORT_SYMBOL(sk_send_sigurg);
1839
1840 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
1841                     unsigned long expires)
1842 {
1843         if (!mod_timer(timer, expires))
1844                 sock_hold(sk);
1845 }
1846 EXPORT_SYMBOL(sk_reset_timer);
1847
1848 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
1849 {
1850         if (timer_pending(timer) && del_timer(timer))
1851                 __sock_put(sk);
1852 }
1853 EXPORT_SYMBOL(sk_stop_timer);
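
/*
 * A minimal sketch (hypothetical names) of the reference counting behind
 * sk_reset_timer()/sk_stop_timer(): arming an inactive timer takes a
 * reference on the sock, so the handler drops it when it finishes, and
 * cancelling a pending timer via sk_stop_timer() drops it as well.
 */
static void dummy_timer_handler_sketch(unsigned long data)
{
	struct sock *sk = (struct sock *)data;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/* ... protocol timeout work, re-arming takes a fresh hold ... */
		sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ);
	}
	bh_unlock_sock(sk);
	sock_put(sk);		/* pairs with the hold taken when the timer was armed */
}

static void dummy_start_timer_sketch(struct sock *sk)
{
	sk->sk_timer.function = dummy_timer_handler_sketch;
	sk->sk_timer.data     = (unsigned long)sk;
	sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ);
}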
1854
1855 void sock_init_data(struct socket *sock, struct sock *sk)
1856 {
1857         skb_queue_head_init(&sk->sk_receive_queue);
1858         skb_queue_head_init(&sk->sk_write_queue);
1859         skb_queue_head_init(&sk->sk_error_queue);
1860 #ifdef CONFIG_NET_DMA
1861         skb_queue_head_init(&sk->sk_async_wait_queue);
1862 #endif
1863
1864         sk->sk_send_head        =       NULL;
1865
1866         init_timer(&sk->sk_timer);
1867
1868         sk->sk_allocation       =       GFP_KERNEL;
1869         sk->sk_rcvbuf           =       sysctl_rmem_default;
1870         sk->sk_sndbuf           =       sysctl_wmem_default;
1871         sk->sk_state            =       TCP_CLOSE;
1872         sk_set_socket(sk, sock);
1873
1874         sock_set_flag(sk, SOCK_ZAPPED);
1875
1876         if (sock) {
1877                 sk->sk_type     =       sock->type;
1878                 sk->sk_sleep    =       &sock->wait;
1879                 sock->sk        =       sk;
1880         } else
1881                 sk->sk_sleep    =       NULL;
1882
1883         rwlock_init(&sk->sk_dst_lock);
1884         rwlock_init(&sk->sk_callback_lock);
1885         lockdep_set_class_and_name(&sk->sk_callback_lock,
1886                         af_callback_keys + sk->sk_family,
1887                         af_family_clock_key_strings[sk->sk_family]);
1888
1889         sk->sk_state_change     =       sock_def_wakeup;
1890         sk->sk_data_ready       =       sock_def_readable;
1891         sk->sk_write_space      =       sock_def_write_space;
1892         sk->sk_error_report     =       sock_def_error_report;
1893         sk->sk_destruct         =       sock_def_destruct;
1894
1895         sk->sk_sndmsg_page      =       NULL;
1896         sk->sk_sndmsg_off       =       0;
1897
1898         sk->sk_peercred.pid     =       0;
1899         sk->sk_peercred.uid     =       -1;
1900         sk->sk_peercred.gid     =       -1;
1901         sk->sk_write_pending    =       0;
1902         sk->sk_rcvlowat         =       1;
1903         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
1904         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
1905
1906         sk->sk_stamp = ktime_set(-1L, 0);
1907
1908         /*
1909          * Before updating sk_refcnt, we must commit prior changes to memory
1910          * (Documentation/RCU/rculist_nulls.txt for details)
1911          */
1912         smp_wmb();
1913         atomic_set(&sk->sk_refcnt, 1);
1914         atomic_set(&sk->sk_drops, 0);
1915 }
1916 EXPORT_SYMBOL(sock_init_data);
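
/*
 * A minimal sketch of how a protocol builds on sock_init_data(): the default
 * callbacks installed above can be replaced afterwards when the protocol
 * needs its own wakeup behaviour.  The dummy_* names are hypothetical.
 */
static void dummy_data_ready_sketch(struct sock *sk, int len)
{
	read_lock(&sk->sk_callback_lock);
	if (sk_has_sleeper(sk))
		wake_up_interruptible(sk->sk_sleep);
	/* e.g. also kick a worker thread that consumes sk_receive_queue */
	read_unlock(&sk->sk_callback_lock);
}

static void dummy_sock_init_sketch(struct socket *sock, struct sock *sk)
{
	sock_init_data(sock, sk);		/* queues, timers, default callbacks */
	sk->sk_data_ready = dummy_data_ready_sketch;
}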
1917
1918 void lock_sock_nested(struct sock *sk, int subclass)
1919 {
1920         might_sleep();
1921         spin_lock_bh(&sk->sk_lock.slock);
1922         if (sk->sk_lock.owned)
1923                 __lock_sock(sk);
1924         sk->sk_lock.owned = 1;
1925         spin_unlock(&sk->sk_lock.slock);
1926         /*
1927          * The sk_lock has mutex_lock() semantics here:
1928          */
1929         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
1930         local_bh_enable();
1931 }
1932 EXPORT_SYMBOL(lock_sock_nested);
1933
1934 void release_sock(struct sock *sk)
1935 {
1936         /*
1937          * The sk_lock has mutex_unlock() semantics:
1938          */
1939         mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
1940
1941         spin_lock_bh(&sk->sk_lock.slock);
1942         if (sk->sk_backlog.tail)
1943                 __release_sock(sk);
1944         sk->sk_lock.owned = 0;
1945         if (waitqueue_active(&sk->sk_lock.wq))
1946                 wake_up(&sk->sk_lock.wq);
1947         spin_unlock_bh(&sk->sk_lock.slock);
1948 }
1949 EXPORT_SYMBOL(release_sock);
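
/*
 * A minimal sketch of the ownership pattern lock_sock()/release_sock()
 * provide: process context owns the socket around state changes, while
 * softirq input queued to the backlog in the meantime is flushed by
 * __release_sock() on the way out.  The state change is illustrative.
 */
static void dummy_set_state_sketch(struct sock *sk, int state)
{
	lock_sock(sk);			/* may sleep if another owner holds the socket */
	sk->sk_state = state;
	sk->sk_state_change(sk);	/* wakes sleepers via sock_def_wakeup() by default */
	release_sock(sk);		/* processes sk->sk_backlog before unlocking */
}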
1950
1951 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1952 {
1953         struct timeval tv;
1954         if (!sock_flag(sk, SOCK_TIMESTAMP))
1955                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
1956         tv = ktime_to_timeval(sk->sk_stamp);
1957         if (tv.tv_sec == -1)
1958                 return -ENOENT;
1959         if (tv.tv_sec == 0) {
1960                 sk->sk_stamp = ktime_get_real();
1961                 tv = ktime_to_timeval(sk->sk_stamp);
1962         }
1963         return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
1964 }
1965 EXPORT_SYMBOL(sock_get_timestamp);
1966
1967 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
1968 {
1969         struct timespec ts;
1970         if (!sock_flag(sk, SOCK_TIMESTAMP))
1971                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
1972         ts = ktime_to_timespec(sk->sk_stamp);
1973         if (ts.tv_sec == -1)
1974                 return -ENOENT;
1975         if (ts.tv_sec == 0) {
1976                 sk->sk_stamp = ktime_get_real();
1977                 ts = ktime_to_timespec(sk->sk_stamp);
1978         }
1979         return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
1980 }
1981 EXPORT_SYMBOL(sock_get_timestampns);
1982
1983 void sock_enable_timestamp(struct sock *sk, int flag)
1984 {
1985         if (!sock_flag(sk, flag)) {
1986                 sock_set_flag(sk, flag);
1987                 /*
1988                  * We just set one of the two flags that require net
1989                  * time stamping, but time stamping might already have been
1990                  * enabled because of the other one.
1991                  */
1992                 if (!sock_flag(sk,
1993                                 flag == SOCK_TIMESTAMP ?
1994                                 SOCK_TIMESTAMPING_RX_SOFTWARE :
1995                                 SOCK_TIMESTAMP))
1996                         net_enable_timestamp();
1997         }
1998 }
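
/*
 * A minimal sketch of where sock_get_timestamp()/sock_get_timestampns()
 * are typically called from: an address family's ioctl handler answering
 * SIOCGSTAMP/SIOCGSTAMPNS (as the inet ioctl path does).  The function
 * name is hypothetical.
 */
static int dummy_ioctl_sketch(struct socket *sock, unsigned int cmd,
			      unsigned long arg)
{
	struct sock *sk = sock->sk;

	switch (cmd) {
	case SIOCGSTAMP:
		return sock_get_timestamp(sk, (struct timeval __user *)arg);
	case SIOCGSTAMPNS:
		return sock_get_timestampns(sk, (struct timespec __user *)arg);
	default:
		return -ENOIOCTLCMD;
	}
}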
1999
2000 /*
2001  *      Get a socket option on a socket.
2002  *
2003  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2004  *      asynchronous errors should be reported by getsockopt. We assume
2005  *      this means if you specify SO_ERROR (otherwise what's the point of it?).
2006  */
2007 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2008                            char __user *optval, int __user *optlen)
2009 {
2010         struct sock *sk = sock->sk;
2011
2012         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2013 }
2014 EXPORT_SYMBOL(sock_common_getsockopt);
2015
2016 #ifdef CONFIG_COMPAT
2017 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2018                                   char __user *optval, int __user *optlen)
2019 {
2020         struct sock *sk = sock->sk;
2021
2022         if (sk->sk_prot->compat_getsockopt != NULL)
2023                 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2024                                                       optval, optlen);
2025         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2026 }
2027 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2028 #endif
2029
2030 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2031                         struct msghdr *msg, size_t size, int flags)
2032 {
2033         struct sock *sk = sock->sk;
2034         int addr_len = 0;
2035         int err;
2036
2037         err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2038                                    flags & ~MSG_DONTWAIT, &addr_len);
2039         if (err >= 0)
2040                 msg->msg_namelen = addr_len;
2041         return err;
2042 }
2043 EXPORT_SYMBOL(sock_common_recvmsg);
2044
2045 /*
2046  *      Set socket options on an inet socket.
2047  */
2048 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2049                            char __user *optval, unsigned int optlen)
2050 {
2051         struct sock *sk = sock->sk;
2052
2053         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2054 }
2055 EXPORT_SYMBOL(sock_common_setsockopt);
2056
2057 #ifdef CONFIG_COMPAT
2058 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2059                                   char __user *optval, unsigned int optlen)
2060 {
2061         struct sock *sk = sock->sk;
2062
2063         if (sk->sk_prot->compat_setsockopt != NULL)
2064                 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2065                                                       optval, optlen);
2066         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2067 }
2068 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2069 #endif
2070
2071 void sk_common_release(struct sock *sk)
2072 {
2073         if (sk->sk_prot->destroy)
2074                 sk->sk_prot->destroy(sk);
2075
2076         /*
2077          * Observation: when sk_common_release is called, processes have
2078          * no access to the socket, but the network stack still does.
2079          * Step one, detach it from networking:
2080          *
2081          * A. Remove it from the hash tables.
2082          */
2083
2084         sk->sk_prot->unhash(sk);
2085
2086         /*
2087          * At this point the socket cannot receive new packets, but it is
2088          * possible that some packets are in flight because some CPU runs the
2089          * receiver and did a hash table lookup before we unhashed the socket.
2090          * They will reach the receive queue and be purged by the socket destructor.
2091          *
2092          * Also, we still have packets pending on the receive queue and, probably,
2093          * our own packets waiting in device queues. sock_destroy will drain the
2094          * receive queue, but transmitted packets will delay socket destruction
2095          * until the last reference is released.
2096          */
2097
2098         sock_orphan(sk);
2099
2100         xfrm_sk_free_policy(sk);
2101
2102         sk_refcnt_debug_release(sk);
2103         sock_put(sk);
2104 }
2105 EXPORT_SYMBOL(sk_common_release);
2106
2107 static DEFINE_RWLOCK(proto_list_lock);
2108 static LIST_HEAD(proto_list);
2109
2110 #ifdef CONFIG_PROC_FS
2111 #define PROTO_INUSE_NR  64      /* should be enough for now */
2112 struct prot_inuse {
2113         int val[PROTO_INUSE_NR];
2114 };
2115
2116 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2117
2118 #ifdef CONFIG_NET_NS
2119 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2120 {
2121         int cpu = smp_processor_id();
2122         per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val;
2123 }
2124 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2125
2126 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2127 {
2128         int cpu, idx = prot->inuse_idx;
2129         int res = 0;
2130
2131         for_each_possible_cpu(cpu)
2132                 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2133
2134         return res >= 0 ? res : 0;
2135 }
2136 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2137
2138 static int sock_inuse_init_net(struct net *net)
2139 {
2140         net->core.inuse = alloc_percpu(struct prot_inuse);
2141         return net->core.inuse ? 0 : -ENOMEM;
2142 }
2143
2144 static void sock_inuse_exit_net(struct net *net)
2145 {
2146         free_percpu(net->core.inuse);
2147 }
2148
2149 static struct pernet_operations net_inuse_ops = {
2150         .init = sock_inuse_init_net,
2151         .exit = sock_inuse_exit_net,
2152 };
2153
2154 static __init int net_inuse_init(void)
2155 {
2156         if (register_pernet_subsys(&net_inuse_ops))
2157                 panic("Cannot initialize net inuse counters");
2158
2159         return 0;
2160 }
2161
2162 core_initcall(net_inuse_init);
2163 #else
2164 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2165
2166 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2167 {
2168         __get_cpu_var(prot_inuse).val[prot->inuse_idx] += val;
2169 }
2170 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2171
2172 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2173 {
2174         int cpu, idx = prot->inuse_idx;
2175         int res = 0;
2176
2177         for_each_possible_cpu(cpu)
2178                 res += per_cpu(prot_inuse, cpu).val[idx];
2179
2180         return res >= 0 ? res : 0;
2181 }
2182 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2183 #endif
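
/*
 * A minimal sketch of how these counters are typically fed: a protocol's
 * hash()/unhash() callbacks add +1/-1 so sock_prot_inuse_get() (and the
 * "sockets" column of /proc/net/protocols) reflects live sockets.  Callers
 * are assumed to hold a BH-disabling lock, which also keeps the CPU stable
 * for the per-cpu update.  The lookup-table handling itself is elided.
 */
static void dummy_hash_sketch(struct sock *sk)
{
	/* ... insert sk into the protocol's lookup structure ... */
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
}

static void dummy_unhash_sketch(struct sock *sk)
{
	/* ... remove sk from the protocol's lookup structure ... */
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
}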
2184
2185 static void assign_proto_idx(struct proto *prot)
2186 {
2187         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2188
2189         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2190                 printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
2191                 return;
2192         }
2193
2194         set_bit(prot->inuse_idx, proto_inuse_idx);
2195 }
2196
2197 static void release_proto_idx(struct proto *prot)
2198 {
2199         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2200                 clear_bit(prot->inuse_idx, proto_inuse_idx);
2201 }
2202 #else
2203 static inline void assign_proto_idx(struct proto *prot)
2204 {
2205 }
2206
2207 static inline void release_proto_idx(struct proto *prot)
2208 {
2209 }
2210 #endif
2211
2212 int proto_register(struct proto *prot, int alloc_slab)
2213 {
2214         if (alloc_slab) {
2215                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2216                                         SLAB_HWCACHE_ALIGN | prot->slab_flags,
2217                                         NULL);
2218
2219                 if (prot->slab == NULL) {
2220                         printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
2221                                prot->name);
2222                         goto out;
2223                 }
2224
2225                 if (prot->rsk_prot != NULL) {
2226                         static const char mask[] = "request_sock_%s";
2227
2228                         prot->rsk_prot->slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
2229                         if (prot->rsk_prot->slab_name == NULL)
2230                                 goto out_free_sock_slab;
2231
2232                         sprintf(prot->rsk_prot->slab_name, mask, prot->name);
2233                         prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2234                                                                  prot->rsk_prot->obj_size, 0,
2235                                                                  SLAB_HWCACHE_ALIGN, NULL);
2236
2237                         if (prot->rsk_prot->slab == NULL) {
2238                                 printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
2239                                        prot->name);
2240                                 goto out_free_request_sock_slab_name;
2241                         }
2242                 }
2243
2244                 if (prot->twsk_prot != NULL) {
2245                         static const char mask[] = "tw_sock_%s";
2246
2247                         prot->twsk_prot->twsk_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
2248
2249                         if (prot->twsk_prot->twsk_slab_name == NULL)
2250                                 goto out_free_request_sock_slab;
2251
2252                         sprintf(prot->twsk_prot->twsk_slab_name, mask, prot->name);
2253                         prot->twsk_prot->twsk_slab =
2254                                 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2255                                                   prot->twsk_prot->twsk_obj_size,
2256                                                   0,
2257                                                   SLAB_HWCACHE_ALIGN |
2258                                                         prot->slab_flags,
2259                                                   NULL);
2260                         if (prot->twsk_prot->twsk_slab == NULL)
2261                                 goto out_free_timewait_sock_slab_name;
2262                 }
2263         }
2264
2265         write_lock(&proto_list_lock);
2266         list_add(&prot->node, &proto_list);
2267         assign_proto_idx(prot);
2268         write_unlock(&proto_list_lock);
2269         return 0;
2270
2271 out_free_timewait_sock_slab_name:
2272         kfree(prot->twsk_prot->twsk_slab_name);
2273 out_free_request_sock_slab:
2274         if (prot->rsk_prot && prot->rsk_prot->slab) {
2275                 kmem_cache_destroy(prot->rsk_prot->slab);
2276                 prot->rsk_prot->slab = NULL;
2277         }
2278 out_free_request_sock_slab_name:
2279         kfree(prot->rsk_prot->slab_name);
2280 out_free_sock_slab:
2281         kmem_cache_destroy(prot->slab);
2282         prot->slab = NULL;
2283 out:
2284         return -ENOBUFS;
2285 }
2286 EXPORT_SYMBOL(proto_register);
2287
2288 void proto_unregister(struct proto *prot)
2289 {
2290         write_lock(&proto_list_lock);
2291         release_proto_idx(prot);
2292         list_del(&prot->node);
2293         write_unlock(&proto_list_lock);
2294
2295         if (prot->slab != NULL) {
2296                 kmem_cache_destroy(prot->slab);
2297                 prot->slab = NULL;
2298         }
2299
2300         if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2301                 kmem_cache_destroy(prot->rsk_prot->slab);
2302                 kfree(prot->rsk_prot->slab_name);
2303                 prot->rsk_prot->slab = NULL;
2304         }
2305
2306         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2307                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2308                 kfree(prot->twsk_prot->twsk_slab_name);
2309                 prot->twsk_prot->twsk_slab = NULL;
2310         }
2311 }
2312 EXPORT_SYMBOL(proto_unregister);
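
/*
 * A minimal sketch of the proto_register()/proto_unregister() pairing as a
 * protocol module would use it (wired to module_init()/module_exit()); the
 * dummy_proto_sketch definition is illustrative and far from complete.
 */
static struct proto dummy_proto_sketch = {
	.name		= "DUMMY",
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct sock),	/* normally sizeof(struct dummy_sock) */
};

static int __init dummy_proto_init_sketch(void)
{
	return proto_register(&dummy_proto_sketch, 1);	/* 1: create a slab cache */
}

static void __exit dummy_proto_exit_sketch(void)
{
	proto_unregister(&dummy_proto_sketch);
}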
2313
2314 #ifdef CONFIG_PROC_FS
2315 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2316         __acquires(proto_list_lock)
2317 {
2318         read_lock(&proto_list_lock);
2319         return seq_list_start_head(&proto_list, *pos);
2320 }
2321
2322 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2323 {
2324         return seq_list_next(v, &proto_list, pos);
2325 }
2326
2327 static void proto_seq_stop(struct seq_file *seq, void *v)
2328         __releases(proto_list_lock)
2329 {
2330         read_unlock(&proto_list_lock);
2331 }
2332
2333 static char proto_method_implemented(const void *method)
2334 {
2335         return method == NULL ? 'n' : 'y';
2336 }
2337
2338 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2339 {
2340         seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
2341                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2342                    proto->name,
2343                    proto->obj_size,
2344                    sock_prot_inuse_get(seq_file_net(seq), proto),
2345                    proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
2346                    proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
2347                    proto->max_header,
2348                    proto->slab == NULL ? "no" : "yes",
2349                    module_name(proto->owner),
2350                    proto_method_implemented(proto->close),
2351                    proto_method_implemented(proto->connect),
2352                    proto_method_implemented(proto->disconnect),
2353                    proto_method_implemented(proto->accept),
2354                    proto_method_implemented(proto->ioctl),
2355                    proto_method_implemented(proto->init),
2356                    proto_method_implemented(proto->destroy),
2357                    proto_method_implemented(proto->shutdown),
2358                    proto_method_implemented(proto->setsockopt),
2359                    proto_method_implemented(proto->getsockopt),
2360                    proto_method_implemented(proto->sendmsg),
2361                    proto_method_implemented(proto->recvmsg),
2362                    proto_method_implemented(proto->sendpage),
2363                    proto_method_implemented(proto->bind),
2364                    proto_method_implemented(proto->backlog_rcv),
2365                    proto_method_implemented(proto->hash),
2366                    proto_method_implemented(proto->unhash),
2367                    proto_method_implemented(proto->get_port),
2368                    proto_method_implemented(proto->enter_memory_pressure));
2369 }
2370
2371 static int proto_seq_show(struct seq_file *seq, void *v)
2372 {
2373         if (v == &proto_list)
2374                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2375                            "protocol",
2376                            "size",
2377                            "sockets",
2378                            "memory",
2379                            "press",
2380                            "maxhdr",
2381                            "slab",
2382                            "module",
2383                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2384         else
2385                 proto_seq_printf(seq, list_entry(v, struct proto, node));
2386         return 0;
2387 }
2388
2389 static const struct seq_operations proto_seq_ops = {
2390         .start  = proto_seq_start,
2391         .next   = proto_seq_next,
2392         .stop   = proto_seq_stop,
2393         .show   = proto_seq_show,
2394 };
2395
2396 static int proto_seq_open(struct inode *inode, struct file *file)
2397 {
2398         return seq_open_net(inode, file, &proto_seq_ops,
2399                             sizeof(struct seq_net_private));
2400 }
2401
2402 static const struct file_operations proto_seq_fops = {
2403         .owner          = THIS_MODULE,
2404         .open           = proto_seq_open,
2405         .read           = seq_read,
2406         .llseek         = seq_lseek,
2407         .release        = seq_release_net,
2408 };
2409
2410 static __net_init int proto_init_net(struct net *net)
2411 {
2412         if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2413                 return -ENOMEM;
2414
2415         return 0;
2416 }
2417
2418 static __net_exit void proto_exit_net(struct net *net)
2419 {
2420         proc_net_remove(net, "protocols");
2421 }
2422
2423
2424 static __net_initdata struct pernet_operations proto_net_ops = {
2425         .init = proto_init_net,
2426         .exit = proto_exit_net,
2427 };
2428
2429 static int __init proto_init(void)
2430 {
2431         return register_pernet_subsys(&proto_net_ops);
2432 }
2433
2434 subsys_initcall(proto_init);
2435
2436 #endif /* PROC_FS */