2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the NetFilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
8 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
9 * Peter Kese <peter.kese@ijs.si>
10 * Julian Anastasov <ja@ssi.bg>
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
21 #define KMSG_COMPONENT "IPVS"
22 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
24 #include <linux/module.h>
25 #include <linux/init.h>
26 #include <linux/types.h>
27 #include <linux/capability.h>
29 #include <linux/sysctl.h>
30 #include <linux/proc_fs.h>
31 #include <linux/workqueue.h>
32 #include <linux/swap.h>
33 #include <linux/seq_file.h>
34 #include <linux/slab.h>
36 #include <linux/netfilter.h>
37 #include <linux/netfilter_ipv4.h>
38 #include <linux/mutex.h>
40 #include <net/net_namespace.h>
41 #include <linux/nsproxy.h>
43 #ifdef CONFIG_IP_VS_IPV6
45 #include <net/ip6_route.h>
47 #include <net/route.h>
49 #include <net/genetlink.h>
51 #include <asm/uaccess.h>
53 #include <net/ip_vs.h>
/* Global control-plane synchronization: __ip_vs_mutex serializes the
 * [gs]etsockopt configuration paths (which may sleep), while
 * __ip_vs_svc_lock is an rwlock protecting the service hash tables. */
55 /* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
56 static DEFINE_MUTEX(__ip_vs_mutex);
58 /* lock for service table */
59 static DEFINE_RWLOCK(__ip_vs_svc_lock);
61 /* sysctl variables */
63 #ifdef CONFIG_IP_VS_DEBUG
/* Debug verbosity, tunable via sysctl when CONFIG_IP_VS_DEBUG is enabled. */
64 static int sysctl_ip_vs_debug_level = 0;
/* Return the current IPVS debug level.  (Braces and the matching #endif
 * are on lines elided from this extract.) */
66 int ip_vs_get_debug_level(void)
68 return sysctl_ip_vs_debug_level;
/* Forward declaration -- defined later in this file. */
74 static void __ip_vs_del_service(struct ip_vs_service *svc);
77 #ifdef CONFIG_IP_VS_IPV6
78 /* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
/* Return true if @addr is local to @net: route the address and check whether
 * the resulting output device is the loopback interface.  (The fl6 setup and
 * the dst_release() of @dst are on lines elided from this extract.) */
79 static bool __ip_vs_addr_is_local_v6(struct net *net,
80 const struct in6_addr *addr)
85 struct dst_entry *dst = ip6_route_output(net, NULL, &fl6);
88 is_local = !dst->error && dst->dev && (dst->dev->flags & IFF_LOOPBACK);
97 * update_defense_level is called from keventd and from sysctl,
98 * so it needs to protect itself from softirqs
/*
 * Recompute the three DoS-defense strategies (drop_entry, drop_packet,
 * secure_tcp) for @ipvs from the configured sysctl modes and the amount of
 * free+buffered memory vs. sysctl_amemthresh.  Each strategy is updated
 * under its own spinlock.  Many case labels and branch bodies are on lines
 * elided from this extract.
 *
 * NOTE(review): old_secure_tcp is function-static while @ipvs is per-netns,
 * so the remembered secure_tcp state is shared by every namespace -- looks
 * wrong for multi-netns use; confirm against upstream history.
 */
100 static void update_defense_level(struct netns_ipvs *ipvs)
103 static int old_secure_tcp = 0;
108 /* we only count free and buffered memory (in pages) */
110 availmem = i.freeram + i.bufferram;
111 /* however in linux 2.5 the i.bufferram is total page cache size,
113 /* si_swapinfo(&i); */
114 /* availmem = availmem - (i.totalswap - i.freeswap); */
116 nomem = (availmem < ipvs->sysctl_amemthresh);
/* drop_entry: 0=never, 1=auto (enter mode 2 on low memory), 2=while low,
 * 3=always drop */
121 spin_lock(&ipvs->dropentry_lock);
122 switch (ipvs->sysctl_drop_entry) {
124 atomic_set(&ipvs->dropentry, 0);
128 atomic_set(&ipvs->dropentry, 1);
129 ipvs->sysctl_drop_entry = 2;
131 atomic_set(&ipvs->dropentry, 0);
136 atomic_set(&ipvs->dropentry, 1);
138 atomic_set(&ipvs->dropentry, 0);
139 ipvs->sysctl_drop_entry = 1;
143 atomic_set(&ipvs->dropentry, 1);
146 spin_unlock(&ipvs->dropentry_lock);
/* drop_packet: drop_rate derived from how far below amemthresh we are */
149 spin_lock(&ipvs->droppacket_lock);
150 switch (ipvs->sysctl_drop_packet) {
156 ipvs->drop_rate = ipvs->drop_counter
157 = ipvs->sysctl_amemthresh /
158 (ipvs->sysctl_amemthresh-availmem);
159 ipvs->sysctl_drop_packet = 2;
166 ipvs->drop_rate = ipvs->drop_counter
167 = ipvs->sysctl_amemthresh /
168 (ipvs->sysctl_amemthresh-availmem);
171 ipvs->sysctl_drop_packet = 1;
175 ipvs->drop_rate = ipvs->sysctl_am_droprate;
178 spin_unlock(&ipvs->droppacket_lock);
/* secure_tcp: switch protocol timeout tables when the mode crosses 2 */
181 spin_lock(&ipvs->securetcp_lock);
182 switch (ipvs->sysctl_secure_tcp) {
184 if (old_secure_tcp >= 2)
189 if (old_secure_tcp < 2)
191 ipvs->sysctl_secure_tcp = 2;
193 if (old_secure_tcp >= 2)
199 if (old_secure_tcp < 2)
202 if (old_secure_tcp >= 2)
204 ipvs->sysctl_secure_tcp = 1;
208 if (old_secure_tcp < 2)
212 old_secure_tcp = ipvs->sysctl_secure_tcp;
214 ip_vs_protocol_timeout_change(ipvs,
215 ipvs->sysctl_secure_tcp > 1);
216 spin_unlock(&ipvs->securetcp_lock);
223 * Timer for checking the defense
/* Delayed-work handler: re-evaluate the defense levels, randomly drop
 * connection entries while drop_entry mode is active, then re-arm itself
 * to run again after DEFENSE_TIMER_PERIOD. */
225 #define DEFENSE_TIMER_PERIOD 1*HZ
227 static void defense_work_handler(struct work_struct *work)
229 struct netns_ipvs *ipvs =
230 container_of(work, struct netns_ipvs, defense_work.work);
232 update_defense_level(ipvs);
233 if (atomic_read(&ipvs->dropentry))
234 ip_vs_random_dropentry(ipvs->net);
235 schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
/* Module use-count helpers: pin/unpin this module while configuration
 * references are held.  (Return-type lines are elided from this extract.) */
240 ip_vs_use_count_inc(void)
242 return try_module_get(THIS_MODULE);
246 ip_vs_use_count_dec(void)
248 module_put(THIS_MODULE);
253 * Hash table: for virtual service lookups
/* Two parallel 256-bucket tables: one keyed by <protocol, addr, port>,
 * one keyed by firewall mark.  Both are guarded by __ip_vs_svc_lock. */
255 #define IP_VS_SVC_TAB_BITS 8
256 #define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
257 #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
259 /* the service table hashed by <protocol, addr, port> */
260 static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
261 /* the service table hashed by fwmark */
262 static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
266 * Returns hash value for virtual service
/* Fold protocol, address, port and the netns pointer into a bucket index in
 * [0, IP_VS_SVC_TAB_SIZE).  IPv6 addresses are XOR-folded to 32 bits first. */
268 static inline unsigned int
269 ip_vs_svc_hashkey(struct net *net, int af, unsigned int proto,
270 const union nf_inet_addr *addr, __be16 port)
272 register unsigned int porth = ntohs(port);
273 __be32 addr_fold = addr->ip;
276 #ifdef CONFIG_IP_VS_IPV6
278 addr_fold = addr->ip6[0]^addr->ip6[1]^
279 addr->ip6[2]^addr->ip6[3];
/* mix in the namespace identity (pointer value, low bits dropped) */
281 ahash = ntohl(addr_fold);
282 ahash ^= ((size_t) net >> 8);
284 return (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth) &
289 * Returns hash value of fwmark for virtual service lookup
/* Bucket index for the fwmark-keyed table, mixing in the netns pointer. */
291 static inline unsigned int ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark)
293 return (((size_t)net>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK;
297 * Hashes a service in the ip_vs_svc_table by <netns,proto,addr,port>
298 * or in the ip_vs_svc_fwm_table by fwmark.
299 * Should be called with locked tables.
/* Insert @svc into exactly one of the two tables (fwmark table iff
 * svc->fwmark != 0), set IP_VS_SVC_F_HASHED and take a table reference. */
301 static int ip_vs_svc_hash(struct ip_vs_service *svc)
305 if (svc->flags & IP_VS_SVC_F_HASHED) {
306 pr_err("%s(): request for already hashed, called from %pF\n",
307 __func__, __builtin_return_address(0));
311 if (svc->fwmark == 0) {
313 * Hash it by <netns,protocol,addr,port> in ip_vs_svc_table
315 hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol,
316 &svc->addr, svc->port);
317 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
320 * Hash it by fwmark in svc_fwm_table
322 hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark);
323 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
326 svc->flags |= IP_VS_SVC_F_HASHED;
327 /* increase its refcnt because it is referenced by the svc table */
328 atomic_inc(&svc->refcnt);
334 * Unhashes a service from svc_table / svc_fwm_table.
335 * Should be called with locked tables.
/* Inverse of ip_vs_svc_hash(): remove @svc from whichever table holds it,
 * clear the HASHED flag and drop the table reference. */
337 static int ip_vs_svc_unhash(struct ip_vs_service *svc)
339 if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
340 pr_err("%s(): request for unhash flagged, called from %pF\n",
341 __func__, __builtin_return_address(0));
345 if (svc->fwmark == 0) {
346 /* Remove it from the svc_table table */
347 list_del(&svc->s_list);
349 /* Remove it from the svc_fwm_table table */
350 list_del(&svc->f_list);
353 svc->flags &= ~IP_VS_SVC_F_HASHED;
354 atomic_dec(&svc->refcnt);
360 * Get service by {netns, proto,addr,port} in the service table.
/* Walk one hash bucket for an exact <af, addr, port, protocol, netns>
 * match; returns the entry or falls through (return paths elided).
 * Caller must hold __ip_vs_svc_lock. */
362 static inline struct ip_vs_service *
363 __ip_vs_service_find(struct net *net, int af, __u16 protocol,
364 const union nf_inet_addr *vaddr, __be16 vport)
367 struct ip_vs_service *svc;
369 /* Check for "full" addressed entries */
370 hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport);
372 list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
374 && ip_vs_addr_equal(af, &svc->addr, vaddr)
375 && (svc->port == vport)
376 && (svc->protocol == protocol)
377 && net_eq(svc->net, net)) {
388 * Get service by {fwmark} in the service table.
/* Walk the fwmark bucket for an <fwmark, af, netns> match.
 * Caller must hold __ip_vs_svc_lock. */
390 static inline struct ip_vs_service *
391 __ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark)
394 struct ip_vs_service *svc;
396 /* Check for fwmark addressed entries */
397 hash = ip_vs_svc_fwm_hashkey(net, fwmark);
399 list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
400 if (svc->fwmark == fwmark && svc->af == af
401 && net_eq(svc->net, net)) {
/*
 * Public service lookup.  Tries, in order: the fwmark table (if fwmark is
 * set), the exact <proto,addr,port> entry, an FTP control-port entry (for
 * packets that may be FTP data connections), and finally the port-zero
 * catch-all service.  On a hit, svc->usecnt is bumped before the lock is
 * released; the caller releases it with ip_vs_service_put().
 */
410 struct ip_vs_service *
411 ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol,
412 const union nf_inet_addr *vaddr, __be16 vport)
414 struct ip_vs_service *svc;
415 struct netns_ipvs *ipvs = net_ipvs(net);
417 read_lock(&__ip_vs_svc_lock);
420 * Check the table hashed by fwmark first
423 svc = __ip_vs_svc_fwm_find(net, af, fwmark);
429 * Check the table hashed by <protocol,addr,port>
430 * for "full" addressed entries
432 svc = __ip_vs_service_find(net, af, protocol, vaddr, vport);
435 && protocol == IPPROTO_TCP
436 && atomic_read(&ipvs->ftpsvc_counter)
437 && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
439 * Check if ftp service entry exists, the packet
440 * might belong to FTP data connections.
442 svc = __ip_vs_service_find(net, af, protocol, vaddr, FTPPORT);
446 && atomic_read(&ipvs->nullsvc_counter)) {
448 * Check if the catch-all port (port zero) exists
450 svc = __ip_vs_service_find(net, af, protocol, vaddr, 0);
455 atomic_inc(&svc->usecnt);
456 read_unlock(&__ip_vs_svc_lock);
458 IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
459 fwmark, ip_vs_proto_name(protocol),
460 IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
461 svc ? "hit" : "not hit");
/* Bind @dest to @svc, taking a service reference.  The matching unbind
 * below drops the reference and frees the service when it hits zero.
 * (Assignment of dest->svc and surrounding lines are elided here.) */
468 __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
470 atomic_inc(&svc->refcnt);
475 __ip_vs_unbind_svc(struct ip_vs_dest *dest)
477 struct ip_vs_service *svc = dest->svc;
480 if (atomic_dec_and_test(&svc->refcnt)) {
481 IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
483 IP_VS_DBG_ADDR(svc->af, &svc->addr),
484 ntohs(svc->port), atomic_read(&svc->usecnt));
485 free_percpu(svc->stats.cpustats);
492 * Returns hash value for real service
/* Bucket index for the real-server table, from <af, addr, port> only
 * (no netns mixing here -- rs_table is per-netns already). */
494 static inline unsigned int ip_vs_rs_hashkey(int af,
495 const union nf_inet_addr *addr,
498 register unsigned int porth = ntohs(port);
499 __be32 addr_fold = addr->ip;
501 #ifdef CONFIG_IP_VS_IPV6
503 addr_fold = addr->ip6[0]^addr->ip6[1]^
504 addr->ip6[2]^addr->ip6[3];
507 return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth)
512 * Hashes ip_vs_dest in rs_table by <proto,addr,port>.
513 * should be called with locked tables.
/* Insert @dest into ipvs->rs_table; a non-empty d_list means it is already
 * hashed, in which case this is a no-op (early-return line elided). */
515 static int ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
519 if (!list_empty(&dest->d_list)) {
524 * Hash by proto,addr,port,
525 * which are the parameters of the real service.
527 hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);
529 list_add(&dest->d_list, &ipvs->rs_table[hash]);
535 * UNhashes ip_vs_dest from rs_table.
536 * should be called with locked tables.
/* Remove @dest from the real-server table; list_del_init() leaves d_list
 * empty so a later ip_vs_rs_hash() can re-insert it. */
538 static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
541 * Remove it from the rs_table table.
543 if (!list_empty(&dest->d_list)) {
544 list_del_init(&dest->d_list);
551 * Lookup real service by <proto,addr,port> in the real service table.
/* Return the first destination matching <af, daddr, dport, protocol>
 * under ipvs->rs_lock.  (The protocol wildcard half of the condition and
 * the return statements are on elided lines.) */
554 ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol,
555 const union nf_inet_addr *daddr,
558 struct netns_ipvs *ipvs = net_ipvs(net);
560 struct ip_vs_dest *dest;
563 * Check for "full" addressed entries
564 * Return the first found entry
566 hash = ip_vs_rs_hashkey(af, daddr, dport);
568 read_lock(&ipvs->rs_lock);
569 list_for_each_entry(dest, &ipvs->rs_table[hash], d_list) {
571 && ip_vs_addr_equal(af, &dest->addr, daddr)
572 && (dest->port == dport)
573 && ((dest->protocol == protocol) ||
576 read_unlock(&ipvs->rs_lock);
580 read_unlock(&ipvs->rs_lock);
586 * Lookup destination by {addr,port} in the given service
/* Linear scan of svc->destinations for an <af, daddr, dport> match. */
588 static struct ip_vs_dest *
589 ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
592 struct ip_vs_dest *dest;
595 * Find the destination for the given service
597 list_for_each_entry(dest, &svc->destinations, n_list) {
598 if ((dest->af == svc->af)
599 && ip_vs_addr_equal(svc->af, &dest->addr, daddr)
600 && (dest->port == dport)) {
610 * Find destination by {daddr,dport,vaddr,protocol}
611 * Created to be used in ip_vs_process_message() in
612 * the backup synchronization daemon. It finds the
613 * destination to be bound to the received connection
616 * ip_vs_lookup_real_service() looked promising, but
617 * seems not working as expected.
/* Look up the service first, then the destination within it; the second
 * lookup retries with the ports swapped-in via XOR (port ^ dport).  Takes a
 * dest reference before dropping the service reference. */
619 struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af,
620 const union nf_inet_addr *daddr,
622 const union nf_inet_addr *vaddr,
623 __be16 vport, __u16 protocol, __u32 fwmark,
626 struct ip_vs_dest *dest;
627 struct ip_vs_service *svc;
630 svc = ip_vs_service_get(net, af, fwmark, protocol, vaddr, vport);
/* for non-NAT fwmark services the real port may differ from dport */
633 if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ)
635 dest = ip_vs_lookup_dest(svc, daddr, port);
637 dest = ip_vs_lookup_dest(svc, daddr, port ^ dport);
639 atomic_inc(&dest->refcnt);
640 ip_vs_service_put(svc);
645 * Lookup dest by {svc,addr,port} in the destination trash.
646 * The destination trash is used to hold the destinations that are removed
647 * from the service table but are still referenced by some conn entries.
648 * The reason to add the destination trash is when the dest is temporary
649 * down (either by administrator or by monitor program), the dest can be
650 * picked back from the trash, the remaining connections to the dest can
651 * continue, and the counting information of the dest is also useful for
/* Scan the per-netns trash list for a dest matching @svc/@daddr/@dport;
 * as a side effect, purge unreferenced trash entries (refcnt == 1, i.e.
 * only the trash itself holds a reference) encountered on the way. */
654 static struct ip_vs_dest *
655 ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
658 struct ip_vs_dest *dest, *nxt;
659 struct netns_ipvs *ipvs = net_ipvs(svc->net);
662 * Find the destination in trash
664 list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
665 IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
668 IP_VS_DBG_ADDR(svc->af, &dest->addr),
670 atomic_read(&dest->refcnt));
671 if (dest->af == svc->af &&
672 ip_vs_addr_equal(svc->af, &dest->addr, daddr) &&
673 dest->port == dport &&
674 dest->vfwmark == svc->fwmark &&
675 dest->protocol == svc->protocol &&
677 (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
678 dest->vport == svc->port))) {
684 * Try to purge the destination from trash if not referenced
686 if (atomic_read(&dest->refcnt) == 1) {
687 IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u "
690 IP_VS_DBG_ADDR(svc->af, &dest->addr),
692 list_del(&dest->n_list);
693 ip_vs_dst_reset(dest);
694 __ip_vs_unbind_svc(dest);
695 free_percpu(dest->stats.cpustats);
705 * Clean up all the destinations in the trash
706 * Called by the ip_vs_control_cleanup()
708 * When the ip_vs_control_cleanup is activated by ipvs module exit,
709 * the service tables must have been flushed and all the connections
710 * are expired, and the refcnt of each destination in the trash must
711 * be 1, so we simply release them here.
/* Drain the trash unconditionally; by the precondition above every entry's
 * only remaining reference is the trash list itself. */
713 static void ip_vs_trash_cleanup(struct net *net)
715 struct ip_vs_dest *dest, *nxt;
716 struct netns_ipvs *ipvs = net_ipvs(net);
718 list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
719 list_del(&dest->n_list);
720 ip_vs_dst_reset(dest);
721 __ip_vs_unbind_svc(dest);
722 free_percpu(dest->stats.cpustats);
/* Copy stats to user-visible form: each counter is reported relative to its
 * zero-point snapshot (ustats0), taken at the last "zero stats" request. */
728 ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
730 #define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->ustats.c - src->ustats0.c
732 spin_lock_bh(&src->lock);
734 IP_VS_SHOW_STATS_COUNTER(conns);
735 IP_VS_SHOW_STATS_COUNTER(inpkts);
736 IP_VS_SHOW_STATS_COUNTER(outpkts);
737 IP_VS_SHOW_STATS_COUNTER(inbytes);
738 IP_VS_SHOW_STATS_COUNTER(outbytes);
740 ip_vs_read_estimator(dst, src);
742 spin_unlock_bh(&src->lock);
/* "Zero" stats without touching the live counters: record the current
 * values as the new zero point and reset the rate estimator. */
746 ip_vs_zero_stats(struct ip_vs_stats *stats)
748 spin_lock_bh(&stats->lock);
750 /* get current counters as zero point, rates are zeroed */
752 #define IP_VS_ZERO_STATS_COUNTER(c) stats->ustats0.c = stats->ustats.c
754 IP_VS_ZERO_STATS_COUNTER(conns);
755 IP_VS_ZERO_STATS_COUNTER(inpkts);
756 IP_VS_ZERO_STATS_COUNTER(outpkts);
757 IP_VS_ZERO_STATS_COUNTER(inbytes);
758 IP_VS_ZERO_STATS_COUNTER(outbytes);
760 ip_vs_zero_estimator(stats);
762 spin_unlock_bh(&stats->lock);
766 * Update a destination in the given service
/*
 * Apply @udest's settings (weight, flags, thresholds) to @dest and (re)bind
 * it to @svc.  When @add is set the dest is linked onto svc->destinations
 * after waiting for all other service users to drain; in both cases the
 * scheduler's update_service callback is invoked so weight changes take
 * effect.  Several lines (conn_flags handling for the !add path, list_add
 * guard) are elided in this extract.
 */
769 __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
770 struct ip_vs_dest_user_kern *udest, int add)
772 struct netns_ipvs *ipvs = net_ipvs(svc->net);
775 /* set the weight and the flags */
776 atomic_set(&dest->weight, udest->weight);
777 conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
778 conn_flags |= IP_VS_CONN_F_INACTIVE;
780 /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
781 if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
782 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
785 * Put the real service in rs_table if not present.
786 * For now only for NAT!
788 write_lock_bh(&ipvs->rs_lock);
789 ip_vs_rs_hash(ipvs, dest);
790 write_unlock_bh(&ipvs->rs_lock);
792 atomic_set(&dest->conn_flags, conn_flags);
794 /* bind the service */
796 __ip_vs_bind_svc(dest, svc);
798 if (dest->svc != svc) {
799 __ip_vs_unbind_svc(dest);
800 ip_vs_zero_stats(&dest->stats);
801 __ip_vs_bind_svc(dest, svc);
805 /* set the dest status flags */
806 dest->flags |= IP_VS_DEST_F_AVAILABLE;
/* raising (or clearing) the upper threshold clears the overload flag */
808 if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
809 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
810 dest->u_threshold = udest->u_threshold;
811 dest->l_threshold = udest->l_threshold;
813 spin_lock_bh(&dest->dst_lock);
814 ip_vs_dst_reset(dest);
815 spin_unlock_bh(&dest->dst_lock);
818 ip_vs_start_estimator(svc->net, &dest->stats);
820 write_lock_bh(&__ip_vs_svc_lock);
822 /* Wait until all other svc users go away */
823 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
826 list_add(&dest->n_list, &svc->destinations);
830 /* call the update_service, because server weight may be changed */
831 if (svc->scheduler->update_service)
832 svc->scheduler->update_service(svc);
834 write_unlock_bh(&__ip_vs_svc_lock);
839 * Create a destination for the given service
/*
 * Allocate and initialize a new ip_vs_dest for @svc from @udest, after
 * validating that the address is local or unicast (IPv4 via
 * inet_addr_type(), IPv6 via address type + __ip_vs_addr_is_local_v6()).
 * Error paths and the assignment to *dest_p are on elided lines.
 */
842 ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
843 struct ip_vs_dest **dest_p)
845 struct ip_vs_dest *dest;
850 #ifdef CONFIG_IP_VS_IPV6
851 if (svc->af == AF_INET6) {
852 atype = ipv6_addr_type(&udest->addr.in6);
853 if ((!(atype & IPV6_ADDR_UNICAST) ||
854 atype & IPV6_ADDR_LINKLOCAL) &&
855 !__ip_vs_addr_is_local_v6(svc->net, &udest->addr.in6))
860 atype = inet_addr_type(svc->net, udest->addr.ip);
861 if (atype != RTN_LOCAL && atype != RTN_UNICAST)
865 dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL);
869 dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
870 if (!dest->stats.cpustats)
/* inherit the virtual-service identity for later trash matching */
874 dest->protocol = svc->protocol;
875 dest->vaddr = svc->addr;
876 dest->vport = svc->port;
877 dest->vfwmark = svc->fwmark;
878 ip_vs_addr_copy(svc->af, &dest->addr, &udest->addr);
879 dest->port = udest->port;
881 atomic_set(&dest->activeconns, 0);
882 atomic_set(&dest->inactconns, 0);
883 atomic_set(&dest->persistconns, 0);
884 atomic_set(&dest->refcnt, 1);
886 INIT_LIST_HEAD(&dest->d_list);
887 spin_lock_init(&dest->dst_lock);
888 spin_lock_init(&dest->stats.lock);
889 __ip_vs_update_dest(svc, dest, udest, 1);
903 * Add a destination into an existing service
/*
 * Validate @udest (non-negative weight, l_threshold <= u_threshold), then:
 * fail if the dest already exists on @svc; resurrect it from the trash if a
 * matching entry is found there; otherwise allocate a fresh one via
 * ip_vs_new_dest().  Return statements are on elided lines.
 */
906 ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
908 struct ip_vs_dest *dest;
909 union nf_inet_addr daddr;
910 __be16 dport = udest->port;
915 if (udest->weight < 0) {
916 pr_err("%s(): server weight less than zero\n", __func__);
920 if (udest->l_threshold > udest->u_threshold) {
921 pr_err("%s(): lower threshold is higher than upper threshold\n",
926 ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
929 * Check if the dest already exists in the list
931 dest = ip_vs_lookup_dest(svc, &daddr, dport);
934 IP_VS_DBG(1, "%s(): dest already exists\n", __func__);
939 * Check if the dest already exists in the trash and
940 * is from the same service
942 dest = ip_vs_trash_get_dest(svc, &daddr, dport);
945 IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, "
946 "dest->refcnt=%d, service %u/%s:%u\n",
947 IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport),
948 atomic_read(&dest->refcnt),
950 IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
954 * Get the destination from the trash
956 list_del(&dest->n_list);
958 __ip_vs_update_dest(svc, dest, udest, 1);
962 * Allocate and initialize the dest structure
964 ret = ip_vs_new_dest(svc, udest, &dest);
973 * Edit a destination in the given service
/* Same validation as ip_vs_add_dest(), but the dest must already exist;
 * its settings are then updated in place (add == 0). */
976 ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
978 struct ip_vs_dest *dest;
979 union nf_inet_addr daddr;
980 __be16 dport = udest->port;
984 if (udest->weight < 0) {
985 pr_err("%s(): server weight less than zero\n", __func__);
989 if (udest->l_threshold > udest->u_threshold) {
990 pr_err("%s(): lower threshold is higher than upper threshold\n",
995 ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
998 * Lookup the destination list
1000 dest = ip_vs_lookup_dest(svc, &daddr, dport);
1003 IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__);
1007 __ip_vs_update_dest(svc, dest, udest, 0);
1015 * Delete a destination (must be already unlinked from the service)
/*
 * Final disposal of an unlinked dest: stop its estimator, unhash it from
 * rs_table, then either free it (refcnt drops to zero) or park it in the
 * per-netns trash so existing connections can keep using it.
 */
1017 static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest)
1019 struct netns_ipvs *ipvs = net_ipvs(net);
1021 ip_vs_stop_estimator(net, &dest->stats);
1024 * Remove it from the d-linked list with the real services.
1026 write_lock_bh(&ipvs->rs_lock);
1027 ip_vs_rs_unhash(dest);
1028 write_unlock_bh(&ipvs->rs_lock);
1031 * Decrease the refcnt of the dest, and free the dest
1032 * if nobody refers to it (refcnt=0). Otherwise, throw
1033 * the destination into the trash.
1035 if (atomic_dec_and_test(&dest->refcnt)) {
1036 IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u\n",
1038 IP_VS_DBG_ADDR(dest->af, &dest->addr),
1040 ip_vs_dst_reset(dest);
1041 /* simply decrease svc->refcnt here, let the caller check
1042 and release the service if nobody refers to it.
1043 Only user context can release destination and service,
1044 and only one user context can update virtual service at a
1045 time, so the operation here is OK */
1046 atomic_dec(&dest->svc->refcnt);
1047 free_percpu(dest->stats.cpustats);
1050 IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, "
1051 "dest->refcnt=%d\n",
1052 IP_VS_DBG_ADDR(dest->af, &dest->addr),
1054 atomic_read(&dest->refcnt));
/* the trash list itself holds a reference */
1055 list_add(&dest->n_list, &ipvs->dest_trash);
1056 atomic_inc(&dest->refcnt);
1062 * Unlink a destination from the given service
/* Mark @dest unavailable, remove it from svc->destinations, and (when
 * @svcupd is set) notify the scheduler so it stops picking this server. */
1064 static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
1065 struct ip_vs_dest *dest,
1068 dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
1071 * Remove it from the d-linked destination list.
1073 list_del(&dest->n_list);
1077 * Call the update_service function of its scheduler
1079 if (svcupd && svc->scheduler->update_service)
1080 svc->scheduler->update_service(svc);
1085 * Delete a destination server in the given service
/* User-facing delete: find the dest, unlink it from @svc under the write
 * lock (after draining other service users), then dispose of it via
 * __ip_vs_del_dest() outside the lock. */
1088 ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1090 struct ip_vs_dest *dest;
1091 __be16 dport = udest->port;
1095 dest = ip_vs_lookup_dest(svc, &udest->addr, dport);
1098 IP_VS_DBG(1, "%s(): destination not found!\n", __func__);
1102 write_lock_bh(&__ip_vs_svc_lock);
1105 * Wait until all other svc users go away.
1107 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1110 * Unlink dest from the service
1112 __ip_vs_unlink_dest(svc, dest, 1);
1114 write_unlock_bh(&__ip_vs_svc_lock);
1117 * Delete the destination
1119 __ip_vs_del_dest(svc->net, dest);
1128 * Add a service into the service hash table
/*
 * Create a new virtual service from @u: resolve the scheduler (and optional
 * persistence engine) by name, validate the IPv6 netmask, allocate the
 * service plus per-cpu stats, bind scheduler/pe, bump the ftp/null service
 * counters, start the estimator and hash the service into the tables.
 * Error-unwind labels and several assignments are on elided lines; the
 * trailing lines below are the goto-cleanup path.
 */
1131 ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
1132 struct ip_vs_service **svc_p)
1135 struct ip_vs_scheduler *sched = NULL;
1136 struct ip_vs_pe *pe = NULL;
1137 struct ip_vs_service *svc = NULL;
1138 struct netns_ipvs *ipvs = net_ipvs(net);
1140 /* increase the module use count */
1141 ip_vs_use_count_inc();
1143 /* Lookup the scheduler by 'u->sched_name' */
1144 sched = ip_vs_scheduler_get(u->sched_name);
1145 if (sched == NULL) {
1146 pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
1151 if (u->pe_name && *u->pe_name) {
1152 pe = ip_vs_pe_getbyname(u->pe_name);
1154 pr_info("persistence engine module ip_vs_pe_%s "
1155 "not found\n", u->pe_name);
1161 #ifdef CONFIG_IP_VS_IPV6
1162 if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
1168 svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL);
1170 IP_VS_DBG(1, "%s(): no memory\n", __func__);
1174 svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
1175 if (!svc->stats.cpustats) {
1180 /* I'm the first user of the service */
1181 atomic_set(&svc->usecnt, 0);
1182 atomic_set(&svc->refcnt, 0);
1185 svc->protocol = u->protocol;
1186 ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
1187 svc->port = u->port;
1188 svc->fwmark = u->fwmark;
1189 svc->flags = u->flags;
1190 svc->timeout = u->timeout * HZ;
1191 svc->netmask = u->netmask;
1194 INIT_LIST_HEAD(&svc->destinations);
1195 rwlock_init(&svc->sched_lock);
1196 spin_lock_init(&svc->stats.lock);
1198 /* Bind the scheduler */
1199 ret = ip_vs_bind_scheduler(svc, sched);
1204 /* Bind the ct retriever */
1205 ip_vs_bind_pe(svc, pe);
1208 /* Update the virtual service counters */
1209 if (svc->port == FTPPORT)
1210 atomic_inc(&ipvs->ftpsvc_counter);
1211 else if (svc->port == 0)
1212 atomic_inc(&ipvs->nullsvc_counter);
1214 ip_vs_start_estimator(net, &svc->stats);
1216 /* Count only IPv4 services for old get/setsockopt interface */
1217 if (svc->af == AF_INET)
1218 ipvs->num_services++;
1220 /* Hash the service into the service table */
1221 write_lock_bh(&__ip_vs_svc_lock);
1222 ip_vs_svc_hash(svc);
1223 write_unlock_bh(&__ip_vs_svc_lock);
1226 /* Now there is a service - full throttle */
/* --- error-unwind path (labels elided): undo in reverse order --- */
1233 ip_vs_unbind_scheduler(svc);
1236 ip_vs_app_inc_put(svc->inc);
1239 if (svc->stats.cpustats)
1240 free_percpu(svc->stats.cpustats);
1243 ip_vs_scheduler_put(sched);
1246 /* decrease the module use count */
1247 ip_vs_use_count_dec();
1254 * Edit a service and bind it with a new scheduler
/*
 * Update @svc's flags/timeout/netmask from @u and, if the named scheduler
 * differs from the current one, swap schedulers under the write lock
 * (restoring the old one if binding the new one fails).  The persistence
 * engine is swapped unconditionally.  Old scheduler/pe references are
 * dropped after the lock is released.
 */
1257 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
1259 struct ip_vs_scheduler *sched, *old_sched;
1260 struct ip_vs_pe *pe = NULL, *old_pe = NULL;
1264 * Lookup the scheduler, by 'u->sched_name'
1266 sched = ip_vs_scheduler_get(u->sched_name);
1267 if (sched == NULL) {
1268 pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
1273 if (u->pe_name && *u->pe_name) {
1274 pe = ip_vs_pe_getbyname(u->pe_name);
1276 pr_info("persistence engine module ip_vs_pe_%s "
1277 "not found\n", u->pe_name);
1284 #ifdef CONFIG_IP_VS_IPV6
1285 if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
1291 write_lock_bh(&__ip_vs_svc_lock);
1294 * Wait until all other svc users go away.
1296 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1299 * Set the flags and timeout value
1301 svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1302 svc->timeout = u->timeout * HZ;
1303 svc->netmask = u->netmask;
1305 old_sched = svc->scheduler;
1306 if (sched != old_sched) {
1308 * Unbind the old scheduler
1310 if ((ret = ip_vs_unbind_scheduler(svc))) {
1316 * Bind the new scheduler
1318 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1320 * If ip_vs_bind_scheduler fails, restore the old
1322 * The main reason of failure is out of memory.
1324 * The question is if the old scheduler can be
1325 * restored all the time. TODO: if it cannot be
1326 * restored some time, we must delete the service,
1327 * otherwise the system may crash.
1329 ip_vs_bind_scheduler(svc, old_sched);
1337 ip_vs_unbind_pe(svc);
1338 ip_vs_bind_pe(svc, pe);
1342 write_unlock_bh(&__ip_vs_svc_lock);
1344 ip_vs_scheduler_put(old_sched);
1345 ip_vs_pe_put(old_pe);
1351 * Delete a service from the service list
1352 * - The service must be unlinked, unlocked and not referenced!
1353 * - We are called under _bh lock
/*
 * Tear down an already-unhashed service: stop its estimator, unbind
 * scheduler/pe/app-inc, unlink and dispose of every destination, fix the
 * ftp/null counters, and free the service if its refcnt is zero.
 *
 * NOTE(review): the pr_info("enter") below looks like leftover debug
 * tracing at info level on every service delete -- candidate for removal
 * or demotion to IP_VS_DBG.
 */
1355 static void __ip_vs_del_service(struct ip_vs_service *svc)
1357 struct ip_vs_dest *dest, *nxt;
1358 struct ip_vs_scheduler *old_sched;
1359 struct ip_vs_pe *old_pe;
1360 struct netns_ipvs *ipvs = net_ipvs(svc->net);
1362 pr_info("%s: enter\n", __func__);
1364 /* Count only IPv4 services for old get/setsockopt interface */
1365 if (svc->af == AF_INET)
1366 ipvs->num_services--;
1368 ip_vs_stop_estimator(svc->net, &svc->stats);
1370 /* Unbind scheduler */
1371 old_sched = svc->scheduler;
1372 ip_vs_unbind_scheduler(svc);
1373 ip_vs_scheduler_put(old_sched);
1375 /* Unbind persistence engine */
1377 ip_vs_unbind_pe(svc);
1378 ip_vs_pe_put(old_pe);
1380 /* Unbind app inc */
1382 ip_vs_app_inc_put(svc->inc);
1387 * Unlink the whole destination list
1389 list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1390 __ip_vs_unlink_dest(svc, dest, 0);
1391 __ip_vs_del_dest(svc->net, dest);
1395 * Update the virtual service counters
1397 if (svc->port == FTPPORT)
1398 atomic_dec(&ipvs->ftpsvc_counter);
1399 else if (svc->port == 0)
1400 atomic_dec(&ipvs->nullsvc_counter);
1403 * Free the service if nobody refers to it
1405 if (atomic_read(&svc->refcnt) == 0) {
1406 IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
1408 IP_VS_DBG_ADDR(svc->af, &svc->addr),
1409 ntohs(svc->port), atomic_read(&svc->usecnt));
1410 free_percpu(svc->stats.cpustats);
1414 /* decrease the module use count */
1415 ip_vs_use_count_dec();
1419 * Unlink a service from list and try to delete it if its refcnt reached 0
/* Unhash @svc under the write lock, wait for all users to drain, then
 * run the full teardown while still holding the lock. */
1421 static void ip_vs_unlink_service(struct ip_vs_service *svc)
1424 * Unhash it from the service table
1426 write_lock_bh(&__ip_vs_svc_lock);
1428 ip_vs_svc_unhash(svc);
1431 * Wait until all the svc users go away.
1433 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1435 __ip_vs_del_service(svc);
1437 write_unlock_bh(&__ip_vs_svc_lock);
1441 * Delete a service from the service list
/* Thin wrapper around ip_vs_unlink_service() (NULL check and return
 * value are on elided lines). */
1443 static int ip_vs_del_service(struct ip_vs_service *svc)
1447 ip_vs_unlink_service(svc);
1454 * Flush all the virtual services
/* Remove every service belonging to @net from both hash tables.  Uses the
 * _safe iterator because ip_vs_unlink_service() deletes entries. */
1456 static int ip_vs_flush(struct net *net)
1459 struct ip_vs_service *svc, *nxt;
1462 * Flush the service table hashed by <netns,protocol,addr,port>
1464 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1465 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx],
1467 if (net_eq(svc->net, net))
1468 ip_vs_unlink_service(svc);
1473 * Flush the service table hashed by fwmark
1475 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1476 list_for_each_entry_safe(svc, nxt,
1477 &ip_vs_svc_fwm_table[idx], f_list) {
1478 if (net_eq(svc->net, net))
1479 ip_vs_unlink_service(svc);
1487 * Delete service by {netns} in the service table.
1488 * Called by __ip_vs_cleanup()
/* Per-netns teardown: flush all services for @net under the config mutex
 * (the ip_vs_flush() call itself is on an elided line). */
1490 void ip_vs_service_net_cleanup(struct net *net)
1493 /* Check for "full" addressed entries */
1494 mutex_lock(&__ip_vs_mutex)
1496 mutex_unlock(&__ip_vs_mutex);
1500 * Release dst hold by dst_cache
/* If @dest's cached route points at @dev, drop it (under dst_lock) so the
 * device can be unregistered without a dangling reference. */
1503 __ip_vs_dev_reset(struct ip_vs_dest *dest, struct net_device *dev)
1505 spin_lock_bh(&dest->dst_lock);
1506 if (dest->dst_cache && dest->dst_cache->dev == dev) {
1507 IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n",
1509 IP_VS_DBG_ADDR(dest->af, &dest->addr),
1511 atomic_read(&dest->refcnt));
1512 ip_vs_dst_reset(dest);
1514 spin_unlock_bh(&dest->dst_lock);
1518 * Netdev event receiver
1519 * Currently only NETDEV_UNREGISTER is handled, i.e. if we hold a reference to
1520 * a device that is "unregister" it must be released.
/* On NETDEV_UNREGISTER, walk both service tables (all destinations of
 * services in this netns) and the trash, resetting any cached route that
 * points at the departing device. */
1522 static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
1525 struct net_device *dev = ptr;
1526 struct net *net = dev_net(dev);
1527 struct netns_ipvs *ipvs = net_ipvs(net);
1528 struct ip_vs_service *svc;
1529 struct ip_vs_dest *dest;
1532 if (event != NETDEV_UNREGISTER || !ipvs)
1534 IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name);
1536 mutex_lock(&__ip_vs_mutex);
1537 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1538 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1539 if (net_eq(svc->net, net)) {
1540 list_for_each_entry(dest, &svc->destinations,
1542 __ip_vs_dev_reset(dest, dev);
1547 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1548 if (net_eq(svc->net, net)) {
1549 list_for_each_entry(dest, &svc->destinations,
1551 __ip_vs_dev_reset(dest, dev);
1558 list_for_each_entry(dest, &ipvs->dest_trash, n_list) {
1559 __ip_vs_dev_reset(dest, dev);
1561 mutex_unlock(&__ip_vs_mutex);
1567 * Zero counters in a service or all services
/*
 * Clear the statistics of one service and of each of its destinations.
 * Takes the service table rwlock for writing so readers (e.g. /proc
 * seq output) never observe a half-zeroed set of counters.
 * NOTE(review): excerpt is truncated -- interior lines are missing here.
 */
1569 static int ip_vs_zero_service(struct ip_vs_service *svc)
1571 struct ip_vs_dest *dest;
1573 write_lock_bh(&__ip_vs_svc_lock);
1574 list_for_each_entry(dest, &svc->destinations, n_list) {
1575 ip_vs_zero_stats(&dest->stats);
1577 ip_vs_zero_stats(&svc->stats);
1578 write_unlock_bh(&__ip_vs_svc_lock);
/*
 * Zero the counters of every service belonging to @net (both hash
 * tables), then the netns-wide aggregate tot_stats.
 * NOTE(review): excerpt is truncated -- interior lines are missing here.
 */
1582 static int ip_vs_zero_all(struct net *net)
1585 struct ip_vs_service *svc;
1587 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1588 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1589 if (net_eq(svc->net, net))
1590 ip_vs_zero_service(svc);
1594 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1595 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1596 if (net_eq(svc->net, net))
1597 ip_vs_zero_service(svc);
1601 ip_vs_zero_stats(&net_ipvs(net)->tot_stats);
1605 #ifdef CONFIG_SYSCTL
/* Upper bound constant for a proc_dointvec_minmax handler -- presumably
 * the .extra2 limit of the "sync_retries" entry below; TODO confirm
 * (the .extra lines are not visible in this excerpt). */
1608 static int three = 3;
/*
 * sysctl handler for the defense-mode knobs (drop_entry, drop_packet,
 * secure_tcp).  Accepts only 0..3; on an out-of-range write the previous
 * value is restored, otherwise the defense level is re-evaluated.
 * NOTE(review): excerpt is truncated -- interior lines are missing here.
 */
1611 proc_do_defense_mode(ctl_table *table, int write,
1612 void __user *buffer, size_t *lenp, loff_t *ppos)
1614 struct net *net = current->nsproxy->net_ns;
1615 int *valp = table->data;
1619 rc = proc_dointvec(table, write, buffer, lenp, ppos);
1620 if (write && (*valp != val)) {
1621 if ((*valp < 0) || (*valp > 3)) {
1622 /* Restore the correct value */
1625 update_defense_level(net_ipvs(net));
/*
 * sysctl handler for "sync_threshold": a pair of ints where both must be
 * non-negative and threshold (valp[0]) must be below the period (valp[1])
 * whenever the period is non-zero.  Invalid writes are rolled back from
 * the saved copy.
 * NOTE(review): excerpt is truncated -- interior lines are missing here.
 */
1632 proc_do_sync_threshold(ctl_table *table, int write,
1633 void __user *buffer, size_t *lenp, loff_t *ppos)
1635 int *valp = table->data;
1639 /* backup the value first */
1640 memcpy(val, valp, sizeof(val))
1642 rc = proc_dointvec(table, write, buffer, lenp, ppos);
1643 if (write && (valp[0] < 0 || valp[1] < 0 ||
1644 (valp[0] >= valp[1] && valp[1]))) {
1645 /* Restore the correct value */
1646 memcpy(valp, val, sizeof(val));
/*
 * sysctl handler for "sync_version": boolean-like, only 0 or 1 accepted;
 * an out-of-range write restores the previous value.
 * NOTE(review): excerpt is truncated -- interior lines are missing here.
 */
1652 proc_do_sync_mode(ctl_table *table, int write,
1653 void __user *buffer, size_t *lenp, loff_t *ppos)
1655 int *valp = table->data;
1659 rc = proc_dointvec(table, write, buffer, lenp, ppos);
1660 if (write && (*valp != val)) {
1661 if ((*valp < 0) || (*valp > 1)) {
1662 /* Restore the correct value */
/*
 * sysctl handler for "sync_ports": must be a power of two and >= 1,
 * otherwise the previous value is restored.
 * NOTE(review): excerpt is truncated -- interior lines are missing here.
 */
1670 proc_do_sync_ports(ctl_table *table, int write,
1671 void __user *buffer, size_t *lenp, loff_t *ppos)
1673 int *valp = table->data;
1677 rc = proc_dointvec(table, write, buffer, lenp, ppos);
1678 if (write && (*valp != val)) {
1679 if (*valp < 1 || !is_power_of_2(*valp)) {
1680 /* Restore the correct value */
1688 * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1689 * Do not change order or insert new entries without
1690 * align with netns init in ip_vs_control_net_init()
/*
 * Entry order is ABI with ip_vs_control_net_init(), which indexes into a
 * per-netns copy of this table -- see the warning above.
 * NOTE(review): excerpt is truncated -- the per-entry braces and several
 * .data/.mode fields are missing from this view; code kept byte-identical.
 */
1693 static struct ctl_table vs_vars[] = {
1695 .procname = "amemthresh",
1696 .maxlen = sizeof(int),
1698 .proc_handler = proc_dointvec,
1701 .procname = "am_droprate",
1702 .maxlen = sizeof(int),
1704 .proc_handler = proc_dointvec,
1707 .procname = "drop_entry",
1708 .maxlen = sizeof(int),
1710 .proc_handler = proc_do_defense_mode,
1713 .procname = "drop_packet",
1714 .maxlen = sizeof(int),
1716 .proc_handler = proc_do_defense_mode,
1718 #ifdef CONFIG_IP_VS_NFCT
1720 .procname = "conntrack",
1721 .maxlen = sizeof(int),
1723 .proc_handler = &proc_dointvec,
1727 .procname = "secure_tcp",
1728 .maxlen = sizeof(int),
1730 .proc_handler = proc_do_defense_mode,
1733 .procname = "snat_reroute",
1734 .maxlen = sizeof(int),
1736 .proc_handler = &proc_dointvec,
1739 .procname = "sync_version",
1740 .maxlen = sizeof(int),
1742 .proc_handler = &proc_do_sync_mode,
1745 .procname = "sync_ports",
1746 .maxlen = sizeof(int),
1748 .proc_handler = &proc_do_sync_ports,
1751 .procname = "sync_qlen_max",
1752 .maxlen = sizeof(int),
1754 .proc_handler = proc_dointvec,
1757 .procname = "sync_sock_size",
1758 .maxlen = sizeof(int),
1760 .proc_handler = proc_dointvec,
1763 .procname = "cache_bypass",
1764 .maxlen = sizeof(int),
1766 .proc_handler = proc_dointvec,
1769 .procname = "expire_nodest_conn",
1770 .maxlen = sizeof(int),
1772 .proc_handler = proc_dointvec,
1775 .procname = "expire_quiescent_template",
1776 .maxlen = sizeof(int),
1778 .proc_handler = proc_dointvec,
/* Two-int entry; its size comes from the netns field it mirrors. */
1781 .procname = "sync_threshold",
1783 sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold),
1785 .proc_handler = proc_do_sync_threshold,
1788 .procname = "sync_refresh_period",
1789 .maxlen = sizeof(int),
1791 .proc_handler = proc_dointvec_jiffies,
1794 .procname = "sync_retries",
1795 .maxlen = sizeof(int),
1797 .proc_handler = proc_dointvec_minmax,
1802 .procname = "nat_icmp_send",
1803 .maxlen = sizeof(int),
1805 .proc_handler = proc_dointvec,
1808 .procname = "pmtu_disc",
1809 .maxlen = sizeof(int),
1811 .proc_handler = proc_dointvec,
1814 .procname = "backup_only",
1815 .maxlen = sizeof(int),
1817 .proc_handler = proc_dointvec,
1819 #ifdef CONFIG_IP_VS_DEBUG
1821 .procname = "debug_level",
1822 .data = &sysctl_ip_vs_debug_level,
1823 .maxlen = sizeof(int),
1825 .proc_handler = proc_dointvec,
/* The timeout_* entries below point at vs_timeout_table_dos; presumably
 * compile-gated elsewhere -- the guarding #if is not visible here. */
1830 .procname = "timeout_established",
1831 .data = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1832 .maxlen = sizeof(int),
1834 .proc_handler = proc_dointvec_jiffies,
1837 .procname = "timeout_synsent",
1838 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1839 .maxlen = sizeof(int),
1841 .proc_handler = proc_dointvec_jiffies,
1844 .procname = "timeout_synrecv",
1845 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1846 .maxlen = sizeof(int),
1848 .proc_handler = proc_dointvec_jiffies,
1851 .procname = "timeout_finwait",
1852 .data = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1853 .maxlen = sizeof(int),
1855 .proc_handler = proc_dointvec_jiffies,
1858 .procname = "timeout_timewait",
1859 .data = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1860 .maxlen = sizeof(int),
1862 .proc_handler = proc_dointvec_jiffies,
1865 .procname = "timeout_close",
1866 .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1867 .maxlen = sizeof(int),
1869 .proc_handler = proc_dointvec_jiffies,
1872 .procname = "timeout_closewait",
1873 .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1874 .maxlen = sizeof(int),
1876 .proc_handler = proc_dointvec_jiffies,
1879 .procname = "timeout_lastack",
1880 .data = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1881 .maxlen = sizeof(int),
1883 .proc_handler = proc_dointvec_jiffies,
1886 .procname = "timeout_listen",
1887 .data = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1888 .maxlen = sizeof(int),
1890 .proc_handler = proc_dointvec_jiffies,
1893 .procname = "timeout_synack",
1894 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1895 .maxlen = sizeof(int),
1897 .proc_handler = proc_dointvec_jiffies,
1900 .procname = "timeout_udp",
1901 .data = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1902 .maxlen = sizeof(int),
1904 .proc_handler = proc_dointvec_jiffies,
1907 .procname = "timeout_icmp",
1908 .data = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1909 .maxlen = sizeof(int),
1911 .proc_handler = proc_dointvec_jiffies,
1919 #ifdef CONFIG_PROC_FS
/* Members of the seq_file iterator state (presumably struct ip_vs_iter --
 * its opening line is not visible in this excerpt; the later seq code
 * reads iter->table and iter->bucket). */
1922 struct seq_net_private p; /* Do not move this, netns depends upon it*/
1923 struct list_head *table;
1928 * Write the contents of the VS rule table to a PROCfs file.
1929 * (It is kept just for backward compatibility)
/*
 * Map connection-forwarding flag bits to a short display name for /proc
 * output.  Only the FWD_MASK bits are examined.
 * NOTE(review): excerpt is truncated -- the return statements and default
 * case are not visible here.
 */
1931 static inline const char *ip_vs_fwd_name(unsigned int flags)
1933 switch (flags & IP_VS_CONN_F_FWD_MASK) {
1934 case IP_VS_CONN_F_LOCALNODE:
1936 case IP_VS_CONN_F_TUNNEL:
1938 case IP_VS_CONN_F_DROUTE:
1946 /* Get the Nth entry in the two lists */
/*
 * seq_file helper: find the pos-th service for this netns, scanning the
 * protocol-hashed table first and then the fwmark-hashed table.  Records
 * which table the hit came from in iter->table so _next() can resume.
 * NOTE(review): excerpt is truncated -- returns/braces are missing here.
 */
1947 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1949 struct net *net = seq_file_net(seq);
1950 struct ip_vs_iter *iter = seq->private;
1952 struct ip_vs_service *svc;
1954 /* look in hash by protocol */
1955 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1956 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1957 if (net_eq(svc->net, net) && pos-- == 0) {
1958 iter->table = ip_vs_svc_table;
1965 /* keep looking in fwmark */
1966 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1967 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1968 if (net_eq(svc->net, net) && pos-- == 0) {
1969 iter->table = ip_vs_svc_fwm_table;
/*
 * seq_file .start: take the service rwlock for reading (released in
 * .stop) and position the iterator; *pos == 0 yields the header token.
 */
1979 static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1980 __acquires(__ip_vs_svc_lock)
1983 read_lock_bh(&__ip_vs_svc_lock);
1984 return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
/*
 * seq_file .next: advance within the current hash bucket, then to later
 * buckets of the same table, and finally switch from the protocol table
 * to the fwmark table.
 * NOTE(review): excerpt is truncated -- returns/braces are missing here.
 */
1988 static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1990 struct list_head *e;
1991 struct ip_vs_iter *iter;
1992 struct ip_vs_service *svc;
1995 if (v == SEQ_START_TOKEN)
1996 return ip_vs_info_array(seq,0);
1999 iter = seq->private;
2001 if (iter->table == ip_vs_svc_table) {
2002 /* next service in table hashed by protocol */
2003 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
2004 return list_entry(e, struct ip_vs_service, s_list);
2007 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
2008 list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
2014 iter->table = ip_vs_svc_fwm_table;
2019 /* next service in hashed by fwmark */
2020 if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
2021 return list_entry(e, struct ip_vs_service, f_list);
2024 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
2025 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
/* seq_file .stop: drop the read lock taken in ip_vs_info_seq_start(). */
2033 static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
2034 __releases(__ip_vs_svc_lock)
2036 read_unlock_bh(&__ip_vs_svc_lock);
/*
 * seq_file .show: print the legacy /proc/net/ip_vs listing -- a version
 * banner for the start token, otherwise one service line (address form
 * for protocol-hashed services, "FWM" form for fwmark services) followed
 * by one line per destination.
 * NOTE(review): excerpt is truncated -- some format arguments and braces
 * are missing from this view.
 */
2040 static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
2042 if (v == SEQ_START_TOKEN) {
2044 "IP Virtual Server version %d.%d.%d (size=%d)\n",
2045 NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
2047 "Prot LocalAddress:Port Scheduler Flags\n");
2049 " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
2051 const struct ip_vs_service *svc = v;
2052 const struct ip_vs_iter *iter = seq->private;
2053 const struct ip_vs_dest *dest;
2055 if (iter->table == ip_vs_svc_table) {
2056 #ifdef CONFIG_IP_VS_IPV6
2057 if (svc->af == AF_INET6)
2058 seq_printf(seq, "%s [%pI6]:%04X %s ",
2059 ip_vs_proto_name(svc->protocol),
2062 svc->scheduler->name);
2065 seq_printf(seq, "%s %08X:%04X %s %s ",
2066 ip_vs_proto_name(svc->protocol),
2067 ntohl(svc->addr.ip),
2069 svc->scheduler->name,
2070 (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
2072 seq_printf(seq, "FWM %08X %s %s",
2073 svc->fwmark, svc->scheduler->name,
2074 (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
2077 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
2078 seq_printf(seq, "persistent %d %08X\n",
2080 ntohl(svc->netmask));
2082 seq_putc(seq, '\n');
/* One "  -> addr:port fwd weight active inact" line per destination. */
2084 list_for_each_entry(dest, &svc->destinations, n_list) {
2085 #ifdef CONFIG_IP_VS_IPV6
2086 if (dest->af == AF_INET6)
2089 " %-7s %-6d %-10d %-10d\n",
2092 ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
2093 atomic_read(&dest->weight),
2094 atomic_read(&dest->activeconns),
2095 atomic_read(&dest->inactconns));
2100 "%-7s %-6d %-10d %-10d\n",
2101 ntohl(dest->addr.ip),
2103 ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
2104 atomic_read(&dest->weight),
2105 atomic_read(&dest->activeconns),
2106 atomic_read(&dest->inactconns));
/* seq_file operations for the legacy /proc/net/ip_vs listing. */
2113 static const struct seq_operations ip_vs_info_seq_ops = {
2114 .start = ip_vs_info_seq_start,
2115 .next  = ip_vs_info_seq_next,
2116 .stop  = ip_vs_info_seq_stop,
2117 .show  = ip_vs_info_seq_show,
/* open() for /proc/net/ip_vs: netns-aware seq_file with ip_vs_iter state. */
2120 static int ip_vs_info_open(struct inode *inode, struct file *file)
2122 return seq_open_net(inode, file, &ip_vs_info_seq_ops,
2123 sizeof(struct ip_vs_iter));
/* file_operations for /proc/net/ip_vs (read is the seq_file default;
 * the .read line is not visible in this excerpt). */
2126 static const struct file_operations ip_vs_info_fops = {
2127 .owner = THIS_MODULE,
2128 .open = ip_vs_info_open,
2130 .llseek = seq_lseek,
2131 .release = seq_release_net,
/*
 * single_open show routine for /proc/net/ip_vs_stats: snapshot the
 * netns-wide totals into a userspace-layout struct and print the totals
 * block followed by the per-second rate block.
 * NOTE(review): excerpt is truncated -- seq_puts calls and braces are
 * missing from this view.
 */
2134 static int ip_vs_stats_show(struct seq_file *seq, void *v)
2136 struct net *net = seq_file_single_net(seq);
2137 struct ip_vs_stats_user show;
2139 /*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
2141 "   Total Incoming Outgoing         Incoming         Outgoing\n");
2143 "   Conns  Packets  Packets            Bytes            Bytes\n");
2145 ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats);
2146 seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", show.conns,
2147 show.inpkts, show.outpkts,
2148 (unsigned long long) show.inbytes,
2149 (unsigned long long) show.outbytes);
2151 /*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
2153 " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
2154 seq_printf(seq, "%8X %8X %8X %16X %16X\n",
2155 show.cps, show.inpps, show.outpps,
2156 show.inbps, show.outbps);
/* open() for /proc/net/ip_vs_stats (single-record, netns-aware). */
2161 static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
2163 return single_open_net(inode, file, ip_vs_stats_show);
/* file_operations for /proc/net/ip_vs_stats (the .read line is not
 * visible in this excerpt). */
2166 static const struct file_operations ip_vs_stats_fops = {
2167 .owner = THIS_MODULE,
2168 .open = ip_vs_stats_seq_open,
2170 .llseek = seq_lseek,
2171 .release = single_release_net,
/*
 * single_open show routine for /proc/net/ip_vs_stats_percpu: one line
 * per possible CPU (byte counters read under the u64_stats seqcount so
 * 64-bit values are torn-read safe on 32-bit), then the aggregated
 * totals and estimator rates under tot_stats->lock.
 * NOTE(review): excerpt is truncated -- several seq_puts/format args and
 * braces are missing from this view.
 */
2174 static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
2176 struct net *net = seq_file_single_net(seq);
2177 struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats;
2178 struct ip_vs_cpu_stats *cpustats = tot_stats->cpustats;
2179 struct ip_vs_stats_user rates;
2182 /*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
2184 "       Total Incoming Outgoing         Incoming         Outgoing\n");
2186 "CPU    Conns  Packets  Packets            Bytes            Bytes\n");
2188 for_each_possible_cpu(i) {
2189 struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i);
2191 __u64 inbytes, outbytes;
/* Retry loop: consistent snapshot of the 64-bit byte counters. */
2194 start = u64_stats_fetch_begin_bh(&u->syncp);
2195 inbytes = u->ustats.inbytes;
2196 outbytes = u->ustats.outbytes;
2197 } while (u64_stats_fetch_retry_bh(&u->syncp, start));
2199 seq_printf(seq, "%3X %8X %8X %8X %16LX %16LX\n",
2200 i, u->ustats.conns, u->ustats.inpkts,
2201 u->ustats.outpkts, (__u64)inbytes,
2205 spin_lock_bh(&tot_stats->lock);
2207 seq_printf(seq, "  ~ %8X %8X %8X %16LX %16LX\n\n",
2208 tot_stats->ustats.conns, tot_stats->ustats.inpkts,
2209 tot_stats->ustats.outpkts,
2210 (unsigned long long) tot_stats->ustats.inbytes,
2211 (unsigned long long) tot_stats->ustats.outbytes);
2213 ip_vs_read_estimator(&rates, tot_stats);
2215 spin_unlock_bh(&tot_stats->lock);
2217 /*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
2219 "     Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
2220 seq_printf(seq, "    %8X %8X %8X %16X %16X\n",
/* open() for /proc/net/ip_vs_stats_percpu (single-record, netns-aware). */
2230 static int ip_vs_stats_percpu_seq_open(struct inode *inode, struct file *file)
2232 return single_open_net(inode, file, ip_vs_stats_percpu_show);
/* file_operations for /proc/net/ip_vs_stats_percpu (the .read line is
 * not visible in this excerpt). */
2235 static const struct file_operations ip_vs_stats_percpu_fops = {
2236 .owner = THIS_MODULE,
2237 .open = ip_vs_stats_percpu_seq_open,
2239 .llseek = seq_lseek,
2240 .release = single_release_net,
2245 * Set timeout values for tcp tcpfin udp in the timeout_table.
/*
 * Apply user-supplied protocol timeouts (seconds, converted to jiffies).
 * A zero field means "leave that timeout unchanged".  Each protocol is
 * compile-gated; with neither TCP nor UDP support the function body is
 * effectively empty.
 * NOTE(review): excerpt is truncated -- braces/return are missing here.
 */
2247 static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u)
2249 #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
2250 struct ip_vs_proto_data *pd;
2253 IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
2258 #ifdef CONFIG_IP_VS_PROTO_TCP
2259 if (u->tcp_timeout) {
2260 pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
2261 pd->timeout_table[IP_VS_TCP_S_ESTABLISHED]
2262 = u->tcp_timeout * HZ;
2265 if (u->tcp_fin_timeout) {
2266 pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
2267 pd->timeout_table[IP_VS_TCP_S_FIN_WAIT]
2268 = u->tcp_fin_timeout * HZ;
2272 #ifdef CONFIG_IP_VS_PROTO_UDP
2273 if (u->udp_timeout) {
2274 pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
2275 pd->timeout_table[IP_VS_UDP_S_NORMAL]
2276 = u->udp_timeout * HZ;
/* Expected setsockopt argument length per IP_VS_SO_SET_* command,
 * indexed by command offset from IP_VS_BASE_CTL; used by
 * do_ip_vs_set_ctl() to reject malformed requests. */
2283 #define SET_CMDID(cmd)		(cmd - IP_VS_BASE_CTL)
2284 #define SERVICE_ARG_LEN		(sizeof(struct ip_vs_service_user))
2285 #define SVCDEST_ARG_LEN		(sizeof(struct ip_vs_service_user) +	\
2286 sizeof(struct ip_vs_dest_user))
2287 #define TIMEOUT_ARG_LEN		(sizeof(struct ip_vs_timeout_user))
2288 #define DAEMON_ARG_LEN		(sizeof(struct ip_vs_daemon_user))
2289 #define MAX_ARG_LEN		SVCDEST_ARG_LEN
2291 static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
2292 [SET_CMDID(IP_VS_SO_SET_ADD)]		= SERVICE_ARG_LEN,
2293 [SET_CMDID(IP_VS_SO_SET_EDIT)]		= SERVICE_ARG_LEN,
2294 [SET_CMDID(IP_VS_SO_SET_DEL)]		= SERVICE_ARG_LEN,
2295 [SET_CMDID(IP_VS_SO_SET_FLUSH)]		= 0,
2296 [SET_CMDID(IP_VS_SO_SET_ADDDEST)]	= SVCDEST_ARG_LEN,
2297 [SET_CMDID(IP_VS_SO_SET_DELDEST)]	= SVCDEST_ARG_LEN,
2298 [SET_CMDID(IP_VS_SO_SET_EDITDEST)]	= SVCDEST_ARG_LEN,
2299 [SET_CMDID(IP_VS_SO_SET_TIMEOUT)]	= TIMEOUT_ARG_LEN,
2300 [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)]	= DAEMON_ARG_LEN,
2301 [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)]	= DAEMON_ARG_LEN,
2302 [SET_CMDID(IP_VS_SO_SET_ZERO)]		= SERVICE_ARG_LEN,
/*
 * Convert the old sockopt service struct into the kernel-internal
 * extended form.  The target is zeroed first, so fields the compat
 * struct lacks (e.g. af, pe_name) stay 0/NULL; sched_name is aliased,
 * not copied (caller keeps the compat buffer alive).
 */
2305 static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
2306 struct ip_vs_service_user *usvc_compat)
2308 memset(usvc, 0, sizeof(*usvc));
2311 usvc->protocol = usvc_compat->protocol;
2312 usvc->addr.ip = usvc_compat->addr;
2313 usvc->port = usvc_compat->port;
2314 usvc->fwmark = usvc_compat->fwmark;
2316 /* Deep copy of sched_name is not needed here */
2317 usvc->sched_name = usvc_compat->sched_name;
2319 usvc->flags = usvc_compat->flags;
2320 usvc->timeout = usvc_compat->timeout;
2321 usvc->netmask = usvc_compat->netmask;
/*
 * Convert the old sockopt destination struct into the kernel-internal
 * extended form; unset extended fields remain zero from the memset.
 */
2324 static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
2325 struct ip_vs_dest_user *udest_compat)
2327 memset(udest, 0, sizeof(*udest));
2329 udest->addr.ip = udest_compat->addr;
2330 udest->port = udest_compat->port;
2331 udest->conn_flags = udest_compat->conn_flags;
2332 udest->weight = udest_compat->weight;
2333 udest->u_threshold = udest_compat->u_threshold;
2334 udest->l_threshold = udest_compat->l_threshold;
/*
 * setsockopt() entry point for all IP_VS_SO_SET_* commands.
 * Flow: capability + command/length validation, copy the argument from
 * userspace, bump the module use count, then dispatch.  Sync-daemon
 * start/stop uses ipvs->sync_mutex; everything else runs under the
 * global __ip_vs_mutex.  Service-addressed commands first translate the
 * compat structs and look up the target service.
 * NOTE(review): excerpt is truncated -- error paths, several case labels
 * and braces are missing from this view.
 */
2338 do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
2340 struct net *net = sock_net(sk);
2342 unsigned char arg[MAX_ARG_LEN];
2343 struct ip_vs_service_user *usvc_compat;
2344 struct ip_vs_service_user_kern usvc;
2345 struct ip_vs_service *svc;
2346 struct ip_vs_dest_user *udest_compat;
2347 struct ip_vs_dest_user_kern udest;
2348 struct netns_ipvs *ipvs = net_ipvs(net);
2350 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2353 if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX)
2355 if (len < 0 || len > MAX_ARG_LEN)
2357 if (len != set_arglen[SET_CMDID(cmd)]) {
2358 pr_err("set_ctl: len %u != %u\n",
2359 len, set_arglen[SET_CMDID(cmd)]);
2363 if (copy_from_user(arg, user, len) != 0)
2366 /* increase the module use count */
2367 ip_vs_use_count_inc();
2369 /* Handle daemons since they have another lock */
2370 if (cmd == IP_VS_SO_SET_STARTDAEMON ||
2371 cmd == IP_VS_SO_SET_STOPDAEMON) {
2372 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
2374 if (mutex_lock_interruptible(&ipvs->sync_mutex)) {
2378 if (cmd == IP_VS_SO_SET_STARTDAEMON)
2379 ret = start_sync_thread(net, dm->state, dm->mcast_ifn,
2382 ret = stop_sync_thread(net, dm->state);
2383 mutex_unlock(&ipvs->sync_mutex);
2387 if (mutex_lock_interruptible(&__ip_vs_mutex)) {
2392 if (cmd == IP_VS_SO_SET_FLUSH) {
2393 /* Flush the virtual service */
2394 ret = ip_vs_flush(net);
2396 } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
2397 /* Set timeout values for (tcp tcpfin udp) */
2398 ret = ip_vs_set_timeout(net, (struct ip_vs_timeout_user *)arg);
/* Remaining commands carry a service (optionally followed by a dest). */
2402 usvc_compat = (struct ip_vs_service_user *)arg;
2403 udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1);
2405 /* We only use the new structs internally, so copy userspace compat
2406 * structs to extended internal versions */
2407 ip_vs_copy_usvc_compat(&usvc, usvc_compat);
2408 ip_vs_copy_udest_compat(&udest, udest_compat);
2410 if (cmd == IP_VS_SO_SET_ZERO) {
2411 /* if no service address is set, zero counters in all */
2412 if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
2413 ret = ip_vs_zero_all(net);
2418 /* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */
2419 if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP &&
2420 usvc.protocol != IPPROTO_SCTP) {
2421 pr_err("set_ctl: invalid protocol: %d %pI4:%d %s\n",
2422 usvc.protocol, &usvc.addr.ip,
2423 ntohs(usvc.port), usvc.sched_name);
2428 /* Lookup the exact service by <protocol, addr, port> or fwmark */
2429 if (usvc.fwmark == 0)
2430 svc = __ip_vs_service_find(net, usvc.af, usvc.protocol,
2431 &usvc.addr, usvc.port);
2433 svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark);
2435 if (cmd != IP_VS_SO_SET_ADD
2436 && (svc == NULL || svc->protocol != usvc.protocol)) {
2442 case IP_VS_SO_SET_ADD:
2446 ret = ip_vs_add_service(net, &usvc, &svc);
2448 case IP_VS_SO_SET_EDIT:
2449 ret = ip_vs_edit_service(svc, &usvc);
2451 case IP_VS_SO_SET_DEL:
2452 ret = ip_vs_del_service(svc);
2456 case IP_VS_SO_SET_ZERO:
2457 ret = ip_vs_zero_service(svc);
2459 case IP_VS_SO_SET_ADDDEST:
2460 ret = ip_vs_add_dest(svc, &udest);
2462 case IP_VS_SO_SET_EDITDEST:
2463 ret = ip_vs_edit_dest(svc, &udest);
2465 case IP_VS_SO_SET_DELDEST:
2466 ret = ip_vs_del_dest(svc, &udest);
2473 mutex_unlock(&__ip_vs_mutex);
2475 /* decrease the module use count */
2476 ip_vs_use_count_dec();
/*
 * Fill the legacy (IPv4-only) userspace service entry from the internal
 * service: scalar fields copied, timeout converted jiffies -> seconds,
 * scheduler name bounded by strlcpy, stats snapshotted.
 */
2483 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2485 dst->protocol = src->protocol;
2486 dst->addr = src->addr.ip;
2487 dst->port = src->port;
2488 dst->fwmark = src->fwmark;
2489 strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
2490 dst->flags = src->flags;
2491 dst->timeout = src->timeout / HZ;
2492 dst->netmask = src->netmask;
2493 dst->num_dests = src->num_dests;
2494 ip_vs_copy_stats(&dst->stats, &src->stats);
/*
 * Copy up to get->num_services service entries to userspace for the
 * legacy getsockopt interface.  Only AF_INET services of this netns are
 * exposed; both hash tables are walked and each entry is zeroed before
 * filling so no kernel stack bytes leak to userspace.
 * NOTE(review): excerpt is truncated -- the count increment, error
 * handling and return are missing from this view.
 */
2498 __ip_vs_get_service_entries(struct net *net,
2499 const struct ip_vs_get_services *get,
2500 struct ip_vs_get_services __user *uptr)
2503 struct ip_vs_service *svc;
2504 struct ip_vs_service_entry entry;
2507 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2508 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2509 /* Only expose IPv4 entries to old interface */
2510 if (svc->af != AF_INET || !net_eq(svc->net, net))
2513 if (count >= get->num_services)
2515 memset(&entry, 0, sizeof(entry));
2516 ip_vs_copy_service(&entry, svc);
2517 if (copy_to_user(&uptr->entrytable[count],
2518 &entry, sizeof(entry))) {
/* Second pass over the fwmark-hashed table. */
2526 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2527 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2528 /* Only expose IPv4 entries to old interface */
2529 if (svc->af != AF_INET || !net_eq(svc->net, net))
2532 if (count >= get->num_services)
2534 memset(&entry, 0, sizeof(entry));
2535 ip_vs_copy_service(&entry, svc);
2536 if (copy_to_user(&uptr->entrytable[count],
2537 &entry, sizeof(entry))) {
/*
 * Copy up to get->num_dests destination entries of one service to
 * userspace for the legacy getsockopt interface.  The service is found
 * by fwmark when set, otherwise by <AF_INET, protocol, addr, port>.
 * NOTE(review): excerpt is truncated -- entry memset, count handling and
 * return are missing from this view; confirm 'entry' is fully
 * initialized in the complete source before it is copied out.
 */
2549 __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get,
2550 struct ip_vs_get_dests __user *uptr)
2552 struct ip_vs_service *svc;
2553 union nf_inet_addr addr = { .ip = get->addr };
2557 svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark);
2559 svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr,
2564 struct ip_vs_dest *dest;
2565 struct ip_vs_dest_entry entry;
2567 list_for_each_entry(dest, &svc->destinations, n_list) {
2568 if (count >= get->num_dests)
2571 entry.addr = dest->addr.ip;
2572 entry.port = dest->port;
2573 entry.conn_flags = atomic_read(&dest->conn_flags);
2574 entry.weight = atomic_read(&dest->weight);
2575 entry.u_threshold = dest->u_threshold;
2576 entry.l_threshold = dest->l_threshold;
2577 entry.activeconns = atomic_read(&dest->activeconns);
2578 entry.inactconns = atomic_read(&dest->inactconns);
2579 entry.persistconns = atomic_read(&dest->persistconns);
2580 ip_vs_copy_stats(&entry.stats, &dest->stats);
2581 if (copy_to_user(&uptr->entrytable[count],
2582 &entry, sizeof(entry))) {
/*
 * Report the current TCP/TCP-FIN/UDP timeouts (jiffies -> seconds) into
 * @u.  The struct is zeroed first so compiled-out protocols read as 0.
 */
2594 __ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u)
2596 #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
2597 struct ip_vs_proto_data *pd;
2600 memset(u, 0, sizeof (*u));
2602 #ifdef CONFIG_IP_VS_PROTO_TCP
2603 pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
2604 u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2605 u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2607 #ifdef CONFIG_IP_VS_PROTO_UDP
2608 pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
2610 pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
/* Minimum getsockopt argument length per IP_VS_SO_GET_* command,
 * indexed by command offset from IP_VS_BASE_CTL; checked in
 * do_ip_vs_get_ctl() before copying from userspace. */
2615 #define GET_CMDID(cmd)		(cmd - IP_VS_BASE_CTL)
2616 #define GET_INFO_ARG_LEN	(sizeof(struct ip_vs_getinfo))
2617 #define GET_SERVICES_ARG_LEN	(sizeof(struct ip_vs_get_services))
2618 #define GET_SERVICE_ARG_LEN	(sizeof(struct ip_vs_service_entry))
2619 #define GET_DESTS_ARG_LEN	(sizeof(struct ip_vs_get_dests))
2620 #define GET_TIMEOUT_ARG_LEN	(sizeof(struct ip_vs_timeout_user))
2621 #define GET_DAEMON_ARG_LEN	(sizeof(struct ip_vs_daemon_user) * 2)
2623 static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
2624 [GET_CMDID(IP_VS_SO_GET_VERSION)]	= 64,
2625 [GET_CMDID(IP_VS_SO_GET_INFO)]		= GET_INFO_ARG_LEN,
2626 [GET_CMDID(IP_VS_SO_GET_SERVICES)]	= GET_SERVICES_ARG_LEN,
2627 [GET_CMDID(IP_VS_SO_GET_SERVICE)]	= GET_SERVICE_ARG_LEN,
2628 [GET_CMDID(IP_VS_SO_GET_DESTS)]		= GET_DESTS_ARG_LEN,
2629 [GET_CMDID(IP_VS_SO_GET_TIMEOUT)]	= GET_TIMEOUT_ARG_LEN,
2630 [GET_CMDID(IP_VS_SO_GET_DAEMON)]	= GET_DAEMON_ARG_LEN,
/*
 * getsockopt() entry point for all IP_VS_SO_GET_* commands.
 * Flow mirrors do_ip_vs_set_ctl(): capability + length validation, copy
 * the request in, handle GET_DAEMON under ipvs->sync_mutex, and all
 * other commands under the global __ip_vs_mutex.  Variable-size replies
 * (SERVICES/DESTS) validate the caller-declared element count against
 * *len before copying anything out.
 * NOTE(review): excerpt is truncated -- error paths, ret handling and
 * several braces are missing from this view.
 */
2634 do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2636 unsigned char arg[128];
2638 unsigned int copylen;
2639 struct net *net = sock_net(sk);
2640 struct netns_ipvs *ipvs = net_ipvs(net);
2643 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2646 if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX)
2649 if (*len < get_arglen[GET_CMDID(cmd)]) {
2650 pr_err("get_ctl: len %u < %u\n",
2651 *len, get_arglen[GET_CMDID(cmd)]);
2655 copylen = get_arglen[GET_CMDID(cmd)];
2659 if (copy_from_user(arg, user, copylen) != 0)
2662 * Handle daemons first since it has its own locking
2664 if (cmd == IP_VS_SO_GET_DAEMON) {
2665 struct ip_vs_daemon_user d[2];
2667 memset(&d, 0, sizeof(d));
2668 if (mutex_lock_interruptible(&ipvs->sync_mutex))
2669 return -ERESTARTSYS;
2671 if (ipvs->sync_state & IP_VS_STATE_MASTER) {
2672 d[0].state = IP_VS_STATE_MASTER;
2673 strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn,
2674 sizeof(d[0].mcast_ifn));
2675 d[0].syncid = ipvs->master_syncid;
2677 if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
2678 d[1].state = IP_VS_STATE_BACKUP;
2679 strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn,
2680 sizeof(d[1].mcast_ifn));
2681 d[1].syncid = ipvs->backup_syncid;
2683 if (copy_to_user(user, &d, sizeof(d)) != 0)
2685 mutex_unlock(&ipvs->sync_mutex);
2689 if (mutex_lock_interruptible(&__ip_vs_mutex))
2690 return -ERESTARTSYS;
2693 case IP_VS_SO_GET_VERSION:
2697 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2698 NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
2699 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2703 *len = strlen(buf)+1;
2707 case IP_VS_SO_GET_INFO:
2709 struct ip_vs_getinfo info;
2710 info.version = IP_VS_VERSION_CODE;
2711 info.size = ip_vs_conn_tab_size;
2712 info.num_services = ipvs->num_services;
2713 if (copy_to_user(user, &info, sizeof(info)) != 0)
2718 case IP_VS_SO_GET_SERVICES:
2720 struct ip_vs_get_services *get;
2723 get = (struct ip_vs_get_services *)arg;
2724 size = sizeof(*get) +
2725 sizeof(struct ip_vs_service_entry) * get->num_services;
2727 pr_err("length: %u != %u\n", *len, size);
2731 ret = __ip_vs_get_service_entries(net, get, user);
2735 case IP_VS_SO_GET_SERVICE:
2737 struct ip_vs_service_entry *entry;
2738 struct ip_vs_service *svc;
2739 union nf_inet_addr addr;
2741 entry = (struct ip_vs_service_entry *)arg;
2742 addr.ip = entry->addr;
2744 svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark);
2746 svc = __ip_vs_service_find(net, AF_INET,
2747 entry->protocol, &addr,
2750 ip_vs_copy_service(entry, svc);
2751 if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2758 case IP_VS_SO_GET_DESTS:
2760 struct ip_vs_get_dests *get;
2763 get = (struct ip_vs_get_dests *)arg;
2764 size = sizeof(*get) +
2765 sizeof(struct ip_vs_dest_entry) * get->num_dests;
2767 pr_err("length: %u != %u\n", *len, size);
2771 ret = __ip_vs_get_dest_entries(net, get, user);
2775 case IP_VS_SO_GET_TIMEOUT:
2777 struct ip_vs_timeout_user t;
2779 __ip_vs_get_timeouts(net, &t);
2780 if (copy_to_user(user, &t, sizeof(t)) != 0)
2790 mutex_unlock(&__ip_vs_mutex);
/* Netfilter sockopt registration: routes the IP_VS_BASE_CTL..*_MAX
 * ranges of set/getsockopt to the two handlers above. */
2795 static struct nf_sockopt_ops ip_vs_sockopts = {
2797 .set_optmin = IP_VS_BASE_CTL,
2798 .set_optmax = IP_VS_SO_SET_MAX+1,
2799 .set = do_ip_vs_set_ctl,
2800 .get_optmin = IP_VS_BASE_CTL,
2801 .get_optmax = IP_VS_SO_GET_MAX+1,
2802 .get = do_ip_vs_get_ctl,
2803 .owner = THIS_MODULE,
2807 * Generic Netlink interface
2810 /* IPVS genetlink family */
/* Family definition for the modern IPVS configuration API; netnsok lets
 * requests operate inside any network namespace. */
2811 static struct genl_family ip_vs_genl_family = {
2812 .id = GENL_ID_GENERATE,
2814 .name = IPVS_GENL_NAME,
2815 .version = IPVS_GENL_VERSION,
2816 .maxattr = IPVS_CMD_MAX,
2817 .netnsok = true, /* Make ipvsadm to work on netns */
2820 /* Policy used for first-level command attributes */
/* Top-level attributes: nested service/dest/daemon containers plus the
 * three global timeout values. */
2821 static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = {
2822 [IPVS_CMD_ATTR_SERVICE]		= { .type = NLA_NESTED },
2823 [IPVS_CMD_ATTR_DEST]		= { .type = NLA_NESTED },
2824 [IPVS_CMD_ATTR_DAEMON]		= { .type = NLA_NESTED },
2825 [IPVS_CMD_ATTR_TIMEOUT_TCP]	= { .type = NLA_U32 },
2826 [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]	= { .type = NLA_U32 },
2827 [IPVS_CMD_ATTR_TIMEOUT_UDP]	= { .type = NLA_U32 },
2830 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */
/* Sync-daemon attributes: state, bounded NUL-terminated interface name,
 * and sync id. */
2831 static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = {
2832 [IPVS_DAEMON_ATTR_STATE]	= { .type = NLA_U32 },
2833 [IPVS_DAEMON_ATTR_MCAST_IFN]	= { .type = NLA_NUL_STRING,
2834 .len = IP_VS_IFNAME_MAXLEN },
2835 [IPVS_DAEMON_ATTR_SYNC_ID]	= { .type = NLA_U32 },
2838 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */
/* Service attributes: address is opaque binary sized for either family;
 * scheduler/PE names are bounded NUL-terminated strings. */
2839 static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
2840 [IPVS_SVC_ATTR_AF]		= { .type = NLA_U16 },
2841 [IPVS_SVC_ATTR_PROTOCOL]	= { .type = NLA_U16 },
2842 [IPVS_SVC_ATTR_ADDR]		= { .type = NLA_BINARY,
2843 .len = sizeof(union nf_inet_addr) },
2844 [IPVS_SVC_ATTR_PORT]		= { .type = NLA_U16 },
2845 [IPVS_SVC_ATTR_FWMARK]		= { .type = NLA_U32 },
2846 [IPVS_SVC_ATTR_SCHED_NAME]	= { .type = NLA_NUL_STRING,
2847 .len = IP_VS_SCHEDNAME_MAXLEN },
2848 [IPVS_SVC_ATTR_PE_NAME]		= { .type = NLA_NUL_STRING,
2849 .len = IP_VS_PENAME_MAXLEN },
2850 [IPVS_SVC_ATTR_FLAGS]		= { .type = NLA_BINARY,
2851 .len = sizeof(struct ip_vs_flags) },
2852 [IPVS_SVC_ATTR_TIMEOUT]		= { .type = NLA_U32 },
2853 [IPVS_SVC_ATTR_NETMASK]		= { .type = NLA_U32 },
2854 [IPVS_SVC_ATTR_STATS]		= { .type = NLA_NESTED },
2857 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */
/* Destination attributes; the *_CONNS counters are read-only in replies
 * but still declared so dumps validate. */
2858 static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
2859 [IPVS_DEST_ATTR_ADDR]		= { .type = NLA_BINARY,
2860 .len = sizeof(union nf_inet_addr) },
2861 [IPVS_DEST_ATTR_PORT]		= { .type = NLA_U16 },
2862 [IPVS_DEST_ATTR_FWD_METHOD]	= { .type = NLA_U32 },
2863 [IPVS_DEST_ATTR_WEIGHT]		= { .type = NLA_U32 },
2864 [IPVS_DEST_ATTR_U_THRESH]	= { .type = NLA_U32 },
2865 [IPVS_DEST_ATTR_L_THRESH]	= { .type = NLA_U32 },
2866 [IPVS_DEST_ATTR_ACTIVE_CONNS]	= { .type = NLA_U32 },
2867 [IPVS_DEST_ATTR_INACT_CONNS]	= { .type = NLA_U32 },
2868 [IPVS_DEST_ATTR_PERSIST_CONNS]	= { .type = NLA_U32 },
2869 [IPVS_DEST_ATTR_STATS]		= { .type = NLA_NESTED },
/*
 * Emit one nested stats container (@container_type) holding a snapshot
 * of @stats.  On any nla_put failure the partially-built nest is
 * cancelled so the skb stays consistent.
 * NOTE(review): excerpt is truncated -- the nest-start NULL check and
 * return statements are missing from this view.
 */
2872 static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
2873 struct ip_vs_stats *stats)
2875 struct ip_vs_stats_user ustats;
2876 struct nlattr *nl_stats = nla_nest_start(skb, container_type);
2880 ip_vs_copy_stats(&ustats, stats);
2882 if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, ustats.conns) ||
2883 nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, ustats.inpkts) ||
2884 nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, ustats.outpkts) ||
2885 nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, ustats.inbytes) ||
2886 nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, ustats.outbytes) ||
2887 nla_put_u32(skb, IPVS_STATS_ATTR_CPS, ustats.cps) ||
2888 nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, ustats.inpps) ||
2889 nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, ustats.outpps) ||
2890 nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, ustats.inbps) ||
2891 nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, ustats.outbps))
2892 goto nla_put_failure;
2893 nla_nest_end(skb, nl_stats);
2898 nla_nest_cancel(skb, nl_stats);
/*
 * Emit the nested IPVS_CMD_ATTR_SERVICE container for @svc: address
 * family, fwmark, protocol/addr/port, scheduler (and PE name when set),
 * flags, timeout, netmask and nested stats.  Cancels the nest on any
 * put failure.
 * NOTE(review): excerpt is truncated -- the fwmark-vs-address branch
 * structure and returns are not fully visible here.
 */
2902 static int ip_vs_genl_fill_service(struct sk_buff *skb,
2903 struct ip_vs_service *svc)
2905 struct nlattr *nl_service;
2906 struct ip_vs_flags flags = { .flags = svc->flags,
2909 nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE);
2913 if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af))
2914 goto nla_put_failure;
2916 if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark))
2917 goto nla_put_failure;
2919 if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) ||
2920 nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) ||
2921 nla_put_u16(skb, IPVS_SVC_ATTR_PORT, svc->port))
2922 goto nla_put_failure;
2925 if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name) ||
2927 nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, svc->pe->name)) ||
2928 nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) ||
2929 nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) ||
2930 nla_put_u32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask))
2931 goto nla_put_failure;
2932 if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats))
2933 goto nla_put_failure;
2935 nla_nest_end(skb, nl_service);
2940 nla_nest_cancel(skb, nl_service);
/* Dump helper: wrap one service in a NLM_F_MULTI genetlink message
 * carrying the IPVS_CMD_NEW_SERVICE command, for use from the
 * GET_SERVICE dumpit callback. */
2944 static int ip_vs_genl_dump_service(struct sk_buff *skb,
2945 struct ip_vs_service *svc,
2946 struct netlink_callback *cb)
2950 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
2951 &ip_vs_genl_family, NLM_F_MULTI,
2952 IPVS_CMD_NEW_SERVICE);
2956 if (ip_vs_genl_fill_service(skb, svc) < 0)
2957 goto nla_put_failure;
2959 return genlmsg_end(skb, hdr);
/* roll back the message header if the service did not fit */
2962 genlmsg_cancel(skb, hdr);
/* dumpit callback for IPVS_CMD_GET_SERVICE: walk both hash tables
 * (proto/addr/port keyed, then fwmark keyed), skipping entries already
 * sent (cb->args[0]) and entries belonging to other network namespaces.
 * Runs under __ip_vs_mutex. */
2966 static int ip_vs_genl_dump_services(struct sk_buff *skb,
2967 struct netlink_callback *cb)
2970 int start = cb->args[0];
2971 struct ip_vs_service *svc;
2972 struct net *net = skb_sknet(skb);
2974 mutex_lock(&__ip_vs_mutex);
2975 for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
2976 list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
2977 if (++idx <= start || !net_eq(svc->net, net))
2979 if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
2981 goto nla_put_failure;
/* second pass: firewall-mark keyed services */
2986 for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
2987 list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
2988 if (++idx <= start || !net_eq(svc->net, net))
2990 if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
2992 goto nla_put_failure;
2998 mutex_unlock(&__ip_vs_mutex);
/* Parse a nested IPVS_CMD_ATTR_SERVICE attribute into @usvc and look up
 * any matching existing service (returned via @ret_svc, may be NULL).
 * With @full_entry, also require and parse scheduler name, flags,
 * timeout and netmask; the pe name stays optional.
 * Returns 0 or a negative errno (-EAFNOSUPPORT on unsupported family;
 * other error returns are elided in this excerpt). */
3004 static int ip_vs_genl_parse_service(struct net *net,
3005 struct ip_vs_service_user_kern *usvc,
3006 struct nlattr *nla, int full_entry,
3007 struct ip_vs_service **ret_svc)
3009 struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
3010 struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
3011 struct ip_vs_service *svc;
3013 /* Parse mandatory identifying service fields first */
3015 nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy))
3018 nla_af = attrs[IPVS_SVC_ATTR_AF];
3019 nla_protocol = attrs[IPVS_SVC_ATTR_PROTOCOL];
3020 nla_addr = attrs[IPVS_SVC_ATTR_ADDR];
3021 nla_port = attrs[IPVS_SVC_ATTR_PORT];
3022 nla_fwmark = attrs[IPVS_SVC_ATTR_FWMARK];
/* identity = af + (fwmark | protocol/addr/port) */
3024 if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr))))
3027 memset(usvc, 0, sizeof(*usvc));
3029 usvc->af = nla_get_u16(nla_af);
3030 #ifdef CONFIG_IP_VS_IPV6
3031 if (usvc->af != AF_INET && usvc->af != AF_INET6)
3033 if (usvc->af != AF_INET)
3035 return -EAFNOSUPPORT;
/* fwmark-based service: protocol is a placeholder (TCP) */
3038 usvc->protocol = IPPROTO_TCP;
3039 usvc->fwmark = nla_get_u32(nla_fwmark);
3041 usvc->protocol = nla_get_u16(nla_protocol);
3042 nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr));
3043 usvc->port = nla_get_u16(nla_port);
3048 svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark);
3050 svc = __ip_vs_service_find(net, usvc->af, usvc->protocol,
3051 &usvc->addr, usvc->port);
3054 /* If a full entry was requested, check for the additional fields */
3056 struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout,
3058 struct ip_vs_flags flags;
3060 nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
3061 nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME];
3062 nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
3063 nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
3064 nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];
3066 if (!(nla_sched && nla_flags && nla_timeout && nla_netmask))
3069 nla_memcpy(&flags, nla_flags, sizeof(flags));
3071 /* prefill flags from service if it already exists */
3073 usvc->flags = svc->flags;
3075 /* set new flags from userland */
/* only bits selected by flags.mask are taken from userspace */
3076 usvc->flags = (usvc->flags & ~flags.mask) |
3077 (flags.flags & flags.mask);
/* sched/pe names point into the netlink message — valid only for
 * the duration of this request */
3078 usvc->sched_name = nla_data(nla_sched);
3079 usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL;
3080 usvc->timeout = nla_get_u32(nla_timeout);
3081 usvc->netmask = nla_get_u32(nla_netmask);
/* Look up the service identified by a nested service attribute.
 * Returns the service, NULL if none matches, or ERR_PTR on parse error
 * (callers must handle all three — see ip_vs_genl_dump_dests()). */
3087 static struct ip_vs_service *ip_vs_genl_find_service(struct net *net,
3090 struct ip_vs_service_user_kern usvc;
3091 struct ip_vs_service *svc;
/* full_entry == 0: only the identifying fields are required */
3094 ret = ip_vs_genl_parse_service(net, &usvc, nla, 0, &svc);
3095 return ret ? ERR_PTR(ret) : svc;
/* Emit one real server as a nested IPVS_CMD_ATTR_DEST attribute:
 * address/port, forwarding method (masked out of conn_flags), weight,
 * thresholds, connection counters and the per-dest stats nest. */
3098 static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
3100 struct nlattr *nl_dest;
3102 nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST);
3106 if (nla_put(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr) ||
3107 nla_put_u16(skb, IPVS_DEST_ATTR_PORT, dest->port) ||
3108 nla_put_u32(skb, IPVS_DEST_ATTR_FWD_METHOD,
3109 (atomic_read(&dest->conn_flags) &
3110 IP_VS_CONN_F_FWD_MASK)) ||
3111 nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT,
3112 atomic_read(&dest->weight)) ||
3113 nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) ||
3114 nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) ||
3115 nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
3116 atomic_read(&dest->activeconns)) ||
3117 nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS,
3118 atomic_read(&dest->inactconns)) ||
3119 nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
3120 atomic_read(&dest->persistconns)))
3121 goto nla_put_failure;
3122 if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats))
3123 goto nla_put_failure;
3125 nla_nest_end(skb, nl_dest);
3130 nla_nest_cancel(skb, nl_dest);
/* Dump helper: wrap one destination in a NLM_F_MULTI genetlink message
 * (the reply command constant is elided in this excerpt). */
3134 static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
3135 struct netlink_callback *cb)
3139 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
3140 &ip_vs_genl_family, NLM_F_MULTI,
3145 if (ip_vs_genl_fill_dest(skb, dest) < 0)
3146 goto nla_put_failure;
3148 return genlmsg_end(skb, hdr);
3151 genlmsg_cancel(skb, hdr);
/* dumpit callback for IPVS_CMD_GET_DEST: re-parse the service attribute
 * from the original request on every dump pass, then walk that service's
 * destination list, resuming at cb->args[0]. Runs under __ip_vs_mutex. */
3155 static int ip_vs_genl_dump_dests(struct sk_buff *skb,
3156 struct netlink_callback *cb)
3159 int start = cb->args[0];
3160 struct ip_vs_service *svc;
3161 struct ip_vs_dest *dest;
3162 struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
3163 struct net *net = skb_sknet(skb);
3165 mutex_lock(&__ip_vs_mutex);
3167 /* Try to find the service for which to dump destinations */
3168 if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs,
3169 IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy))
3173 svc = ip_vs_genl_find_service(net, attrs[IPVS_CMD_ATTR_SERVICE]);
/* bail out on parse error (ERR_PTR) or unknown service (NULL) */
3174 if (IS_ERR(svc) || svc == NULL)
3177 /* Dump the destinations */
3178 list_for_each_entry(dest, &svc->destinations, n_list) {
3181 if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) {
3183 goto nla_put_failure;
3191 mutex_unlock(&__ip_vs_mutex);
/* Parse a nested IPVS_CMD_ATTR_DEST attribute into @udest.
 * Address and port are always mandatory; with @full_entry, forwarding
 * method, weight and both thresholds must be present as well.
 * Returns 0 or a negative errno (error returns elided in this excerpt). */
3196 static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
3197 struct nlattr *nla, int full_entry)
3199 struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1];
3200 struct nlattr *nla_addr, *nla_port;
3202 /* Parse mandatory identifying destination fields first */
3204 nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy))
3207 nla_addr = attrs[IPVS_DEST_ATTR_ADDR];
3208 nla_port = attrs[IPVS_DEST_ATTR_PORT];
3210 if (!(nla_addr && nla_port))
3213 memset(udest, 0, sizeof(*udest));
3215 nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr));
3216 udest->port = nla_get_u16(nla_port);
3218 /* If a full entry was requested, check for the additional fields */
3220 struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
3223 nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD];
3224 nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT];
3225 nla_u_thresh = attrs[IPVS_DEST_ATTR_U_THRESH];
3226 nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH];
3228 if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
/* only the forwarding-method bits are accepted from userspace */
3231 udest->conn_flags = nla_get_u32(nla_fwd)
3232 & IP_VS_CONN_F_FWD_MASK;
3233 udest->weight = nla_get_u32(nla_weight);
3234 udest->u_threshold = nla_get_u32(nla_u_thresh);
3235 udest->l_threshold = nla_get_u32(nla_l_thresh);
/* Emit one sync daemon (state, multicast interface name, sync id) as a
 * nested IPVS_CMD_ATTR_DAEMON attribute. */
3241 static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state,
3242 const char *mcast_ifn, __be32 syncid)
3244 struct nlattr *nl_daemon;
3246 nl_daemon = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON);
3250 if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) ||
3251 nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn) ||
3252 nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid))
3253 goto nla_put_failure;
3254 nla_nest_end(skb, nl_daemon);
3259 nla_nest_cancel(skb, nl_daemon);
/* Dump helper: wrap one sync daemon in a NLM_F_MULTI genetlink message
 * carrying the IPVS_CMD_NEW_DAEMON command. */
3263 static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state,
3264 const char *mcast_ifn, __be32 syncid,
3265 struct netlink_callback *cb)
3268 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
3269 &ip_vs_genl_family, NLM_F_MULTI,
3270 IPVS_CMD_NEW_DAEMON);
3274 if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid))
3275 goto nla_put_failure;
3277 return genlmsg_end(skb, hdr);
3280 genlmsg_cancel(skb, hdr);
/* dumpit callback for IPVS_CMD_GET_DAEMON: report the master and/or
 * backup sync daemon if running. cb->args[0]/[1] mark each daemon as
 * already dumped across resumptions. Runs under ipvs->sync_mutex, not
 * __ip_vs_mutex. */
3284 static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
3285 struct netlink_callback *cb)
3287 struct net *net = skb_sknet(skb);
3288 struct netns_ipvs *ipvs = net_ipvs(net);
3290 mutex_lock(&ipvs->sync_mutex);
3291 if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
3292 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
3293 ipvs->master_mcast_ifn,
3294 ipvs->master_syncid, cb) < 0)
3295 goto nla_put_failure;
3300 if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
3301 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
3302 ipvs->backup_mcast_ifn,
3303 ipvs->backup_syncid, cb) < 0)
3304 goto nla_put_failure;
3310 mutex_unlock(&ipvs->sync_mutex);
/* IPVS_CMD_NEW_DAEMON: all three daemon attributes are mandatory;
 * start the sync thread with the requested state/ifname/syncid.
 * (Error return for missing attributes is elided in this excerpt.) */
3315 static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs)
3317 if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
3318 attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
3319 attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
3322 return start_sync_thread(net,
3323 nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),
3324 nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
3325 nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]));
/* IPVS_CMD_DEL_DAEMON: only the state attribute is needed to select
 * which sync thread (master/backup) to stop. */
3328 static int ip_vs_genl_del_daemon(struct net *net, struct nlattr **attrs)
3330 if (!attrs[IPVS_DAEMON_ATTR_STATE])
3333 return stop_sync_thread(net,
3334 nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
/* IPVS_CMD_SET_CONFIG: read current protocol timeouts, overlay any
 * attribute the caller supplied (TCP/TCP_FIN/UDP), and apply. */
3337 static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs)
3339 struct ip_vs_timeout_user t;
3341 __ip_vs_get_timeouts(net, &t);
3343 if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
3344 t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);
3346 if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN])
3348 nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]);
3350 if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
3351 t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);
3353 return ip_vs_set_timeout(net, &t);
/* doit handler for NEW_DAEMON/DEL_DAEMON: validate and unpack the
 * nested daemon attribute, then dispatch to the new/del helper under
 * ipvs->sync_mutex. */
3356 static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info)
3360 struct netns_ipvs *ipvs;
3362 net = skb_sknet(skb);
3363 ipvs = net_ipvs(net);
3364 cmd = info->genlhdr->cmd;
3366 if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) {
3367 struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];
3369 mutex_lock(&ipvs->sync_mutex);
3370 if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
3371 nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX,
3372 info->attrs[IPVS_CMD_ATTR_DAEMON],
3373 ip_vs_daemon_policy)) {
3378 if (cmd == IPVS_CMD_NEW_DAEMON)
3379 ret = ip_vs_genl_new_daemon(net, daemon_attrs);
3381 ret = ip_vs_genl_del_daemon(net, daemon_attrs);
3383 mutex_unlock(&ipvs->sync_mutex);
/* doit handler for all modifying commands except the daemon ones:
 * FLUSH, SET_CONFIG, ZERO, and the service/destination new/set/del
 * commands. Serialized by __ip_vs_mutex. The service attribute is
 * parsed once; a full specification is required only when creating or
 * editing, otherwise the identifying fields suffice. */
3388 static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
3390 struct ip_vs_service *svc = NULL;
3391 struct ip_vs_service_user_kern usvc;
3392 struct ip_vs_dest_user_kern udest;
3394 int need_full_svc = 0, need_full_dest = 0;
3397 net = skb_sknet(skb);
3398 cmd = info->genlhdr->cmd;
3400 mutex_lock(&__ip_vs_mutex);
/* commands that take no service argument are handled up front */
3402 if (cmd == IPVS_CMD_FLUSH) {
3403 ret = ip_vs_flush(net);
3405 } else if (cmd == IPVS_CMD_SET_CONFIG) {
3406 ret = ip_vs_genl_set_config(net, info->attrs);
3408 } else if (cmd == IPVS_CMD_ZERO &&
3409 !info->attrs[IPVS_CMD_ATTR_SERVICE]) {
3410 ret = ip_vs_zero_all(net);
3414 /* All following commands require a service argument, so check if we
3415 * received a valid one. We need a full service specification when
3416 * adding / editing a service. Only identifying members otherwise. */
3417 if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
3420 ret = ip_vs_genl_parse_service(net, &usvc,
3421 info->attrs[IPVS_CMD_ATTR_SERVICE],
3422 need_full_svc, &svc)
3426 /* Unless we're adding a new service, the service must already exist */
3427 if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
3432 /* Destination commands require a valid destination argument. For
3433 * adding / editing a destination, we need a full destination
3435 if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST ||
3436 cmd == IPVS_CMD_DEL_DEST) {
3437 if (cmd != IPVS_CMD_DEL_DEST)
3440 ret = ip_vs_genl_parse_dest(&udest,
3441 info->attrs[IPVS_CMD_ATTR_DEST],
3448 case IPVS_CMD_NEW_SERVICE:
3450 ret = ip_vs_add_service(net, &usvc, &svc);
3454 case IPVS_CMD_SET_SERVICE:
3455 ret = ip_vs_edit_service(svc, &usvc);
3457 case IPVS_CMD_DEL_SERVICE:
3458 ret = ip_vs_del_service(svc);
3459 /* do not use svc, it can be freed */
3461 case IPVS_CMD_NEW_DEST:
3462 ret = ip_vs_add_dest(svc, &udest);
3464 case IPVS_CMD_SET_DEST:
3465 ret = ip_vs_edit_dest(svc, &udest);
3467 case IPVS_CMD_DEL_DEST:
3468 ret = ip_vs_del_dest(svc, &udest);
/* ZERO with a service attribute: reset just that service's counters */
3471 ret = ip_vs_zero_service(svc);
3478 mutex_unlock(&__ip_vs_mutex);
/* doit handler for the non-dump GET commands (GET_SERVICE, GET_INFO,
 * GET_CONFIG): build a single-message reply with the corresponding
 * SET/NEW reply command. Serialized by __ip_vs_mutex. */
3483 static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
3485 struct sk_buff *msg;
3487 int ret, cmd, reply_cmd;
3490 net = skb_sknet(skb);
3491 cmd = info->genlhdr->cmd;
3493 if (cmd == IPVS_CMD_GET_SERVICE)
3494 reply_cmd = IPVS_CMD_NEW_SERVICE;
3495 else if (cmd == IPVS_CMD_GET_INFO)
3496 reply_cmd = IPVS_CMD_SET_INFO;
3497 else if (cmd == IPVS_CMD_GET_CONFIG)
3498 reply_cmd = IPVS_CMD_SET_CONFIG;
3500 pr_err("unknown Generic Netlink command\n");
3504 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
3508 mutex_lock(&__ip_vs_mutex);
3510 reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd);
3512 goto nla_put_failure;
3515 case IPVS_CMD_GET_SERVICE:
3517 struct ip_vs_service *svc;
3519 svc = ip_vs_genl_find_service(net,
3520 info->attrs[IPVS_CMD_ATTR_SERVICE]);
3525 ret = ip_vs_genl_fill_service(msg, svc);
3527 goto nla_put_failure;
3536 case IPVS_CMD_GET_CONFIG:
3538 struct ip_vs_timeout_user t;
3540 __ip_vs_get_timeouts(net, &t);
/* timeout attributes are emitted only for compiled-in protocols */
3541 #ifdef CONFIG_IP_VS_PROTO_TCP
3542 if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP,
3544 nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
3546 goto nla_put_failure;
3548 #ifdef CONFIG_IP_VS_PROTO_UDP
3549 if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout))
3550 goto nla_put_failure;
3556 case IPVS_CMD_GET_INFO:
3557 if (nla_put_u32(msg, IPVS_INFO_ATTR_VERSION,
3558 IP_VS_VERSION_CODE) ||
3559 nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
3560 ip_vs_conn_tab_size))
3561 goto nla_put_failure;
3565 genlmsg_end(msg, reply);
3566 ret = genlmsg_reply(msg, info);
3570 pr_err("not enough space in Netlink message\n");
3576 mutex_unlock(&__ip_vs_mutex);
/* Generic netlink operation table for the IPVS family. Every command
 * requires CAP_NET_ADMIN (GENL_ADMIN_PERM). Modifying commands route
 * through ip_vs_genl_set_cmd / ip_vs_genl_set_daemon; GET commands use
 * either a doit reply or a dumpit walk. GET_DEST, GET_DAEMON,
 * GET_CONFIG, GET_INFO and FLUSH entries intentionally differ in which
 * of .doit/.dumpit/.policy they set. */
3582 static struct genl_ops ip_vs_genl_ops[] __read_mostly = {
3584 .cmd = IPVS_CMD_NEW_SERVICE,
3585 .flags = GENL_ADMIN_PERM,
3586 .policy = ip_vs_cmd_policy,
3587 .doit = ip_vs_genl_set_cmd,
3590 .cmd = IPVS_CMD_SET_SERVICE,
3591 .flags = GENL_ADMIN_PERM,
3592 .policy = ip_vs_cmd_policy,
3593 .doit = ip_vs_genl_set_cmd,
3596 .cmd = IPVS_CMD_DEL_SERVICE,
3597 .flags = GENL_ADMIN_PERM,
3598 .policy = ip_vs_cmd_policy,
3599 .doit = ip_vs_genl_set_cmd,
3602 .cmd = IPVS_CMD_GET_SERVICE,
3603 .flags = GENL_ADMIN_PERM,
3604 .doit = ip_vs_genl_get_cmd,
3605 .dumpit = ip_vs_genl_dump_services,
3606 .policy = ip_vs_cmd_policy,
3609 .cmd = IPVS_CMD_NEW_DEST,
3610 .flags = GENL_ADMIN_PERM,
3611 .policy = ip_vs_cmd_policy,
3612 .doit = ip_vs_genl_set_cmd,
3615 .cmd = IPVS_CMD_SET_DEST,
3616 .flags = GENL_ADMIN_PERM,
3617 .policy = ip_vs_cmd_policy,
3618 .doit = ip_vs_genl_set_cmd,
3621 .cmd = IPVS_CMD_DEL_DEST,
3622 .flags = GENL_ADMIN_PERM,
3623 .policy = ip_vs_cmd_policy,
3624 .doit = ip_vs_genl_set_cmd,
3627 .cmd = IPVS_CMD_GET_DEST,
3628 .flags = GENL_ADMIN_PERM,
3629 .policy = ip_vs_cmd_policy,
3630 .dumpit = ip_vs_genl_dump_dests,
3633 .cmd = IPVS_CMD_NEW_DAEMON,
3634 .flags = GENL_ADMIN_PERM,
3635 .policy = ip_vs_cmd_policy,
3636 .doit = ip_vs_genl_set_daemon,
3639 .cmd = IPVS_CMD_DEL_DAEMON,
3640 .flags = GENL_ADMIN_PERM,
3641 .policy = ip_vs_cmd_policy,
3642 .doit = ip_vs_genl_set_daemon,
3645 .cmd = IPVS_CMD_GET_DAEMON,
3646 .flags = GENL_ADMIN_PERM,
3647 .dumpit = ip_vs_genl_dump_daemons,
3650 .cmd = IPVS_CMD_SET_CONFIG,
3651 .flags = GENL_ADMIN_PERM,
3652 .policy = ip_vs_cmd_policy,
3653 .doit = ip_vs_genl_set_cmd,
3656 .cmd = IPVS_CMD_GET_CONFIG,
3657 .flags = GENL_ADMIN_PERM,
3658 .doit = ip_vs_genl_get_cmd,
3661 .cmd = IPVS_CMD_GET_INFO,
3662 .flags = GENL_ADMIN_PERM,
3663 .doit = ip_vs_genl_get_cmd,
3666 .cmd = IPVS_CMD_ZERO,
3667 .flags = GENL_ADMIN_PERM,
3668 .policy = ip_vs_cmd_policy,
3669 .doit = ip_vs_genl_set_cmd,
3672 .cmd = IPVS_CMD_FLUSH,
3673 .flags = GENL_ADMIN_PERM,
3674 .doit = ip_vs_genl_set_cmd,
/* Register the IPVS generic netlink family together with its full
 * operation table. Returns 0 or a negative errno. */
3678 static int __init ip_vs_genl_register(void)
3680 return genl_register_family_with_ops(&ip_vs_genl_family,
3681 ip_vs_genl_ops, ARRAY_SIZE(ip_vs_genl_ops));
/* Tear down the IPVS generic netlink family (ops go with the family). */
3684 static void ip_vs_genl_unregister(void)
3686 genl_unregister_family(&ip_vs_genl_family);
3689 /* End of Generic Netlink interface definitions */
3692 * per netns init/exit func.
3694 #ifdef CONFIG_SYSCTL
/* Per-netns sysctl setup: duplicate the vs_vars template for non-init
 * namespaces, point each entry's .data at the per-netns field, install
 * defaults, register under net/ipv4/vs, start the stats estimator and
 * schedule the periodic defense work.
 * NOTE(review): the tbl[idx++] assignments must stay in the exact order
 * of the vs_vars template — do not reorder. */
3695 static int __net_init ip_vs_control_net_init_sysctl(struct net *net)
3698 struct netns_ipvs *ipvs = net_ipvs(net);
3699 struct ctl_table *tbl;
3701 atomic_set(&ipvs->dropentry, 0);
3702 spin_lock_init(&ipvs->dropentry_lock);
3703 spin_lock_init(&ipvs->droppacket_lock);
3704 spin_lock_init(&ipvs->securetcp_lock);
3706 if (!net_eq(net, &init_net)) {
3707 tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
3711 /* Don't export sysctls to unprivileged users */
3712 if (net->user_ns != &init_user_ns)
3713 tbl[0].procname = NULL;
3716 /* Initialize sysctl defaults */
3718 ipvs->sysctl_amemthresh = 1024;
3719 tbl[idx++].data = &ipvs->sysctl_amemthresh;
3720 ipvs->sysctl_am_droprate = 10;
3721 tbl[idx++].data = &ipvs->sysctl_am_droprate;
3722 tbl[idx++].data = &ipvs->sysctl_drop_entry;
3723 tbl[idx++].data = &ipvs->sysctl_drop_packet;
3724 #ifdef CONFIG_IP_VS_NFCT
3725 tbl[idx++].data = &ipvs->sysctl_conntrack;
3727 tbl[idx++].data = &ipvs->sysctl_secure_tcp;
3728 ipvs->sysctl_snat_reroute = 1;
3729 tbl[idx++].data = &ipvs->sysctl_snat_reroute;
3730 ipvs->sysctl_sync_ver = 1;
3731 tbl[idx++].data = &ipvs->sysctl_sync_ver;
3732 ipvs->sysctl_sync_ports = 1;
3733 tbl[idx++].data = &ipvs->sysctl_sync_ports;
/* cap the sync queue by available memory, not a fixed constant */
3734 ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32;
3735 tbl[idx++].data = &ipvs->sysctl_sync_qlen_max;
3736 ipvs->sysctl_sync_sock_size = 0;
3737 tbl[idx++].data = &ipvs->sysctl_sync_sock_size;
3738 tbl[idx++].data = &ipvs->sysctl_cache_bypass;
3739 tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
3740 tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;
/* sync_threshold is a two-element array: threshold + period */
3741 ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD;
3742 ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD;
3743 tbl[idx].data = &ipvs->sysctl_sync_threshold;
3744 tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold);
3745 ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD;
3746 tbl[idx++].data = &ipvs->sysctl_sync_refresh_period;
3747 ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3);
3748 tbl[idx++].data = &ipvs->sysctl_sync_retries;
3749 tbl[idx++].data = &ipvs->sysctl_nat_icmp_send;
3750 ipvs->sysctl_pmtu_disc = 1;
3751 tbl[idx++].data = &ipvs->sysctl_pmtu_disc;
3752 tbl[idx++].data = &ipvs->sysctl_backup_only;
3755 ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl);
3756 if (ipvs->sysctl_hdr == NULL) {
/* the duplicated table is only freed for non-init namespaces
 * (the kfree itself is elided in this excerpt) */
3757 if (!net_eq(net, &init_net))
3761 ip_vs_start_estimator(net, &ipvs->tot_stats);
3762 ipvs->sysctl_tbl = tbl;
3763 /* Schedule defense work */
3764 INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
3765 schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
/* Per-netns sysctl teardown: stop the defense work (both the delayed
 * timer and any in-flight execution) before unregistering the table. */
3770 static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net)
3772 struct netns_ipvs *ipvs = net_ipvs(net);
3774 cancel_delayed_work_sync(&ipvs->defense_work);
3775 cancel_work_sync(&ipvs->defense_work.work);
3776 unregister_net_sysctl_table(ipvs->sysctl_hdr);
/* !CONFIG_SYSCTL stubs: sysctl setup/teardown become no-ops. */
3781 static int __net_init ip_vs_control_net_init_sysctl(struct net *net) { return 0; }
3782 static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) { }
/* Netdevice notifier; registered from ip_vs_control_init(). */
3786 static struct notifier_block ip_vs_dst_notifier = {
3787 .notifier_call = ip_vs_dst_event,
/* Per-netns init of the control plane: real-server table, dest trash,
 * service counters, per-cpu total stats, /proc entries and sysctls.
 * On failure the per-cpu stats are freed (error labels elided in this
 * excerpt). */
3790 int __net_init ip_vs_control_net_init(struct net *net)
3793 struct netns_ipvs *ipvs = net_ipvs(net);
3795 rwlock_init(&ipvs->rs_lock);
3797 /* Initialize rs_table */
3798 for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
3799 INIT_LIST_HEAD(&ipvs->rs_table[idx]);
3801 INIT_LIST_HEAD(&ipvs->dest_trash);
3802 atomic_set(&ipvs->ftpsvc_counter, 0);
3803 atomic_set(&ipvs->nullsvc_counter, 0);
3806 ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
3807 if (!ipvs->tot_stats.cpustats)
3810 spin_lock_init(&ipvs->tot_stats.lock);
3812 proc_create("ip_vs", 0, net->proc_net, &ip_vs_info_fops);
3813 proc_create("ip_vs_stats", 0, net->proc_net, &ip_vs_stats_fops);
3814 proc_create("ip_vs_stats_percpu", 0, net->proc_net,
3815 &ip_vs_stats_percpu_fops);
3817 if (ip_vs_control_net_init_sysctl(net))
3823 free_percpu(ipvs->tot_stats.cpustats);
/* Per-netns teardown, mirroring ip_vs_control_net_init() in reverse:
 * trash list, estimator, sysctls, /proc entries, per-cpu stats. */
3827 void __net_exit ip_vs_control_net_cleanup(struct net *net)
3829 struct netns_ipvs *ipvs = net_ipvs(net);
3831 ip_vs_trash_cleanup(net);
3832 ip_vs_stop_estimator(net, &ipvs->tot_stats);
3833 ip_vs_control_net_cleanup_sysctl(net);
3834 remove_proc_entry("ip_vs_stats_percpu", net->proc_net);
3835 remove_proc_entry("ip_vs_stats", net->proc_net);
3836 remove_proc_entry("ip_vs", net->proc_net);
3837 free_percpu(ipvs->tot_stats.cpustats);
/* Register both userspace interfaces: the legacy [gs]etsockopt hooks
 * and the generic netlink family. The sockopt registration is rolled
 * back if netlink registration fails. */
3840 int __init ip_vs_register_nl_ioctl(void)
3844 ret = nf_register_sockopt(&ip_vs_sockopts);
3846 pr_err("cannot register sockopt.\n");
3850 ret = ip_vs_genl_register();
3852 pr_err("cannot register Generic Netlink interface.\n");
/* unwind sockopt registration on genl failure */
3858 nf_unregister_sockopt(&ip_vs_sockopts);
/* Unregister both userspace interfaces (reverse of registration). */
3863 void ip_vs_unregister_nl_ioctl(void)
3865 ip_vs_genl_unregister();
3866 nf_unregister_sockopt(&ip_vs_sockopts);
/* Module-wide control init: set up the global service hash tables and
 * register the netdevice notifier for destination cache invalidation. */
3869 int __init ip_vs_control_init(void)
3876 /* Initialize svc_table, ip_vs_svc_fwm_table, rs_table */
3877 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
3878 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
3879 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
3882 smp_wmb(); /* Do we really need it now ? */
3884 ret = register_netdevice_notifier(&ip_vs_dst_notifier);
3893 void ip_vs_control_cleanup(void)
3896 unregister_netdevice_notifier(&ip_vs_dst_notifier);