ipv4: tcp: remove per net tcp_sock

author Eric Dumazet <edumazet@google.com>
Thu, 19 Jul 2012 07:34:03 +0000 (07:34 +0000)
committer David S. Miller <davem@davemloft.net>
Thu, 19 Jul 2012 17:35:30 +0000 (10:35 -0700)
diff --git a/include/net/ip.h b/include/net/ip.h

index ec5cfde85e9af85d57c875dbb534d981d363a68f..bd5e444a19ce43482f09f695c6e3b36a303302b3 100644 (file)
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -158,7 +158,7 @@ static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg)
         return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0;
  }
  
-void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
+void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
                            __be32 saddr, const struct ip_reply_arg *arg,
                            unsigned int len);
  
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h

index 2e089a99d6038454b5aeb02e3fafdfda98119d7c..d909c7fc3da1d7dd14a80bb1dce8509dcd2d8369 100644 (file)
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -38,7 +38,6 @@ struct netns_ipv4 {
         struct sock             *fibnl;
  
         struct sock             **icmp_sk;
-       struct sock             *tcp_sock;
         struct inet_peer_base   *peers;
         struct tcpm_hash_bucket *tcp_metrics_hash;
         unsigned int            tcp_metrics_hash_mask;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c

index cc52679790b2a6608c08ddbac0e93b34f149592e..c528f841ca4b3165ccde2b36f72f9dcaecd8cb38 100644 (file)
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1463,20 +1463,33 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset,
  
  /*
   *     Generic function to send a packet as reply to another packet.
- *     Used to send TCP resets so far.
+ *     Used to send some TCP resets/acks so far.
   *
- *     Should run single threaded per socket because it uses the sock
- *             structure to pass arguments.
+ *     Use a fake percpu inet socket to avoid false sharing and contention.
   */
-void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
+static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = {
+       .sk = {
+               .__sk_common = {
+                       .skc_refcnt = ATOMIC_INIT(1),
+               },
+               .sk_wmem_alloc  = ATOMIC_INIT(1),
+               .sk_allocation  = GFP_ATOMIC,
+               .sk_flags       = (1UL << SOCK_USE_WRITE_QUEUE),
+       },
+       .pmtudisc = IP_PMTUDISC_WANT,
+};
+
+void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
                            __be32 saddr, const struct ip_reply_arg *arg,
                            unsigned int len)
  {
-       struct inet_sock *inet = inet_sk(sk);
         struct ip_options_data replyopts;
         struct ipcm_cookie ipc;
         struct flowi4 fl4;
         struct rtable *rt = skb_rtable(skb);
+       struct sk_buff *nskb;
+       struct sock *sk;
+       struct inet_sock *inet;
  
         if (ip_options_echo(&replyopts.opt.opt, skb))
                 return;
@@ -1494,38 +1507,39 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
  
         flowi4_init_output(&fl4, arg->bound_dev_if, 0,
                            RT_TOS(arg->tos),
-                          RT_SCOPE_UNIVERSE, sk->sk_protocol,
+                          RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
                            ip_reply_arg_flowi_flags(arg),
                            daddr, saddr,
                            tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
         security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
-       rt = ip_route_output_key(sock_net(sk), &fl4);
+       rt = ip_route_output_key(net, &fl4);
         if (IS_ERR(rt))
                 return;
  
-       /* And let IP do all the hard work.
+       inet = &get_cpu_var(unicast_sock);
  
-          This chunk is not reenterable, hence spinlock.
-          Note that it uses the fact, that this function is called
-          with locally disabled BH and that sk cannot be already spinlocked.
-        */
-       bh_lock_sock(sk);
         inet->tos = arg->tos;
+       sk = &inet->sk;
         sk->sk_priority = skb->priority;
         sk->sk_protocol = ip_hdr(skb)->protocol;
         sk->sk_bound_dev_if = arg->bound_dev_if;
+       sock_net_set(sk, net);
+       __skb_queue_head_init(&sk->sk_write_queue);
+       sk->sk_sndbuf = sysctl_wmem_default;
         ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
                        &ipc, &rt, MSG_DONTWAIT);
-       if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
+       nskb = skb_peek(&sk->sk_write_queue);
+       if (nskb) {
                 if (arg->csumoffset >= 0)
-                       *((__sum16 *)skb_transport_header(skb) +
-                         arg->csumoffset) = csum_fold(csum_add(skb->csum,
+                       *((__sum16 *)skb_transport_header(nskb) +
+                         arg->csumoffset) = csum_fold(csum_add(nskb->csum,
                                                                 arg->csum));
-               skb->ip_summed = CHECKSUM_NONE;
+               nskb->ip_summed = CHECKSUM_NONE;
+               skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb));
                 ip_push_pending_frames(sk, &fl4);
         }
  
-       bh_unlock_sock(sk);
+       put_cpu_var(unicast_sock);
  
         ip_rt_put(rt);
  }
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c

index d9caf5c07aaeb5dacd3ebb34759dc38ac1e6b507..d7d2fa50f07fedd9c0def65f5f49d208bf3195fa 100644 (file)
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -688,7 +688,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
  
         net = dev_net(skb_dst(skb)->dev);
         arg.tos = ip_hdr(skb)->tos;
-       ip_send_unicast_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
+       ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
                               ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
  
         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
@@ -771,7 +771,7 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
         if (oif)
                 arg.bound_dev_if = oif;
         arg.tos = tos;
-       ip_send_unicast_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
+       ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
                               ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
  
         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
@@ -2624,13 +2624,11 @@ EXPORT_SYMBOL(tcp_prot);
  
  static int __net_init tcp_sk_init(struct net *net)
  {
-       return inet_ctl_sock_create(&net->ipv4.tcp_sock,
-                                   PF_INET, SOCK_RAW, IPPROTO_TCP, net);
+       return 0;
  }
  
  static void __net_exit tcp_sk_exit(struct net *net)
  {
-       inet_ctl_sock_destroy(net->ipv4.tcp_sock);
  }
  
  static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
author	Eric Dumazet <edumazet@google.com>
	Thu, 19 Jul 2012 07:34:03 +0000 (07:34 +0000)
committer	David S. Miller <davem@davemloft.net>
	Thu, 19 Jul 2012 17:35:30 +0000 (10:35 -0700)
include/net/ip.h		patch | blob | history
include/net/netns/ipv4.h		patch | blob | history
net/ipv4/ip_output.c		patch | blob | history
net/ipv4/tcp_ipv4.c		patch | blob | history