git.karo-electronics.de Git - karo-tx-linux.git/blobdiff - net/ipv4/tcp_ipv4.c
userns: Print out socket uids in a user namespace aware fashion.
[karo-tx-linux.git] / net / ipv4 / tcp_ipv4.c
index c8d28c433b2b0dc958f7bdebaa77f2b899dfd22e..642be8a4c6a33d2c88fa1de5f68c2579eb7b9920 100644 (file)
@@ -209,22 +209,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
        }
 
        if (tcp_death_row.sysctl_tw_recycle &&
-           !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) {
-               struct inet_peer *peer = rt_get_peer(rt, fl4->daddr);
-               /*
-                * VJ's idea. We save last timestamp seen from
-                * the destination in peer table, when entering state
-                * TIME-WAIT * and initialize rx_opt.ts_recent from it,
-                * when trying new connection.
-                */
-               if (peer) {
-                       inet_peer_refcheck(peer);
-                       if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
-                               tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
-                               tp->rx_opt.ts_recent = peer->tcp_ts;
-                       }
-               }
-       }
+           !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
+               tcp_fetch_timewait_stamp(sk, &rt->dst);
 
        inet->inet_dport = usin->sin_port;
        inet->inet_daddr = daddr;
@@ -289,12 +275,15 @@ failure:
 EXPORT_SYMBOL(tcp_v4_connect);
 
 /*
- * This routine does path mtu discovery as defined in RFC1191.
+ * This routine reacts to ICMP_FRAG_NEEDED MTU indications as defined in RFC 1191.
+ * It can be called through tcp_release_cb() if the socket was owned by the user
+ * at the time tcp_v4_err() was called to handle the ICMP message.
  */
-static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
+static void tcp_v4_mtu_reduced(struct sock *sk)
 {
        struct dst_entry *dst;
        struct inet_sock *inet = inet_sk(sk);
+       u32 mtu = tcp_sk(sk)->mtu_info;
 
        /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
         * send out by Linux are always <576bytes so they should go through
@@ -303,17 +292,10 @@ static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
        if (sk->sk_state == TCP_LISTEN)
                return;
 
-       /* We don't check in the destentry if pmtu discovery is forbidden
-        * on this route. We just assume that no packet_to_big packets
-        * are send back when pmtu discovery is not active.
-        * There is a small race when the user changes this flag in the
-        * route, but I think that's acceptable.
-        */
-       if ((dst = __sk_dst_check(sk, 0)) == NULL)
+       dst = inet_csk_update_pmtu(sk, mtu);
+       if (!dst)
                return;
 
-       dst->ops->update_pmtu(dst, mtu);
-
        /* Something is about to be wrong... Remember soft error
         * for the case, if this connection will not able to recover.
         */
@@ -335,6 +317,14 @@ static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
        } /* else let the usual retransmit timer handle it */
 }
 
+/*
+ * Hand an ICMP redirect to the dst entry cached on the socket.
+ *
+ * @skb: packet passed through to the dst's ->redirect() handler; the
+ *       ICMP_REDIRECT case in tcp_v4_err() passes the received ICMP skb.
+ * @sk:  socket whose cached dst entry is looked up.
+ *
+ * Does nothing when __sk_dst_check() returns no dst for the socket.
+ */
+static void do_redirect(struct sk_buff *skb, struct sock *sk)
+{
+       struct dst_entry *dst = __sk_dst_check(sk, 0);
+
+       if (dst)
+               dst->ops->redirect(dst, sk, skb);
+}
+
 /*
  * This routine is called by the ICMP module when it gets some
  * sort of error condition.  If err < 0 then the socket should
@@ -386,8 +376,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
        bh_lock_sock(sk);
        /* If too many ICMPs get dropped on busy
         * servers this needs to be solved differently.
+        * We do take care of PMTU discovery (RFC1191) special case :
+        * we can receive locally generated ICMP messages while socket is held.
         */
-       if (sock_owned_by_user(sk))
+       if (sock_owned_by_user(sk) &&
+           type != ICMP_DEST_UNREACH &&
+           code != ICMP_FRAG_NEEDED)
                NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
 
        if (sk->sk_state == TCP_CLOSE)
@@ -408,6 +402,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
        }
 
        switch (type) {
+       case ICMP_REDIRECT:
+               do_redirect(icmp_skb, sk);
+               goto out;
        case ICMP_SOURCE_QUENCH:
                /* Just silently ignore these. */
                goto out;
@@ -419,8 +416,11 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
                        goto out;
 
                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
+                       tp->mtu_info = info;
                        if (!sock_owned_by_user(sk))
-                               do_pmtu_discovery(sk, iph, info);
+                               tcp_v4_mtu_reduced(sk);
+                       else
+                               set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags);
                        goto out;
                }
 
@@ -698,8 +698,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 
        net = dev_net(skb_dst(skb)->dev);
        arg.tos = ip_hdr(skb)->tos;
-       ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
-                     &arg, arg.iov[0].iov_len);
+       ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
+                             ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
 
        TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
        TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
@@ -781,8 +781,8 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
        if (oif)
                arg.bound_dev_if = oif;
        arg.tos = tos;
-       ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
-                     &arg, arg.iov[0].iov_len);
+       ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
+                             ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
 
        TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 }
@@ -825,7 +825,8 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
                              struct request_sock *req,
                              struct request_values *rvp,
-                             u16 queue_mapping)
+                             u16 queue_mapping,
+                             bool nocache)
 {
        const struct inet_request_sock *ireq = inet_rsk(req);
        struct flowi4 fl4;
@@ -848,7 +849,6 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
                err = net_xmit_eval(err);
        }
 
-       dst_release(dst);
        return err;
 }
 
@@ -856,7 +856,7 @@ static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
                              struct request_values *rvp)
 {
        TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
-       return tcp_v4_send_synack(sk, NULL, req, rvp, 0);
+       return tcp_v4_send_synack(sk, NULL, req, rvp, 0, false);
 }
 
 /*
@@ -1317,7 +1317,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
        tcp_clear_options(&tmp_opt);
        tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
        tmp_opt.user_mss  = tp->rx_opt.user_mss;
-       tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
+       tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
 
        if (tmp_opt.cookie_plus > 0 &&
            tmp_opt.saw_tstamp &&
@@ -1375,7 +1375,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
                isn = cookie_v4_init_sequence(sk, skb, &req->mss);
                req->cookie_ts = tmp_opt.tstamp_ok;
        } else if (!isn) {
-               struct inet_peer *peer = NULL;
                struct flowi4 fl4;
 
                /* VJ's idea. We save last timestamp seen
@@ -1390,12 +1389,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
                if (tmp_opt.saw_tstamp &&
                    tcp_death_row.sysctl_tw_recycle &&
                    (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
-                   fl4.daddr == saddr &&
-                   (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
-                       inet_peer_refcheck(peer);
-                       if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
-                           (s32)(peer->tcp_ts - req->ts_recent) >
-                                                       TCP_PAWS_WINDOW) {
+                   fl4.daddr == saddr) {
+                       if (!tcp_peer_is_proven(req, dst, true)) {
                                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
                                goto drop_and_release;
                        }
@@ -1404,8 +1399,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
                else if (!sysctl_tcp_syncookies &&
                         (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
                          (sysctl_max_syn_backlog >> 2)) &&
-                        (!peer || !peer->tcp_ts_stamp) &&
-                        (!dst || !dst_metric(dst, RTAX_RTT))) {
+                        !tcp_peer_is_proven(req, dst, false)) {
                        /* Without syncookies last quarter of
                         * backlog is filled with destinations,
                         * proven to be alive.
@@ -1425,7 +1419,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 
        if (tcp_v4_send_synack(sk, dst, req,
                               (struct request_values *)&tmp_ext,
-                              skb_get_queue_mapping(skb)) ||
+                              skb_get_queue_mapping(skb),
+                              want_cookie) ||
            want_cookie)
                goto drop_and_free;
 
@@ -1622,7 +1617,19 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 #endif
 
        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
+               struct dst_entry *dst = sk->sk_rx_dst;
+
                sock_rps_save_rxhash(sk, skb);
+               if (dst) {
+                       if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
+                           dst->ops->check(dst, 0) == NULL) {
+                               dst_release(dst);
+                               sk->sk_rx_dst = NULL;
+                       }
+               }
+               if (unlikely(sk->sk_rx_dst == NULL))
+                       inet_sk_rx_dst_set(sk, skb);
+
                if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
                        rsk = sk;
                        goto reset;
@@ -1672,6 +1679,44 @@ csum_err:
 }
 EXPORT_SYMBOL(tcp_v4_do_rcv);
 
+/*
+ * Try to attach an already-established TCP socket, and that socket's
+ * cached receive dst, to an incoming skb.
+ *
+ * @skb: received packet; skb->dev, skb->pkt_type and skb->skb_iif are read.
+ *
+ * The function returns without touching the skb when the packet is not
+ * PACKET_HOST, when the IP + minimal TCP header cannot be pulled, when the
+ * TCP data offset is below the minimal header size, or when no socket is
+ * found.  On a hit, skb->sk and skb->destructor are set; the dst is only
+ * attached when the socket is not in TCP_TIME_WAIT, the cached dst passes
+ * dst_check(), and the packet arrived on the interface recorded in
+ * rx_dst_ifindex.
+ *
+ * sk->sk_rx_dst is filled in on the established fast path of
+ * tcp_v4_do_rcv() through inet_sk_rx_dst_set().
+ */
+void tcp_v4_early_demux(struct sk_buff *skb)
+{
+       struct net *net = dev_net(skb->dev);
+       const struct iphdr *iph;
+       const struct tcphdr *th;
+       struct sock *sk;
+
+       /* Only packets addressed to this host are considered. */
+       if (skb->pkt_type != PACKET_HOST)
+               return;
+
+       /* Need the full IP header plus a minimal TCP header to look at. */
+       if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr)))
+               return;
+
+       /* Header pointers are taken only after the pull above. */
+       iph = ip_hdr(skb);
+       th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb));
+
+       /* doff is compared in 4-byte units: reject undersized TCP headers. */
+       if (th->doff < sizeof(struct tcphdr) / 4)
+               return;
+
+       /*
+        * The destination port is converted with ntohs(); the source port
+        * and both addresses are passed exactly as found in the headers.
+        */
+       sk = __inet_lookup_established(net, &tcp_hashinfo,
+                                      iph->saddr, th->source,
+                                      iph->daddr, ntohs(th->dest),
+                                      skb->skb_iif);
+       if (sk) {
+               /*
+                * NOTE(review): presumably the lookup returns a referenced
+                * socket and sock_edemux drops that reference when the skb
+                * is freed - confirm against their definitions.
+                */
+               skb->sk = sk;
+               skb->destructor = sock_edemux;
+               /*
+                * NOTE(review): TIME_WAIT entries are skipped here,
+                * presumably because they are not full sockets carrying
+                * sk_rx_dst - confirm.
+                */
+               if (sk->sk_state != TCP_TIME_WAIT) {
+                       struct dst_entry *dst = sk->sk_rx_dst;
+
+                       /* dst_check() may return NULL for a stale entry. */
+                       if (dst)
+                               dst = dst_check(dst, 0);
+                       /* Reuse the dst only for the interface it was cached on. */
+                       if (dst &&
+                           inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
+                               skb_dst_set_noref(skb, dst);
+               }
+       }
+}
+
 /*
  *     From tcp_input.c
  */
@@ -1821,40 +1866,10 @@ do_time_wait:
        goto discard_it;
 }
 
-struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
-{
-       struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
-       struct inet_sock *inet = inet_sk(sk);
-       struct inet_peer *peer;
-
-       if (!rt ||
-           inet->cork.fl.u.ip4.daddr != inet->inet_daddr) {
-               peer = inet_getpeer_v4(inet->inet_daddr, 1);
-               *release_it = true;
-       } else {
-               if (!rt->peer)
-                       rt_bind_peer(rt, inet->inet_daddr, 1);
-               peer = rt->peer;
-               *release_it = false;
-       }
-
-       return peer;
-}
-EXPORT_SYMBOL(tcp_v4_get_peer);
-
-void *tcp_v4_tw_get_peer(struct sock *sk)
-{
-       const struct inet_timewait_sock *tw = inet_twsk(sk);
-
-       return inet_getpeer_v4(tw->tw_daddr, 1);
-}
-EXPORT_SYMBOL(tcp_v4_tw_get_peer);
-
 static struct timewait_sock_ops tcp_timewait_sock_ops = {
        .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
        .twsk_unique    = tcp_twsk_unique,
        .twsk_destructor= tcp_twsk_destructor,
-       .twsk_getpeer   = tcp_v4_tw_get_peer,
 };
 
 const struct inet_connection_sock_af_ops ipv4_specific = {
@@ -1863,7 +1878,6 @@ const struct inet_connection_sock_af_ops ipv4_specific = {
        .rebuild_header    = inet_sk_rebuild_header,
        .conn_request      = tcp_v4_conn_request,
        .syn_recv_sock     = tcp_v4_syn_recv_sock,
-       .get_peer          = tcp_v4_get_peer,
        .net_header_len    = sizeof(struct iphdr),
        .setsockopt        = ip_setsockopt,
        .getsockopt        = ip_getsockopt,
@@ -1953,6 +1967,9 @@ void tcp_v4_destroy_sock(struct sock *sk)
                tp->cookie_values = NULL;
        }
 
+       /* If socket is aborted during connect operation */
+       tcp_free_fastopen_req(tp);
+
        sk_sockets_allocated_dec(sk);
        sock_release_memcg(sk);
 }
@@ -2365,7 +2382,7 @@ void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
 EXPORT_SYMBOL(tcp_proc_unregister);
 
 static void get_openreq4(const struct sock *sk, const struct request_sock *req,
-                        struct seq_file *f, int i, int uid, int *len)
+                        struct seq_file *f, int i, kuid_t uid, int *len)
 {
        const struct inet_request_sock *ireq = inet_rsk(req);
        int ttd = req->expires - jiffies;
@@ -2382,7 +2399,7 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req,
                1,    /* timers active (only the expire timer) */
                jiffies_to_clock_t(ttd),
                req->retrans,
-               uid,
+               from_kuid_munged(seq_user_ns(f), uid),
                0,  /* non standard timer */
                0, /* open_requests have no inode */
                atomic_read(&sk->sk_refcnt),
@@ -2433,7 +2450,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
                timer_active,
                jiffies_to_clock_t(timer_expires - jiffies),
                icsk->icsk_retransmits,
-               sock_i_uid(sk),
+               from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
                icsk->icsk_probes_out,
                sock_i_ino(sk),
                atomic_read(&sk->sk_refcnt), sk,
@@ -2593,6 +2610,8 @@ struct proto tcp_prot = {
        .sendmsg                = tcp_sendmsg,
        .sendpage               = tcp_sendpage,
        .backlog_rcv            = tcp_v4_do_rcv,
+       .release_cb             = tcp_release_cb,
+       .mtu_reduced            = tcp_v4_mtu_reduced,
        .hash                   = inet_hash,
        .unhash                 = inet_unhash,
        .get_port               = inet_csk_get_port,
@@ -2614,7 +2633,7 @@ struct proto tcp_prot = {
        .compat_setsockopt      = compat_tcp_setsockopt,
        .compat_getsockopt      = compat_tcp_getsockopt,
 #endif
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+#ifdef CONFIG_MEMCG_KMEM
        .init_cgroup            = tcp_init_cgroup,
        .destroy_cgroup         = tcp_destroy_cgroup,
        .proto_cgroup           = tcp_proto_cgroup,
@@ -2624,13 +2643,11 @@ EXPORT_SYMBOL(tcp_prot);
 
 static int __net_init tcp_sk_init(struct net *net)
 {
-       return inet_ctl_sock_create(&net->ipv4.tcp_sock,
-                                   PF_INET, SOCK_RAW, IPPROTO_TCP, net);
+       /*
+        * No control socket is created any more: the reset and ACK reply
+        * paths in this change call ip_send_unicast_reply(net, ...) instead
+        * of using net->ipv4.tcp_sock, so this always reports success.
+        */
+       return 0;
 }
 
 static void __net_exit tcp_sk_exit(struct net *net)
 {
-       inet_ctl_sock_destroy(net->ipv4.tcp_sock);
+       /* Nothing to destroy: tcp_sk_init() no longer creates a control socket. */
 }
 
 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)