git.karo-electronics.de Git - mv-sheeva.git/blobdiff - net/ipv4/tcp_input.c
Merge branch 'master' into tk71
[mv-sheeva.git] / net / ipv4 / tcp_input.c
index b55f60f6fcbe934c1364ee3aece309dff4d1be4b..65f6c04062453aefdffa2317781921990616e796 100644 (file)
@@ -182,7 +182,7 @@ static void tcp_incr_quickack(struct sock *sk)
                icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
 }
 
-void tcp_enter_quickack_mode(struct sock *sk)
+static void tcp_enter_quickack_mode(struct sock *sk)
 {
        struct inet_connection_sock *icsk = inet_csk(sk);
        tcp_incr_quickack(sk);
@@ -259,8 +259,11 @@ static void tcp_fixup_sndbuf(struct sock *sk)
        int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 +
                     sizeof(struct sk_buff);
 
-       if (sk->sk_sndbuf < 3 * sndmem)
-               sk->sk_sndbuf = min(3 * sndmem, sysctl_tcp_wmem[2]);
+       if (sk->sk_sndbuf < 3 * sndmem) {
+               sk->sk_sndbuf = 3 * sndmem;
+               if (sk->sk_sndbuf > sysctl_tcp_wmem[2])
+                       sk->sk_sndbuf = sysctl_tcp_wmem[2];
+       }
 }
 
 /* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
@@ -396,7 +399,7 @@ static void tcp_clamp_window(struct sock *sk)
        if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
            !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
            !tcp_memory_pressure &&
-           atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
+           atomic_long_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
                sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
                                    sysctl_tcp_rmem[2]);
        }
@@ -428,10 +431,10 @@ EXPORT_SYMBOL(tcp_initialize_rcv_mss);
  *
  * The algorithm for RTT estimation w/o timestamps is based on
  * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
- * <http://www.lanl.gov/radiant/website/pubs/drs/lacsi2001.ps>
+ * <http://public.lanl.gov/radiant/pubs.html#DRS>
  *
  * More detail on this code can be found at
- * <http://www.psc.edu/~jheffner/senior_thesis.ps>,
+ * <http://staff.psc.edu/jheffner/>,
  * though this reference is out of date.  A new paper
  * is pending.
  */
@@ -731,7 +734,7 @@ void tcp_update_metrics(struct sock *sk)
                         * Reset our results.
                         */
                        if (!(dst_metric_locked(dst, RTAX_RTT)))
-                               dst->metrics[RTAX_RTT - 1] = 0;
+                               dst_metric_set(dst, RTAX_RTT, 0);
                        return;
                }
 
@@ -773,57 +776,48 @@ void tcp_update_metrics(struct sock *sk)
                        if (dst_metric(dst, RTAX_SSTHRESH) &&
                            !dst_metric_locked(dst, RTAX_SSTHRESH) &&
                            (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))
-                               dst->metrics[RTAX_SSTHRESH-1] = tp->snd_cwnd >> 1;
+                               dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1);
                        if (!dst_metric_locked(dst, RTAX_CWND) &&
                            tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
-                               dst->metrics[RTAX_CWND - 1] = tp->snd_cwnd;
+                               dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd);
                } else if (tp->snd_cwnd > tp->snd_ssthresh &&
                           icsk->icsk_ca_state == TCP_CA_Open) {
                        /* Cong. avoidance phase, cwnd is reliable. */
                        if (!dst_metric_locked(dst, RTAX_SSTHRESH))
-                               dst->metrics[RTAX_SSTHRESH-1] =
-                                       max(tp->snd_cwnd >> 1, tp->snd_ssthresh);
+                               dst_metric_set(dst, RTAX_SSTHRESH,
+                                              max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
                        if (!dst_metric_locked(dst, RTAX_CWND))
-                               dst->metrics[RTAX_CWND-1] = (dst_metric(dst, RTAX_CWND) + tp->snd_cwnd) >> 1;
+                               dst_metric_set(dst, RTAX_CWND,
+                                              (dst_metric(dst, RTAX_CWND) +
+                                               tp->snd_cwnd) >> 1);
                } else {
                        /* Else slow start did not finish, cwnd is non-sense,
                           ssthresh may be also invalid.
                         */
                        if (!dst_metric_locked(dst, RTAX_CWND))
-                               dst->metrics[RTAX_CWND-1] = (dst_metric(dst, RTAX_CWND) + tp->snd_ssthresh) >> 1;
+                               dst_metric_set(dst, RTAX_CWND,
+                                              (dst_metric(dst, RTAX_CWND) +
+                                               tp->snd_ssthresh) >> 1);
                        if (dst_metric(dst, RTAX_SSTHRESH) &&
                            !dst_metric_locked(dst, RTAX_SSTHRESH) &&
                            tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH))
-                               dst->metrics[RTAX_SSTHRESH-1] = tp->snd_ssthresh;
+                               dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh);
                }
 
                if (!dst_metric_locked(dst, RTAX_REORDERING)) {
                        if (dst_metric(dst, RTAX_REORDERING) < tp->reordering &&
                            tp->reordering != sysctl_tcp_reordering)
-                               dst->metrics[RTAX_REORDERING-1] = tp->reordering;
+                               dst_metric_set(dst, RTAX_REORDERING, tp->reordering);
                }
        }
 }
 
-/* Numbers are taken from RFC3390.
- *
- * John Heffner states:
- *
- *     The RFC specifies a window of no more than 4380 bytes
- *     unless 2*MSS > 4380.  Reading the pseudocode in the RFC
- *     is a bit misleading because they use a clamp at 4380 bytes
- *     rather than use a multiplier in the relevant range.
- */
 __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
 {
        __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
 
-       if (!cwnd) {
-               if (tp->mss_cache > 1460)
-                       cwnd = 2;
-               else
-                       cwnd = (tp->mss_cache > 1095) ? 3 : 4;
-       }
+       if (!cwnd)
+               cwnd = rfc3390_bytes_to_packets(tp->mss_cache);
        return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
 }
 
@@ -922,25 +916,20 @@ static void tcp_init_metrics(struct sock *sk)
                tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
        }
        tcp_set_rto(sk);
-       if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp)
-               goto reset;
-
-cwnd:
-       tp->snd_cwnd = tcp_init_cwnd(tp, dst);
-       tp->snd_cwnd_stamp = tcp_time_stamp;
-       return;
-
+       if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) {
 reset:
-       /* Play conservative. If timestamps are not
-        * supported, TCP will fail to recalculate correct
-        * rtt, if initial rto is too small. FORGET ALL AND RESET!
-        */
-       if (!tp->rx_opt.saw_tstamp && tp->srtt) {
-               tp->srtt = 0;
-               tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT;
-               inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
+               /* Play conservative. If timestamps are not
+                * supported, TCP will fail to recalculate correct
+                * rtt, if initial rto is too small. FORGET ALL AND RESET!
+                */
+               if (!tp->rx_opt.saw_tstamp && tp->srtt) {
+                       tp->srtt = 0;
+                       tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT;
+                       inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
+               }
        }
-       goto cwnd;
+       tp->snd_cwnd = tcp_init_cwnd(tp, dst);
+       tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
 static void tcp_update_reordering(struct sock *sk, const int metric,
@@ -1233,7 +1222,7 @@ static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb,
        }
 
        /* D-SACK for already forgotten data... Do dumb counting. */
-       if (dup_sack &&
+       if (dup_sack && tp->undo_marker && tp->undo_retrans &&
            !after(end_seq_0, prior_snd_una) &&
            after(end_seq_0, tp->undo_marker))
                tp->undo_retrans--;
@@ -1310,7 +1299,8 @@ static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
 
        /* Account D-SACK for retransmitted packet. */
        if (dup_sack && (sacked & TCPCB_RETRANS)) {
-               if (after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker))
+               if (tp->undo_marker && tp->undo_retrans &&
+                   after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker))
                        tp->undo_retrans--;
                if (sacked & TCPCB_SACKED_ACKED)
                        state->reord = min(fack_count, state->reord);
@@ -2314,7 +2304,7 @@ static inline int tcp_dupack_heuristics(struct tcp_sock *tp)
 
 static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb)
 {
-       return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto);
+       return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto;
 }
 
 static inline int tcp_head_timedout(struct sock *sk)
@@ -2508,7 +2498,7 @@ static void tcp_timeout_skbs(struct sock *sk)
 /* Mark head of queue up as lost. With RFC3517 SACK, the packets is
  * is against sacked "cnt", otherwise it's against facked "cnt"
  */
-static void tcp_mark_head_lost(struct sock *sk, int packets)
+static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
 {
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb;
@@ -2516,13 +2506,13 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
        int err;
        unsigned int mss;
 
-       if (packets == 0)
-               return;
-
        WARN_ON(packets > tp->packets_out);
        if (tp->lost_skb_hint) {
                skb = tp->lost_skb_hint;
                cnt = tp->lost_cnt_hint;
+               /* Head already handled? */
+               if (mark_head && skb != tcp_write_queue_head(sk))
+                       return;
        } else {
                skb = tcp_write_queue_head(sk);
                cnt = 0;
@@ -2557,6 +2547,9 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
                }
 
                tcp_skb_mark_lost(tp, skb);
+
+               if (mark_head)
+                       break;
        }
        tcp_verify_left_out(tp);
 }
@@ -2568,17 +2561,18 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
        struct tcp_sock *tp = tcp_sk(sk);
 
        if (tcp_is_reno(tp)) {
-               tcp_mark_head_lost(sk, 1);
+               tcp_mark_head_lost(sk, 1, 1);
        } else if (tcp_is_fack(tp)) {
                int lost = tp->fackets_out - tp->reordering;
                if (lost <= 0)
                        lost = 1;
-               tcp_mark_head_lost(sk, lost);
+               tcp_mark_head_lost(sk, lost, 0);
        } else {
                int sacked_upto = tp->sacked_out - tp->reordering;
-               if (sacked_upto < fast_rexmit)
-                       sacked_upto = fast_rexmit;
-               tcp_mark_head_lost(sk, sacked_upto);
+               if (sacked_upto >= 0)
+                       tcp_mark_head_lost(sk, sacked_upto, 0);
+               else if (fast_rexmit)
+                       tcp_mark_head_lost(sk, 1, 1);
        }
 
        tcp_timeout_skbs(sk);
@@ -2887,7 +2881,7 @@ static void tcp_mtup_probe_success(struct sock *sk)
                       icsk->icsk_mtup.probe_size;
        tp->snd_cwnd_cnt = 0;
        tp->snd_cwnd_stamp = tcp_time_stamp;
-       tp->rcv_ssthresh = tcp_current_ssthresh(sk);
+       tp->snd_ssthresh = tcp_current_ssthresh(sk);
 
        icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
        icsk->icsk_mtup.probe_size = 0;
@@ -2984,7 +2978,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
            before(tp->snd_una, tp->high_seq) &&
            icsk->icsk_ca_state != TCP_CA_Open &&
            tp->fackets_out > tp->reordering) {
-               tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering);
+               tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering, 0);
                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS);
        }
 
@@ -3412,8 +3406,8 @@ static void tcp_ack_probe(struct sock *sk)
 
 static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag)
 {
-       return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
-               inet_csk(sk)->icsk_ca_state != TCP_CA_Open);
+       return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
+               inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
 }
 
 static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
@@ -3430,9 +3424,9 @@ static inline int tcp_may_update_window(const struct tcp_sock *tp,
                                        const u32 ack, const u32 ack_seq,
                                        const u32 nwin)
 {
-       return (after(ack, tp->snd_una) ||
+       return  after(ack, tp->snd_una) ||
                after(ack_seq, tp->snd_wl1) ||
-               (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd));
+               (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
 }
 
 /* Update our send window.
@@ -4406,7 +4400,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
                        if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) {
                                tp->ucopy.len -= chunk;
                                tp->copied_seq += chunk;
-                               eaten = (chunk == skb->len && !th->fin);
+                               eaten = (chunk == skb->len);
                                tcp_rcv_space_adjust(sk);
                        }
                        local_bh_disable();
@@ -4870,7 +4864,7 @@ static int tcp_should_expand_sndbuf(struct sock *sk)
                return 0;
 
        /* If we are under soft global TCP memory pressure, do not expand.  */
-       if (atomic_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
+       if (atomic_long_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
                return 0;
 
        /* If we filled the congestion window, do not expand.  */