Here is the 2.4 version with some cleanups, converted to 2.6:
 - use tcp_ prefix (dave)
 - get rid of rwlock, not needed (dave)
 - do some hand optimization of the inlines
 - don't make init inline
 - get rid of extra whitespace
 - eliminate accessor for mss_cache

I only did very limited testing, but it should behave the same as the
original 2.4 version. (A small user-space sketch of the estimator
arithmetic is appended after the patch for illustration.)

diff -Nru a/include/linux/sysctl.h b/include/linux/sysctl.h
--- a/include/linux/sysctl.h	Fri Jan 30 16:40:41 2004
+++ b/include/linux/sysctl.h	Fri Jan 30 16:40:41 2004
@@ -311,6 +311,7 @@
 	NET_TCP_FRTO=92,
 	NET_TCP_LOW_LATENCY=93,
 	NET_IPV4_IPFRAG_SECRET_INTERVAL=94,
+	NET_TCP_WESTWOOD=95,
 };
 
 enum {
diff -Nru a/include/linux/tcp.h b/include/linux/tcp.h
--- a/include/linux/tcp.h	Fri Jan 30 16:40:41 2004
+++ b/include/linux/tcp.h	Fri Jan 30 16:40:41 2004
@@ -374,6 +374,20 @@
 	__u32	frto_highmark;	/* snd_nxt when RTO occurred */
 
 	unsigned long last_synq_overflow;
+
+/* TCP Westwood structure */
+	struct {
+		__u32	bw_sample;	/* bandwidth sample */
+		__u32	bw_ns_est;	/* first bandwidth estimation..not too smoothed 8) */
+		__u32	bw_est;		/* bandwidth estimate */
+		__u32	rtt_win_sx;	/* here starts a new evaluation... */
+		__u32	bk;
+		__u32	snd_una;	/* used for evaluating the number of acked bytes */
+		__u32	cumul_ack;
+		__u32	accounted;
+		__u32	rtt;
+		__u32	rtt_min;	/* minimum observed RTT */
+	} westwood;
 };
 
 /* WARNING: don't change the layout of the members in tcp_sock! */
diff -Nru a/include/net/tcp.h b/include/net/tcp.h
--- a/include/net/tcp.h	Fri Jan 30 16:40:41 2004
+++ b/include/net/tcp.h	Fri Jan 30 16:40:41 2004
@@ -579,6 +579,7 @@
 extern int sysctl_tcp_tw_reuse;
 extern int sysctl_tcp_frto;
 extern int sysctl_tcp_low_latency;
+extern int sysctl_tcp_westwood;
 
 extern atomic_t tcp_memory_allocated;
 extern atomic_t tcp_sockets_allocated;
@@ -2019,4 +2020,67 @@
 extern int tcp_proc_register(struct tcp_seq_afinfo *afinfo);
 extern void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo);
 
+/* TCP Westwood functions and constants */
+
+#define TCP_WESTWOOD_INIT_RTT	(20*HZ)	/* maybe too conservative?! */
+#define TCP_WESTWOOD_RTT_MIN	(HZ/20)	/* 50ms */
+
+static inline void tcp_westwood_update_rtt(struct tcp_opt *tp, __u32 rtt_seq)
+{
+	if (sysctl_tcp_westwood)
+		tp->westwood.rtt = rtt_seq;
+}
+
+void __tcp_westwood_fast_bw(struct sock *, struct sk_buff *);
+void __tcp_westwood_slow_bw(struct sock *, struct sk_buff *);
+
+static inline void tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb)
+{
+	if (sysctl_tcp_westwood)
+		__tcp_westwood_fast_bw(sk, skb);
+}
+
+static inline void tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb)
+{
+	if (sysctl_tcp_westwood)
+		__tcp_westwood_slow_bw(sk, skb);
+}
+
+static inline __u32 __tcp_westwood_bw_rttmin(const struct tcp_opt *tp)
+{
+	return max((tp->westwood.bw_est) * (tp->westwood.rtt_min) /
+		   (__u32) (tp->mss_cache),
+		   2U);
+}
+
+static inline __u32 tcp_westwood_bw_rttmin(const struct tcp_opt *tp)
+{
+	return sysctl_tcp_westwood ?
+		__tcp_westwood_bw_rttmin(tp) : 0;
+}
+
+static inline int tcp_westwood_ssthresh(struct tcp_opt *tp)
+{
+	__u32 ssthresh = 0;
+
+	if (sysctl_tcp_westwood) {
+		ssthresh = __tcp_westwood_bw_rttmin(tp);
+		if (ssthresh)
+			tp->snd_ssthresh = ssthresh;
+	}
+
+	return (ssthresh != 0);
+}
+
+static inline int tcp_westwood_cwnd(struct tcp_opt *tp)
+{
+	__u32 cwnd = 0;
+
+	if (sysctl_tcp_westwood) {
+		cwnd = __tcp_westwood_bw_rttmin(tp);
+		if (cwnd)
+			tp->snd_cwnd = cwnd;
+	}
+
+	return (cwnd != 0);
+}
 #endif	/* _TCP_H */
diff -Nru a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
--- a/net/ipv4/sysctl_net_ipv4.c	Fri Jan 30 16:40:41 2004
+++ b/net/ipv4/sysctl_net_ipv4.c	Fri Jan 30 16:40:41 2004
@@ -584,6 +584,14 @@
 		.proc_handler	= &proc_dointvec_jiffies,
 		.strategy	= &sysctl_jiffies
 	},
+	{
+		.ctl_name	= NET_TCP_WESTWOOD,
+		.procname	= "tcp_westwood",
+		.data		= &sysctl_tcp_westwood,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 	{ .ctl_name = 0 }
 };
diff -Nru a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
--- a/net/ipv4/tcp_input.c	Fri Jan 30 16:40:41 2004
+++ b/net/ipv4/tcp_input.c	Fri Jan 30 16:40:41 2004
@@ -61,6 +61,7 @@
  *		Panu Kuhlberg:	Experimental audit of TCP (re)transmission
  *				engine. Lots of bugs are found.
  *		Pasi Sarolahti:	F-RTO for dealing with spurious RTOs
+ *		Angelo Dell'Aera:	TCP Westwood+ support
  */
 
 #include <linux/config.h>
@@ -89,6 +90,7 @@
 int sysctl_tcp_rfc1337;
 int sysctl_tcp_max_orphans = NR_FILE;
 int sysctl_tcp_frto;
+int sysctl_tcp_westwood;
 
 #define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
 #define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
@@ -474,6 +476,8 @@
 		tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN);
 		tp->rtt_seq = tp->snd_nxt;
 	}
+
+	tcp_westwood_update_rtt(tp, tp->srtt >> 3);
 }
 
 /* Calculate rto without backoff.  This is the second half of Van Jacobson's
@@ -981,7 +985,8 @@
 	    tp->snd_una == tp->high_seq ||
 	    (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) {
 		tp->prior_ssthresh = tcp_current_ssthresh(tp);
-		tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
+		if (!tcp_westwood_ssthresh(tp))
+			tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
 	}
 
 	/* Have to clear retransmission markers here to keep the bookkeeping
@@ -1390,11 +1395,24 @@
 static void tcp_cwnd_down(struct tcp_opt *tp)
 {
 	int decr = tp->snd_cwnd_cnt + 1;
+	__u32 limit;
+
+	/*
+	 * TCP Westwood
+	 * Here limit is evaluated as BWestimation*RTTmin (to obtain it
+	 * in packets we use mss_cache). If sysctl_tcp_westwood is off,
+	 * tcp_westwood_bw_rttmin() returns 0. In that case snd_ssthresh
+	 * is still used as usual. It also prevents other strange cases in
+	 * which BWE*RTTmin could assume the value 0. It should not happen, but...
+	 */
+
+	if (!(limit = tcp_westwood_bw_rttmin(tp)))
+		limit = tp->snd_ssthresh/2;
 
 	tp->snd_cwnd_cnt = decr&1;
 	decr >>= 1;
 
-	if (decr && tp->snd_cwnd > tp->snd_ssthresh/2)
+	if (decr && tp->snd_cwnd > limit)
 		tp->snd_cwnd -= decr;
 
 	tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
@@ -1539,7 +1557,10 @@
 static __inline__ void tcp_complete_cwr(struct tcp_opt *tp)
 {
-	tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
+	if (tcp_westwood_cwnd(tp))
+		tp->snd_ssthresh = tp->snd_cwnd;
+	else
+		tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
@@ -2030,6 +2051,240 @@
 	 */
 	tp->frto_counter = (tp->frto_counter + 1) % 3;
 }
+/*
+ * TCP Westwood
+ * Functions needed for estimating bandwidth.
+ */
+
+/*
+ * This function initializes fields used in TCP Westwood.
+ * We can't get any RTT information at this time, so we are
+ * forced to start from conservative initial values.
+ */
+static void init_westwood(struct sock *sk)
+{
+	struct tcp_opt *tp = tcp_sk(sk);
+
+	tp->westwood.bw_sample = 0;
+	tp->westwood.bw_ns_est = 0;
+	tp->westwood.bw_est = 0;
+	tp->westwood.accounted = 0;
+	tp->westwood.cumul_ack = 0;
+	tp->westwood.rtt_win_sx = tcp_time_stamp;
+	tp->westwood.rtt = TCP_WESTWOOD_INIT_RTT;
+	tp->westwood.rtt_min = TCP_WESTWOOD_INIT_RTT;
+	tp->westwood.snd_una = tp->snd_una;
+}
+
+/*
+ * @westwood_do_filter
+ * Low-pass filter. Implemented using constant coefficients.
+ */
+static inline __u32 westwood_do_filter(__u32 a, __u32 b)
+{
+	return (((7 * a) + b) >> 3);
+}
+
+static void westwood_filter(struct sock *sk, __u32 delta)
+{
+	struct tcp_opt *tp = tcp_sk(sk);
+	__u32 sample = tp->westwood.bk / delta;
+
+	tp->westwood.bw_ns_est =
+		westwood_do_filter(tp->westwood.bw_ns_est, sample);
+	tp->westwood.bw_est =
+		westwood_do_filter(tp->westwood.bw_est,
+				   tp->westwood.bw_ns_est);
+	tp->westwood.bw_sample = sample;
+}
+
+/* @westwood_update_rttmin
+ * It is used to update RTTmin. In this case we MUST NOT use the
+ * WESTWOOD_RTT_MIN minimum bound since we could be on a LAN!
+ */
+static inline __u32 westwood_update_rttmin(struct sock *sk)
+{
+	struct tcp_opt *tp = tcp_sk(sk);
+	__u32 rttmin = tp->westwood.rtt_min;
+
+	if (tp->westwood.rtt == 0)
+		return(rttmin);
+
+	if (tp->westwood.rtt < tp->westwood.rtt_min || !rttmin)
+		rttmin = tp->westwood.rtt;
+
+	return(rttmin);
+}
+
+/*
+ * @westwood_acked
+ * Evaluate increases for dk. It requires no lock since, when it is
+ * called, the lock should already be held. Be careful about it!
+ */
+static inline __u32 westwood_acked(struct sock *sk)
+{
+	struct tcp_opt *tp = tcp_sk(sk);
+
+	return ((tp->snd_una) - (tp->westwood.snd_una));
+}
+
+/*
+ * @westwood_new_window
+ * It evaluates whether we are receiving data inside the same RTT
+ * window as when we started.
+ * Return value:
+ * It returns 0 if we are still evaluating samples in the same RTT
+ * window, 1 if the sample has to be considered in the next window.
+ */
+static int westwood_new_window(struct sock *sk)
+{
+	struct tcp_opt *tp = tcp_sk(sk);
+	__u32 left_bound;
+	__u32 rtt;
+	int ret = 0;
+
+	left_bound = tp->westwood.rtt_win_sx;
+	rtt = max(tp->westwood.rtt, (u32) TCP_WESTWOOD_RTT_MIN);
+
+	/*
+	 * An RTT window has passed. Be careful: if RTT is less than
+	 * 50ms we don't filter but we continue 'building the sample'.
+	 * This minimum limit was chosen since an estimate over very
+	 * small time intervals is better avoided...
+	 * Obviously on a LAN we reasonably will always have
+	 * right_bound = left_bound + WESTWOOD_RTT_MIN
+	 */
+
+	if ((left_bound + rtt) < tcp_time_stamp)
+		ret = 1;
+
+	return ret;
+}
+
+/*
+ * @westwood_update_window
+ * It updates the RTT evaluation window if it is the right moment to do
+ * it. If so, it calls the filter for evaluating bandwidth. Be careful
+ * about __westwood_update_window() since it is called without
+ * any form of lock. It should be used only for internal purposes.
+ * Call westwood_update_window() instead.
+ */
+static void __westwood_update_window(struct sock *sk, __u32 now)
+{
+	struct tcp_opt *tp = tcp_sk(sk);
+	__u32 delta = now - tp->westwood.rtt_win_sx;
+
+	if (!delta)
+		return;
+
+	if (tp->westwood.rtt)
+		westwood_filter(sk, delta);
+
+	tp->westwood.bk = 0;
+	tp->westwood.rtt_win_sx = tcp_time_stamp;
+}
+
+
+static void westwood_update_window(struct sock *sk, __u32 now)
+{
+	if (westwood_new_window(sk))
+		__westwood_update_window(sk, now);
+}
+
+/*
+ * @__westwood_fast_bw
+ * It is called when we are in the fast path. In particular it is called
+ * when header prediction is successful. In that case the update is in
+ * fact straightforward and doesn't need any particular care.
+ */
+void __tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_opt *tp = tcp_sk(sk);
+
+	westwood_update_window(sk, tcp_time_stamp);
+
+	tp->westwood.bk += westwood_acked(sk);
+	tp->westwood.snd_una = tp->snd_una;
+	tp->westwood.rtt_min = westwood_update_rttmin(sk);
+}
+
+
+/*
+ * @westwood_dupack_update
+ * It updates accounted and cumul_ack when receiving a dupack.
+ */
+
+static void westwood_dupack_update(struct sock *sk)
+{
+	struct tcp_opt *tp = tcp_sk(sk);
+
+	tp->westwood.accounted += tp->mss_cache;
+	tp->westwood.cumul_ack = tp->mss_cache;
+}
+
+static inline int westwood_may_change_cumul(struct tcp_opt *tp)
+{
+	return ((tp->westwood.cumul_ack) > tp->mss_cache);
+}
+
+static inline void westwood_partial_update(struct tcp_opt *tp)
+{
+	tp->westwood.accounted -= tp->westwood.cumul_ack;
+	tp->westwood.cumul_ack = tp->mss_cache;
+}
+
+static inline void westwood_complete_update(struct tcp_opt *tp)
+{
+	tp->westwood.cumul_ack -= tp->westwood.accounted;
+	tp->westwood.accounted = 0;
+}
+
+/*
+ * @westwood_acked_count
+ * This function evaluates cumul_ack for evaluating dk in case of
+ * delayed or partial acks.
+ */
+static __u32 westwood_acked_count(struct sock *sk)
+{
+	struct tcp_opt *tp = tcp_sk(sk);
+
+	tp->westwood.cumul_ack = westwood_acked(sk);
+
+	/* If cumul_ack is 0 this is a dupack since it's not moving
+	 * tp->snd_una.
+	 */
+	if (!(tp->westwood.cumul_ack))
+		westwood_dupack_update(sk);
+
+	if (westwood_may_change_cumul(tp)) {
+		/* Partial or delayed ack */
+		if ((tp->westwood.accounted) >= (tp->westwood.cumul_ack))
+			westwood_partial_update(tp);
+		else
+			westwood_complete_update(tp);
+	}
+
+	tp->westwood.snd_una = tp->snd_una;
+
+	return tp->westwood.cumul_ack;
+}
+
+
+/*
+ * @__westwood_slow_bw
+ * It is called when something is going wrong... even if there may be
+ * no problem at all! In fact a simple delayed packet may trigger a
+ * dupack. We need to be careful in such cases.
+ */
+void __tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_opt *tp = tcp_sk(sk);
+
+	westwood_update_window(sk, tcp_time_stamp);
+
+	tp->westwood.bk += westwood_acked_count(sk);
+	tp->westwood.rtt_min = westwood_update_rttmin(sk);
+}
 
 /* This routine deals with incoming acks, but not outgoing ones.
  */
 static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
@@ -2057,6 +2312,7 @@
 		 */
 		tcp_update_wl(tp, ack, ack_seq);
 		tp->snd_una = ack;
+		tcp_westwood_fast_bw(sk, skb);
 		flag |= FLAG_WIN_UPDATE;
 
 		NET_INC_STATS_BH(TCPHPAcks);
@@ -2073,6 +2329,8 @@
 		if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th))
 			flag |= FLAG_ECE;
+
+		tcp_westwood_slow_bw(sk,skb);
 	}
 
 	/* We passed data and got it acked, remove any soft error
@@ -3866,6 +4124,8 @@
 		if(tp->af_specific->conn_request(sk, skb) < 0)
 			return 1;
 
+		init_westwood(sk);
+
 		/* Now we have several options: In theory there is
 		 * nothing else in the frame.  KA9Q has an option to
 		 * send data with the syn, BSD accepts data with the
@@ -3887,6 +4147,8 @@
 		goto discard;
 
 	case TCP_SYN_SENT:
+		init_westwood(sk);
+
 		queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
 		if (queued >= 0)
 			return queued;
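
For anyone who wants to play with the numbers outside the kernel, here is a
small stand-alone user-space sketch of the arithmetic the patch relies on:
the (7*a + b)/8 low-pass filter from westwood_do_filter(), the per-RTT-window
sample bk/delta from westwood_filter(), and the BWE*RTTmin/MSS value (floored
at two segments) from __tcp_westwood_bw_rttmin(). The struct, the helper names
and the traffic figures (1460-byte MSS, an RTTmin of 10 ticks, 29200 bytes
acked per 10-tick window) are made up for illustration only; this is not the
kernel code and takes no locks, it just shows how the estimate converges and
what cwnd/ssthresh value it would suggest.

/*
 * Stand-alone illustration (user space, no locking) of the Westwood+
 * arithmetic from the patch above, using made-up traffic numbers.
 */
#include <stdio.h>

struct westwood_est {
	unsigned int bw_ns_est;	/* first-stage, less smoothed estimate */
	unsigned int bw_est;	/* smoothed bandwidth estimate */
	unsigned int bk;	/* bytes acked in the current RTT window */
	unsigned int rtt_min;	/* minimum observed RTT, in ticks */
};

/* Same filter as westwood_do_filter(): new = (7*old + sample) / 8 */
static unsigned int do_filter(unsigned int a, unsigned int b)
{
	return (7 * a + b) >> 3;
}

/* Fold one RTT window's worth of acked bytes into the estimate and
 * reset the per-window byte count (the patch resets bk in
 * __westwood_update_window()). */
static void filter_sample(struct westwood_est *w, unsigned int delta)
{
	unsigned int sample = w->bk / delta;

	w->bw_ns_est = do_filter(w->bw_ns_est, sample);
	w->bw_est    = do_filter(w->bw_est, w->bw_ns_est);
	w->bk        = 0;
}

/* BWE * RTTmin expressed in segments, floored at 2, as in
 * __tcp_westwood_bw_rttmin(). */
static unsigned int bw_rttmin(const struct westwood_est *w, unsigned int mss)
{
	unsigned int v = w->bw_est * w->rtt_min / mss;

	return v > 2 ? v : 2;
}

int main(void)
{
	/* Hypothetical numbers: 1460-byte segments, RTTmin of 10 ticks,
	 * ten RTT windows of 10 ticks each with 29200 bytes acked
	 * (i.e. about 2920 bytes/tick on the wire). */
	struct westwood_est w = { .rtt_min = 10 };
	unsigned int mss = 1460;
	int i;

	for (i = 0; i < 10; i++) {
		w.bk = 29200;
		filter_sample(&w, 10);
		printf("window %2d: bw_est=%u bytes/tick -> cwnd/ssthresh=%u segments\n",
		       i, w.bw_est, bw_rttmin(&w, mss));
	}
	return 0;
}

The floor of two segments mirrors the 2U lower bound in
__tcp_westwood_bw_rttmin(), so the suggested window never collapses to zero
while the estimate is still warming up.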