Dave, I'm attaching the patch which implements Westwood+ support always available and disabled by default. Seems everything works fine here. :-) Just a hint. As you can see, the statistics collecting is implemented in a really bad manner since I use printks to do it. I know it's the worst way to do it but, during our tests, we needed a way to collect a lot of data for evaluating the evolution of critical variables. Now I don't know if it's correct to leave this feature implemented in this way or if it's better to completely drop it.. or to do it but in a different manner. Please let me know what you think about it. The patch applies against kernel 2.4.24. Regards. -- Angelo Dell'Aera 'buffer' Antifork Research, Inc. http://buffer.antifork.org diff -Naur linux-2.4.24/Documentation/Configure.help linux-2.4.24-westwood/Documentation/Configure.help --- linux-2.4.24/Documentation/Configure.help 2003-12-02 20:37:30.000000000 +0100 +++ linux-2.4.24-westwood/Documentation/Configure.help 2004-01-17 16:10:30.000000000 +0100 @@ -3568,6 +3568,25 @@ If unsure, say N. +TCP Westwood+ statistics +CONFIG_TCP_WESTWOOD_STATS + TCP Westwood statistics. Say Y if you are interested in + looking at TCP Westwood dynamics. If unsure, say N. + + Please note that TCP Westwood is always available in the current + implementation but isn't enabled by default; you can enable it by + saying Y to "/proc file system support" and "Sysctl support" below + and executing the command + + echo 1 > /proc/sys/net/ipv4/tcp_westwood + + at boot time after the /proc file system has been mounted. + +TCP Westwood+ debug +CONFIG_TCP_WESTWOOD_DEBUG + TCP Westwood+ debug. Unless you are a code developer you + should say N to this option. 
+ # Choice: alphatype Alpha system type CONFIG_ALPHA_GENERIC diff -Naur linux-2.4.24/include/linux/sysctl.h linux-2.4.24-westwood/include/linux/sysctl.h --- linux-2.4.24/include/linux/sysctl.h 2003-12-02 20:49:12.000000000 +0100 +++ linux-2.4.24-westwood/include/linux/sysctl.h 2004-01-16 17:03:43.000000000 +0100 @@ -311,6 +311,7 @@ NET_TCP_FRTO=92, NET_TCP_LOW_LATENCY=93, NET_IPV4_IPFRAG_SECRET_INTERVAL=94, + NET_TCP_WESTWOOD=95, }; enum { diff -Naur linux-2.4.24/include/net/sock.h linux-2.4.24-westwood/include/net/sock.h --- linux-2.4.24/include/net/sock.h 2004-01-05 18:27:30.000000000 +0100 +++ linux-2.4.24-westwood/include/net/sock.h 2004-01-16 18:29:56.000000000 +0100 @@ -432,6 +432,21 @@ __u32 frto_highmark; /* snd_nxt when RTO occurred */ unsigned long last_synq_overflow; + +/* TCP Westwood structure */ + struct { + __u32 bw_sample; /* bandwidth sample */ + __u32 bw_ns_est; /* first bandwidth estimation..not too smoothed 8) */ + __u32 bw_est; /* bandwidth estimate */ + __u32 rtt_win_sx; /* here starts a new evaluation... */ + __u32 bk; /* bytes acked in the current RTT window */ + __u32 snd_una; /* used for evaluating the number of acked bytes */ + __u32 cumul_ack; /* bytes acked by the last ACK, see westwood_acked_count() */ + __u32 accounted; /* bytes already accounted for by dupacks */ + __u32 rtt; /* last RTT given to westwood_update_rtt() */ + __u32 rtt_min; /* minimum observed RTT */ + rwlock_t lock; /* taken around the updates done in tcp_input.c */ + } westwood; }; diff -Naur linux-2.4.24/include/net/tcp.h linux-2.4.24-westwood/include/net/tcp.h --- linux-2.4.24/include/net/tcp.h 2004-01-05 18:28:51.000000000 +0100 +++ linux-2.4.24-westwood/include/net/tcp.h 2004-01-16 18:31:20.000000000 +0100 @@ -463,6 +463,7 @@ extern int sysctl_tcp_tw_reuse; extern int sysctl_tcp_frto; extern int sysctl_tcp_low_latency; +extern int sysctl_tcp_westwood; extern atomic_t tcp_memory_allocated; extern atomic_t tcp_sockets_allocated; @@ -1863,4 +1864,159 @@ TCP_ADD_STATS_USER(TcpMaxConn, -1); } + +/* TCP Westwood functions and constants */ + +#define WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! 
*/ +#define WESTWOOD_RTT_MIN (HZ/20) /* 50ms */ + +#ifdef CONFIG_TCP_WESTWOOD_STATS +#define WESTWOOD_STATS(fmt, args...) printk(KERN_INFO fmt, ##args) +#else +#define WESTWOOD_STATS(fmt, args...) do {} while(0) +#endif /* CONFIG_TCP_WESTWOOD_STATS */ + +#ifdef CONFIG_TCP_WESTWOOD_DEBUG +#define WESTWOOD_DEBUG(fmt, args...) printk(KERN_DEBUG fmt, ##args) +#else +#define WESTWOOD_DEBUG(fmt, args...) do {} while(0) +#endif /* CONFIG_TCP_WESTWOOD_DEBUG */ + +/* Store the latest RTT value for Westwood+; ignored while the sysctl is off. */ +static inline void westwood_update_rtt(struct tcp_opt *tp, __u32 rtt_seq) +{ + if (sysctl_tcp_westwood) + tp->westwood.rtt = rtt_seq; +} + +void __westwood_fast_bw(struct sock *, struct sk_buff *); +void __westwood_slow_bw(struct sock *, struct sk_buff *); + +/* + * This function initializes fields used in TCP Westwood. + * No RTT sample is available at this time, so rtt and rtt_min + * start from the conservative default WESTWOOD_INIT_RTT. + */ + +static inline void __init_westwood(struct sock *sk) +{ + struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); + + tp->westwood.bw_sample = 0; + tp->westwood.bw_ns_est = 0; + tp->westwood.bw_est = 0; + tp->westwood.bk = 0; + tp->westwood.accounted = 0; + tp->westwood.cumul_ack = 0; + tp->westwood.rtt_win_sx = tcp_time_stamp; + tp->westwood.rtt = WESTWOOD_INIT_RTT; + tp->westwood.rtt_min = WESTWOOD_INIT_RTT; + tp->westwood.snd_una = tp->snd_una; + tp->westwood.lock = RW_LOCK_UNLOCKED; +} + +static inline void init_westwood(struct sock *sk) +{ + __init_westwood(sk); +} + +/* The two wrappers below do nothing unless sysctl_tcp_westwood is set. */ +static inline void westwood_fast_bw(struct sock *sk, struct sk_buff *skb) +{ + if (sysctl_tcp_westwood) + __westwood_fast_bw(sk, skb); +} + +static inline void westwood_slow_bw(struct sock *sk, struct sk_buff *skb) +{ + if (sysctl_tcp_westwood) + __westwood_slow_bw(sk, skb); +} + +/* Bandwidth estimate times minimum RTT, divided by mss_cache (unclamped). */ +static inline __u32 __westwood_bw_rttmin(struct tcp_opt *tp) +{ + return (__u32) ((tp->westwood.bw_est) * (tp->westwood.rtt_min) / + (__u32) (tp->mss_cache)); +} + +/* Returns 0 while sysctl_tcp_westwood is off, otherwise a value >= 2. */ +static inline __u32 westwood_bw_rttmin(struct tcp_opt *tp) +{ + __u32 ret = 0; + + if (sysctl_tcp_westwood) + ret = (__u32) 
(max(__westwood_bw_rttmin(tp), 2U)); + + return ret; +} + +/* Sets snd_ssthresh from the estimate; returns 1 if it did, 0 otherwise. */ +static inline int westwood_ssthresh(struct tcp_opt *tp) +{ + int ret = 0; + __u32 ssthresh; + + if (sysctl_tcp_westwood) { + + if(!(ssthresh = westwood_bw_rttmin(tp))) + return ret; + + tp->snd_ssthresh = ssthresh; + ret = 1; + } + + return ret; +} + +/* Sets snd_cwnd from the estimate; returns 1 if it did, 0 otherwise. */ +static inline int westwood_cwnd(struct tcp_opt *tp) +{ + int ret = 0; + __u32 cwnd; + + if (sysctl_tcp_westwood) { + + if(!(cwnd = westwood_bw_rttmin(tp))) + return ret; + + tp->snd_cwnd = cwnd; + ret = 1; + } + + return ret; +} + +/* Sets snd_cwnd and then snd_ssthresh from the estimate; returns 1 if it did. */ +static inline int westwood_complete_cwr(struct tcp_opt *tp) +{ + int ret = 0; + + if (sysctl_tcp_westwood) { + + if (westwood_cwnd(tp)) { + tp->snd_ssthresh = tp->snd_cwnd; + ret = 1; + } + } + + return ret; +} + +/* Logs time, cwnd and ssthresh (plus bw_est*HZ when Westwood+ is on). */ +static inline void westwood_stats(struct tcp_opt *tp) +{ + + if (sysctl_tcp_westwood) { + WESTWOOD_STATS("[westwood] time = %u\n", tcp_time_stamp); + WESTWOOD_STATS("[westwood] cwnd = %u\n", tp->snd_cwnd); + WESTWOOD_STATS("[westwood] ssthresh = %u\n", tp->snd_ssthresh); + WESTWOOD_STATS("[westwood] bandwidth estimation = %u\n", (tp->westwood.bw_est)*HZ); + } else { + WESTWOOD_STATS("[newreno] time = %u\n", tcp_time_stamp); + WESTWOOD_STATS("[newreno] cwnd = %u\n", tp->snd_cwnd); + WESTWOOD_STATS("[newreno] ssthresh = %u\n", tp->snd_ssthresh); + } +} + #endif /* _TCP_H */ diff -Naur linux-2.4.24/net/ipv4/Config.in linux-2.4.24-westwood/net/ipv4/Config.in --- linux-2.4.24/net/ipv4/Config.in 2003-12-02 20:37:55.000000000 +0100 +++ linux-2.4.24-westwood/net/ipv4/Config.in 2004-01-16 16:56:57.000000000 +0100 @@ -40,6 +40,12 @@ fi bool ' IP: TCP Explicit Congestion Notification support' CONFIG_INET_ECN bool ' IP: TCP syncookie support (disabled per default)' CONFIG_SYN_COOKIES +if [ "$CONFIG_PROC_FS" = "y" ]; then + if [ "$CONFIG_SYSCTL" = "y" ]; then + bool ' IP : TCP Westwood+ statistics' CONFIG_TCP_WESTWOOD_STATS + bool ' IP : TCP Westwood+ debug' CONFIG_TCP_WESTWOOD_DEBUG + fi +fi if [ "$CONFIG_NETFILTER" != "n" ]; then 
source net/ipv4/netfilter/Config.in fi diff -Naur linux-2.4.24/net/ipv4/sysctl_net_ipv4.c linux-2.4.24-westwood/net/ipv4/sysctl_net_ipv4.c --- linux-2.4.24/net/ipv4/sysctl_net_ipv4.c 2003-06-13 16:51:39.000000000 +0200 +++ linux-2.4.24-westwood/net/ipv4/sysctl_net_ipv4.c 2004-01-16 18:49:37.000000000 +0100 @@ -52,6 +52,9 @@ static int ip_local_port_range_max[] = { 65535, 65535 }; #endif +/* From tcp_input.c */ +extern int sysctl_tcp_westwood; + struct ipv4_config ipv4_config; extern ctl_table ipv4_route_table[]; @@ -229,6 +232,9 @@ {NET_IPV4_IPFRAG_SECRET_INTERVAL, "ipfrag_secret_interval", &sysctl_ipfrag_secret_interval, sizeof(int), 0644, NULL, &proc_dointvec_jiffies, &sysctl_jiffies}, + {NET_TCP_WESTWOOD, "tcp_westwood", + &sysctl_tcp_westwood, sizeof(int), 0644, NULL, + &proc_dointvec}, {0} }; diff -Naur linux-2.4.24/net/ipv4/tcp_input.c linux-2.4.24-westwood/net/ipv4/tcp_input.c --- linux-2.4.24/net/ipv4/tcp_input.c 2003-12-02 20:37:57.000000000 +0100 +++ linux-2.4.24-westwood/net/ipv4/tcp_input.c 2004-01-17 16:13:34.000000000 +0100 @@ -61,6 +61,7 @@ * Panu Kuhlberg: Experimental audit of TCP (re)transmission * engine. Lots of bugs are found. * Pasi Sarolahti: F-RTO for dealing with spurious RTOs + * Angelo Dell'Aera: TCP Westwood+ support */ #include <linux/config.h> @@ -89,6 +90,8 @@ int sysctl_tcp_max_orphans = NR_FILE; int sysctl_tcp_frto = 0; +int sysctl_tcp_westwood = 0; /* Westwood+ switch, off by default; exported as the tcp_westwood sysctl */ + #define FLAG_DATA 0x01 /* Incoming frame contained data. */ #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ #define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ @@ -470,6 +473,8 @@ tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN); tp->rtt_seq = tp->snd_nxt; } + + westwood_update_rtt(tp, tp->srtt >> 3); } /* Calculate rto without backoff. 
This is the second half of Van Jacobson's @@ -1068,7 +1073,9 @@ tp->snd_una == tp->high_seq || (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { tp->prior_ssthresh = tcp_current_ssthresh(tp); - tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + + if (!(westwood_ssthresh(tp))) + tp->snd_ssthresh = tcp_recalc_ssthresh(tp); } tp->snd_cwnd = 1; tp->snd_cwnd_cnt = 0; @@ -1380,11 +1387,24 @@ static void tcp_cwnd_down(struct tcp_opt *tp) { int decr = tp->snd_cwnd_cnt + 1; + __u32 limit; + + /* + * TCP Westwood + * Here limit is evaluated as BWestimation*RTTmin (for obtaining it + * in packets we use mss_cache). If sysctl_tcp_westwood is 0, + * westwood_bw_rttmin() returns 0 and snd_ssthresh/2 is still used + * as the limit, as usual. The same fallback covers a zero result + * in general, although westwood_bw_rttmin() returns at least 2. + */ + + if (!(limit = westwood_bw_rttmin(tp))) + limit = tp->snd_ssthresh/2; tp->snd_cwnd_cnt = decr&1; decr >>= 1; - if (decr && tp->snd_cwnd > tp->snd_ssthresh/2) + if (decr && tp->snd_cwnd > limit) tp->snd_cwnd -= decr; tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); @@ -1528,7 +1548,8 @@ static __inline__ void tcp_complete_cwr(struct tcp_opt *tp) { - tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); + if (!(westwood_complete_cwr(tp))) + tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); tp->snd_cwnd_stamp = tcp_time_stamp; } @@ -2016,6 +2037,260 @@ tp->frto_counter = (tp->frto_counter + 1) % 3; } +/* + * TCP Westwood + * Functions needed for estimating bandwidth. + */ + +/* + * @westwood_do_filter + * Low-pass filter. Implemented using constant coefficients. 
+ */ + +static inline __u32 westwood_do_filter(__u32 a, __u32 b) +{ + return( (7*a + b) >> 3); /* 7/8 of a plus 1/8 of b */ +} + +static void westwood_filter(struct sock *sk, __u32 delta) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + __u32 sample = (tp->westwood.bk) / delta; /* caller returns early when delta is 0 */ + + tp->westwood.bw_ns_est = westwood_do_filter(tp->westwood.bw_ns_est, sample); + tp->westwood.bw_est = westwood_do_filter(tp->westwood.bw_est, tp->westwood.bw_ns_est); + tp->westwood.bw_sample = sample; +} + +/* @westwood_update_rttmin + * It is used to update RTTmin. In this case we MUST NOT use + * WESTWOOD_RTT_MIN minimum bound since we could be on a LAN! + */ + +static inline __u32 westwood_update_rttmin(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + __u32 rttmin = tp->westwood.rtt_min; + + if (tp->westwood.rtt == 0) + return(rttmin); + + if (tp->westwood.rtt < tp->westwood.rtt_min || !rttmin) + rttmin = tp->westwood.rtt; + + return(rttmin); +} + +/* + * @westwood_acked + * Evaluate increases for dk. It requires no lock since when it is + * called lock should already be held. Be careful about it! + */ + +static __u32 westwood_acked(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + return ((tp->snd_una) - (tp->westwood.snd_una)); +} + +/* + * @westwood_new_window + * It evaluates if we are receiving data inside the same RTT window as + * when we started. + * Return value: + * It returns 0 if we are still evaluating samples in the same RTT + * window, 1 if the sample has to be considered in the next window. + */ + +static int westwood_new_window(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + __u32 left_bound; + __u32 rtt; + int ret = 0; + + read_lock(&tp->westwood.lock); + left_bound = tp->westwood.rtt_win_sx; + rtt = max(tp->westwood.rtt, (__u32)WESTWOOD_RTT_MIN); + read_unlock(&tp->westwood.lock); + + /* + * A RTT-window has passed. Be careful since if RTT is less than + * 50ms we don't filter but we continue 'building the sample'. 
+ * This minimum limit was chosen since an estimation on small + * time intervals is better to avoid... + * Obviously on a LAN we reasonably will always have + * right_bound = left_bound + WESTWOOD_RTT_MIN + */ + + if ((__s32)(tcp_time_stamp - (left_bound + rtt)) > 0) /* signed difference stays correct when the 32-bit timestamp wraps */ + ret = 1; + + return ret; +} + + +/* + * @westwood_update_window + * It updates RTT evaluation window if it is the right moment to do + * it. If so it calls filter for evaluating bandwidth. Be careful + * about __westwood_update_window() since it takes no lock itself: + * the caller must hold westwood.lock for writing. For that reason + * call westwood_update_window() instead. + */ + + +static void __westwood_update_window(struct sock *sk, __u32 now) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + __u32 delta = now - tp->westwood.rtt_win_sx; + + if (!delta) + return; + + if (tp->westwood.rtt) + westwood_filter(sk, delta); + + tp->westwood.bk = 0; + tp->westwood.rtt_win_sx = tcp_time_stamp; +} + + +static void westwood_update_window(struct sock *sk, __u32 now) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + if(westwood_new_window(sk)) { + write_lock(&tp->westwood.lock); + __westwood_update_window(sk, now); + write_unlock(&tp->westwood.lock); + } +} + +/* + * @__westwood_fast_bw + * It is called when we are in fast path. In particular it is called when + * header prediction is successful. In such case in fact the update is + * straightforward and doesn't need any particular care. + */ + +void __westwood_fast_bw(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + westwood_update_window(sk, tcp_time_stamp); + + write_lock(&tp->westwood.lock); + tp->westwood.bk += westwood_acked(sk); + tp->westwood.snd_una = tp->snd_una; + tp->westwood.rtt_min = westwood_update_rttmin(sk); + write_unlock(&tp->westwood.lock); +} + +/* + * @westwood_mss + * This function was inserted just to have the possibility to evaluate + * which value of MSS is better. 
In fact we can use either mss_cache or + * some other MSS value. Just testing we will know it! + */ + +static inline __u32 westwood_mss(struct tcp_opt *tp) +{ + return ((__u32)(tp->mss_cache)); +} + + +/* + * @westwood_dupack_update + * It updates accounted and cumul_ack when receiving a dupack. + */ + +static void westwood_dupack_update(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + /* No locking here: __westwood_slow_bw() already holds westwood.lock + * for writing on this call path; re-taking it would self-deadlock. */ + tp->westwood.accounted += westwood_mss(tp); + tp->westwood.cumul_ack = westwood_mss(tp); +} + + +static inline int westwood_may_change_cumul(struct tcp_opt *tp) +{ + return ((tp->westwood.cumul_ack) > westwood_mss(tp)); /* ACK covers more than one MSS */ +} + + +static inline void westwood_partial_update(struct tcp_opt *tp) +{ + tp->westwood.accounted -= tp->westwood.cumul_ack; + tp->westwood.cumul_ack = westwood_mss(tp); +} + + +static inline void westwood_complete_update(struct tcp_opt *tp) +{ + tp->westwood.cumul_ack -= tp->westwood.accounted; + tp->westwood.accounted = 0; +} + +/* + * @westwood_acked_count + * This function evaluates cumul_ack for evaluating dk in case of + * delayed or partial acks. + */ + +static __u32 westwood_acked_count(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + tp->westwood.cumul_ack = westwood_acked(sk); + + /* If cumul_ack is 0 this is a dupack since it's not moving + * tp->snd_una. + */ + + if (!(tp->westwood.cumul_ack)) + westwood_dupack_update(sk); + + if (westwood_may_change_cumul(tp)) { + /* Partial or delayed ack */ + if ((tp->westwood.accounted) >= (tp->westwood.cumul_ack)) + westwood_partial_update(tp); + else + westwood_complete_update(tp); + } + + tp->westwood.snd_una = tp->snd_una; + + return(tp->westwood.cumul_ack); +} + + +/* + * @__westwood_slow_bw + * It is called when something is going wrong..even if there could + * be no problems! In fact a simple delayed packet may trigger a + * dupack. But we need to be careful in such case. 
+ */ + +void __westwood_slow_bw(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + westwood_update_window(sk, tcp_time_stamp); + + write_lock(&tp->westwood.lock); /* held while westwood_acked_count() updates the counters */ + tp->westwood.bk += westwood_acked_count(sk); + tp->westwood.rtt_min = westwood_update_rttmin(sk); + write_unlock(&tp->westwood.lock); +} + +/* TCP Westwood routines end here */ + /* This routine deals with incoming acks, but not outgoing ones. */ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) { @@ -2042,6 +2317,7 @@ */ tcp_update_wl(tp, ack, ack_seq); tp->snd_una = ack; + westwood_fast_bw(sk, skb); /* does nothing unless sysctl_tcp_westwood is set */ flag |= FLAG_WIN_UPDATE; NET_INC_STATS_BH(TCPHPAcks); @@ -2058,8 +2334,12 @@ if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) flag |= FLAG_ECE; + + westwood_slow_bw(sk, skb); /* does nothing unless sysctl_tcp_westwood is set */ } + westwood_stats(tp); /* prints only when CONFIG_TCP_WESTWOOD_STATS is set */ + /* We passed data and got it acked, remove any soft error * log. Something worked... */ @@ -3796,7 +4076,6 @@ return 1; } - /* * This function implements the receiving procedure of RFC 793 for * all states except ESTABLISHED and TIME_WAIT. @@ -3827,6 +4106,8 @@ if(tp->af_specific->conn_request(sk, skb) < 0) return 1; + init_westwood(sk); /* resets the Westwood state of sk itself */ + /* Now we have several options: In theory there is * nothing else in the frame. KA9Q has an option to * send data with the syn, BSD accepts data with the @@ -3848,6 +4129,9 @@ goto discard; case TCP_SYN_SENT: + + init_westwood(sk); /* resets the Westwood state of sk before SYN_SENT processing */ + queued = tcp_rcv_synsent_state_process(sk, skb, th, len); if (queued >= 0) return queued; - : send the line "unsubscribe linux-net" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html