This patch makes selection of congestion control algorithm simpler by
using a single sysctl for that purpose, rather than a cascade of sysctls.
The patch also does some minor cleanups to avoid cascade actions between
algorithms so that flow control is cleaner.
Possible improvements:
- Use a string when reading/writing from sysctl to make it more
friendly to humans.
- And/Or, provide a list of all available congestion control
algorithms.
The patch is against 2.6.11-rc4-bk9.
Signed-off-by: Yee-Ting Li <yee-ting.li@xxxxxxx>
Signed-off-by: Baruch Even <baruch@xxxxxxxxx>
This patch makes selection of congestion control algorithm simpler by using a
single sysctl for that purpose, rather than a cascade of sysctls.
The patch also does some minor cleanups to avoid cascade actions between
algorithms so that flow control is cleaner.
Possible improvements:
- Use a string when reading/writing from sysctl to make it more friendly to humans.
- And/Or, provide a list of all available congestion control algorithms.
The patch is against 2.6.11-rc4-bk9.
Signed-off-by: Yee-Ting Li <yee-ting.li@xxxxxxx>
Signed-off-by: Baruch Even <baruch@xxxxxxxxx>
Index: 2.6.11-select/include/linux/sysctl.h
===================================================================
--- 2.6.11-select.orig/include/linux/sysctl.h
+++ 2.6.11-select/include/linux/sysctl.h
@@ -344,6 +344,7 @@ enum
NET_TCP_DEFAULT_WIN_SCALE=105,
NET_TCP_MODERATE_RCVBUF=106,
NET_TCP_TSO_WIN_DIVISOR=107,
+ NET_TCP_ADV_CONG=108,
};
enum {
Index: 2.6.11-select/include/net/tcp.h
===================================================================
--- 2.6.11-select.orig/include/net/tcp.h
+++ 2.6.11-select/include/net/tcp.h
@@ -597,13 +597,11 @@ extern int sysctl_tcp_adv_win_scale;
extern int sysctl_tcp_tw_reuse;
extern int sysctl_tcp_frto;
extern int sysctl_tcp_low_latency;
-extern int sysctl_tcp_westwood;
-extern int sysctl_tcp_vegas_cong_avoid;
extern int sysctl_tcp_vegas_alpha;
extern int sysctl_tcp_vegas_beta;
extern int sysctl_tcp_vegas_gamma;
extern int sysctl_tcp_nometrics_save;
-extern int sysctl_tcp_bic;
+extern int sysctl_tcp_adv_cong;
extern int sysctl_tcp_bic_fast_convergence;
extern int sysctl_tcp_bic_low_window;
extern int sysctl_tcp_moderate_rcvbuf;
@@ -1241,7 +1239,8 @@ static __inline__ unsigned int tcp_packe
*/
static inline __u32 tcp_recalc_ssthresh(struct tcp_sock *tp)
{
- if (tcp_is_bic(tp)) {
+ switch (tp->adv_cong) {
+ case TCP_BIC:
if (sysctl_tcp_bic_fast_convergence &&
tp->snd_cwnd < tp->bictcp.last_max_cwnd)
tp->bictcp.last_max_cwnd
@@ -1253,9 +1252,11 @@ static inline __u32 tcp_recalc_ssthresh(
if (tp->snd_cwnd > sysctl_tcp_bic_low_window)
return max(tp->snd_cwnd - (tp->snd_cwnd/BICTCP_1_OVER_BETA),
2U);
- }
+ break;
- return max(tp->snd_cwnd >> 1U, 2U);
+ default:
+ return max(tp->snd_cwnd >> 1U, 2U);
+ }
}
/* Stop taking Vegas samples for now. */
@@ -1980,24 +1981,19 @@ static inline void tcp_westwood_update_r
tp->westwood.rtt = rtt_seq;
}
-static inline __u32 __tcp_westwood_bw_rttmin(const struct tcp_sock *tp)
+static inline __u32 tcp_westwood_bw_rttmin(const struct tcp_sock *tp)
{
return max((tp->westwood.bw_est) * (tp->westwood.rtt_min) /
(__u32) (tp->mss_cache_std),
2U);
}
-static inline __u32 tcp_westwood_bw_rttmin(const struct tcp_sock *tp)
-{
- return tcp_is_westwood(tp) ? __tcp_westwood_bw_rttmin(tp) : 0;
-}
-
static inline int tcp_westwood_ssthresh(struct tcp_sock *tp)
{
__u32 ssthresh = 0;
if (tcp_is_westwood(tp)) {
- ssthresh = __tcp_westwood_bw_rttmin(tp);
+ ssthresh = tcp_westwood_bw_rttmin(tp);
if (ssthresh)
tp->snd_ssthresh = ssthresh;
}
@@ -2010,7 +2006,7 @@ static inline int tcp_westwood_cwnd(stru
__u32 cwnd = 0;
if (tcp_is_westwood(tp)) {
- cwnd = __tcp_westwood_bw_rttmin(tp);
+ cwnd = tcp_westwood_bw_rttmin(tp);
if (cwnd)
tp->snd_cwnd = cwnd;
}
Index: 2.6.11-select/net/ipv4/sysctl_net_ipv4.c
===================================================================
--- 2.6.11-select.orig/net/ipv4/sysctl_net_ipv4.c
+++ 2.6.11-select/net/ipv4/sysctl_net_ipv4.c
@@ -602,22 +602,14 @@ ctl_table ipv4_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
- {
- .ctl_name = NET_TCP_WESTWOOD,
- .procname = "tcp_westwood",
- .data = &sysctl_tcp_westwood,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_TCP_VEGAS,
- .procname = "tcp_vegas_cong_avoid",
- .data = &sysctl_tcp_vegas_cong_avoid,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
+ {
+ .ctl_name = NET_TCP_ADV_CONG,
+ .procname = "tcp_adv_cong",
+ .data = &sysctl_tcp_adv_cong,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
{
.ctl_name = NET_TCP_VEGAS_ALPHA,
.procname = "tcp_vegas_alpha",
@@ -643,14 +635,6 @@ ctl_table ipv4_table[] = {
.proc_handler = &proc_dointvec,
},
{
- .ctl_name = NET_TCP_BIC,
- .procname = "tcp_bic",
- .data = &sysctl_tcp_bic,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
.ctl_name = NET_TCP_BIC_FAST_CONVERGENCE,
.procname = "tcp_bic_fast_convergence",
.data = &sysctl_tcp_bic_fast_convergence,
Index: 2.6.11-select/net/ipv4/tcp_input.c
===================================================================
--- 2.6.11-select.orig/net/ipv4/tcp_input.c
+++ 2.6.11-select/net/ipv4/tcp_input.c
@@ -87,8 +87,6 @@ int sysctl_tcp_rfc1337;
int sysctl_tcp_max_orphans = NR_FILE;
int sysctl_tcp_frto;
int sysctl_tcp_nometrics_save;
-int sysctl_tcp_westwood;
-int sysctl_tcp_vegas_cong_avoid;
int sysctl_tcp_moderate_rcvbuf = 1;
@@ -99,10 +97,11 @@ int sysctl_tcp_moderate_rcvbuf = 1;
int sysctl_tcp_vegas_alpha = 1<<V_PARAM_SHIFT;
int sysctl_tcp_vegas_beta = 3<<V_PARAM_SHIFT;
int sysctl_tcp_vegas_gamma = 1<<V_PARAM_SHIFT;
-int sysctl_tcp_bic = 1;
int sysctl_tcp_bic_fast_convergence = 1;
int sysctl_tcp_bic_low_window = 14;
+int sysctl_tcp_adv_cong;
+
#define FLAG_DATA 0x01 /* Incoming frame contained data. */
#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
@@ -561,15 +560,18 @@ static void tcp_event_data_recv(struct s
*/
void tcp_ca_init(struct tcp_sock *tp)
{
- if (sysctl_tcp_westwood)
- tp->adv_cong = TCP_WESTWOOD;
- else if (sysctl_tcp_bic)
- tp->adv_cong = TCP_BIC;
- else if (sysctl_tcp_vegas_cong_avoid) {
- tp->adv_cong = TCP_VEGAS;
- tp->vegas.baseRTT = 0x7fffffff;
- tcp_vegas_enable(tp);
- }
+ switch (sysctl_tcp_adv_cong) {
+ case TCP_VEGAS:
+ tp->vegas.baseRTT = 0x7fffffff;
+ tcp_vegas_enable(tp);
+ /* Fallthrough */
+ case TCP_BIC:
+ case TCP_WESTWOOD:
+ tp->adv_cong = sysctl_tcp_adv_cong;
+ break;
+ default:
+ tp->adv_cong = TCP_RENO;
+ }
}
/* Do RTT sampling needed for Vegas.
@@ -1600,18 +1602,25 @@ static void tcp_cwnd_down(struct tcp_soc
int decr = tp->snd_cwnd_cnt + 1;
__u32 limit;
- /*
- * TCP Westwood
- * Here limit is evaluated as BWestimation*RTTmin (for obtaining it
- * in packets we use mss_cache). If sysctl_tcp_westwood is off
- * tcp_westwood_bw_rttmin() returns 0. In such case snd_ssthresh is
- * still used as usual. It prevents other strange cases in which
- * BWE*RTTmin could assume value 0. It should not happen but...
- */
+ switch (tp->adv_cong) {
+ case TCP_WESTWOOD:
+ /*
+ * TCP Westwood
+ * Here limit is evaluated as BWestimation*RTTmin (for obtaining it
+ * in packets we use mss_cache). The guard is against
+ * strange cases in which BWE*RTTmin could assume value
+ * 0. It should not happen but...
+ */
- if (!(limit = tcp_westwood_bw_rttmin(tp)))
- limit = tp->snd_ssthresh/2;
+ if (!(limit = tcp_westwood_bw_rttmin(tp)))
+ limit = tp->snd_ssthresh/2;
+ break;
+ default:
+ limit = tp->snd_ssthresh/2;
+ break;
+ }
+
tp->snd_cwnd_cnt = decr&1;
decr >>= 1;
@@ -2014,6 +2023,27 @@ static inline void tcp_ack_update_rtt(st
tcp_ack_no_tstamp(tp, seq_rtt, flag);
}
+static inline void tcp_slow_start(struct tcp_sock *tp)
+{
+ /* In "safe" area, increase. */
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+ tp->snd_cwnd++;
+}
+
+static inline void tcp_increase_cwnd(struct tcp_sock *tp, __u32 window)
+{
+ /* In dangerous area, increase slowly.
+ * In theory, for standard tcp, this is tp->snd_cwnd += 1 / window
+ * (snd_cwnd for Reno)
+ */
+ if (tp->snd_cwnd_cnt >= window) {
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+ tp->snd_cwnd++;
+ tp->snd_cwnd_cnt = 0;
+ } else
+ tp->snd_cwnd_cnt++;
+}
+
/*
* Compute congestion window to use.
*
@@ -2029,10 +2059,6 @@ static inline void tcp_ack_update_rtt(st
*/
static inline __u32 bictcp_cwnd(struct tcp_sock *tp)
{
- /* orignal Reno behaviour */
- if (!tcp_is_bic(tp))
- return tp->snd_cwnd;
-
if (tp->bictcp.last_cwnd == tp->snd_cwnd &&
(s32)(tcp_time_stamp - tp->bictcp.last_stamp) <= (HZ>>5))
return tp->bictcp.cnt;
@@ -2080,23 +2106,13 @@ static inline __u32 bictcp_cwnd(struct t
/* This is Jacobson's slow start and congestion avoidance.
* SIGCOMM '88, p. 328.
*/
-static inline void reno_cong_avoid(struct tcp_sock *tp)
+static inline void reno_cong_avoid(struct tcp_sock *tp, u32 snd_cwnd)
{
- if (tp->snd_cwnd <= tp->snd_ssthresh) {
- /* In "safe" area, increase. */
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- } else {
- /* In dangerous area, increase slowly.
- * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
- */
- if (tp->snd_cwnd_cnt >= bictcp_cwnd(tp)) {
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- tp->snd_cwnd_cnt=0;
- } else
- tp->snd_cwnd_cnt++;
- }
+ if (tp->snd_cwnd <= tp->snd_ssthresh)
+ tcp_slow_start(tp);
+ else
+ tcp_increase_cwnd(tp, snd_cwnd);
+
tp->snd_cwnd_stamp = tcp_time_stamp;
}
@@ -2324,10 +2340,22 @@ static void vegas_cong_avoid(struct tcp_
static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt)
{
- if (tcp_vegas_enabled(tp))
- vegas_cong_avoid(tp, ack, seq_rtt);
- else
- reno_cong_avoid(tp);
+ if (tp->snd_cwnd >= tp->snd_cwnd_clamp)
+ return;
+
+ switch (sysctl_tcp_adv_cong) {
+ case TCP_VEGAS:
+ vegas_cong_avoid(tp, ack, seq_rtt);
+ break;
+
+ case TCP_BIC:
+ reno_cong_avoid(tp, bictcp_cwnd(tp));
+ break;
+
+ default:
+ reno_cong_avoid(tp, tp->snd_cwnd);
+ break;
+ }
}
/* Restart timer after forward progress on connection.