Re: [PATCH v4] sctp: Implement quick failover draft from tsvwg

Vlad Yasevich <vyasevich@xxxxxxxxx> · Fri, 20 Jul 2012 13:55:35 -0400

On 07/20/2012 01:19 PM, Neil Horman wrote:
I've seen several attempts recently made to do quick failover of sctp transports
by reducing various retransmit timers and counters.  While its possible to
implement a faster failover on multihomed sctp associations, its not
particularly robust, in that it can lead to unneeded retransmits, as well as
false connection failures due to intermittent latency on a network.

Instead, lets implement the new ietf quick failover draft found here:
http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05

This will let the sctp stack identify transports that have had a small number of
errors, and avoid using them quickly until their reliability can be
re-established.  I've tested this out on two virt guests connected via multiple
isolated virt networks and believe its in compliance with the above draft and
works well.

Signed-off-by: Neil Horman <nhorman@xxxxxxxxxxxxx>
CC: Vlad Yasevich <vyasevich@xxxxxxxxx>
CC: Sridhar Samudrala <sri@xxxxxxxxxx>
CC: "David S. Miller" <davem@xxxxxxxxxxxxx>
CC: linux-sctp@xxxxxxxxxxxxxxx
CC: joe@xxxxxxxxxxx

---
Change notes:

V2)
- Added socket option API from section 6.1 of the specification, as per
request from Vlad. Adding this socket option allows us to alter both the path
maximum retransmit value and the path partial failure threshold for each
transport and the association as a whole.

- Added a per transport pf_retrans value, and initialized it from the
association value.  This makes each transport independently configurable as per
the socket option above, and prevents changes in the sysctl from bleeding into
an already created association.

V3)
- Cleaned up some line spacing (Joe Perches)
- Fixed some socket option user data sanitization (Vlad Yasevich)

V4)
- Added additional documentation (Flavio Leitner)
---
  Documentation/networking/ip-sysctl.txt |   14 +++++
  include/net/sctp/constants.h           |    1 +
  include/net/sctp/structs.h             |   20 ++++++-
  include/net/sctp/user.h                |   11 ++++
  net/sctp/associola.c                   |   37 ++++++++++--
  net/sctp/outqueue.c                    |    6 +-
  net/sctp/sm_sideeffect.c               |   33 +++++++++-
  net/sctp/socket.c                      |  100 ++++++++++++++++++++++++++++++++
  net/sctp/sysctl.c                      |    9 +++
  net/sctp/transport.c                   |    4 +-
  10 files changed, 220 insertions(+), 15 deletions(-)


[ snip ]

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index b3b8a8d..fef9bfa 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3470,6 +3470,56 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval,
  }


+/*
+ * SCTP_PEER_ADDR_THLDS
+ *
+ * This option allows us to alter the partially failed threshold for one or all
+ * transports in an association.  See Section 6.1 of:
+ * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+ */
+static int sctp_setsockopt_paddr_thresholds(struct sock *sk,
+					    char __user *optval,
+					    unsigned int optlen)
+{
+	struct sctp_paddrthlds val;
+	struct sctp_transport *trans;
+	struct sctp_association *asoc;
+
+	if (optlen < sizeof(struct sctp_paddrthlds))
+		return -EINVAL;
+	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval,
+			   sizeof(struct sctp_paddrthlds)))
+		return -EFAULT;
+
+	/* path_max_retrans shouldn't ever be zero */
+	if (!val.spt_pathmaxrxt)
+		return -EINVAL;

I am not sure I like this solution.  This means that the application 
must fetch the pathmaxrx and then write the same value back here.
Why not simply ignore the patthmaxrxt if it's 0?  That way someone can 
just tweak the pf value without changing the pathmaxrxt.



+
+	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
+		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
+		if (!asoc)
+			return -ENOENT;
+		list_for_each_entry(trans, &asoc->peer.transport_addr_list,
+				    transports) {
+			trans->pathmaxrxt = val.spt_pathmaxrxt;
+			trans->pf_retrans = val.spt_pathpfthld;
+		}
+
+		asoc->pf_retrans = val.spt_pathpfthld;
+		asoc->pathmaxrxt = val.spt_pathmaxrxt;
+	} else {
+		trans = sctp_addr_id2transport(sk, &val.spt_address,
+					       val.spt_assoc_id);
+		if (!trans)
+			return -ENOENT;
+
+		trans->pathmaxrxt = val.spt_pathmaxrxt;
+		trans->pf_retrans = val.spt_pathpfthld;
+	}
+
+	return 0;
+}
+
  /* API 6.2 setsockopt(), getsockopt()
   *
   * Applications use setsockopt() and getsockopt() to set or retrieve
@@ -3619,6 +3669,9 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname,
  	case SCTP_AUTO_ASCONF:
  		retval = sctp_setsockopt_auto_asconf(sk, optval, optlen);
  		break;
+	case SCTP_PEER_ADDR_THLDS:
+		retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen);
+		break;
  	default:
  		retval = -ENOPROTOOPT;
  		break;
@@ -5490,6 +5543,50 @@ static int sctp_getsockopt_assoc_ids(struct sock *sk, int len,
  	return 0;
  }

+/*
+ * SCTP_PEER_ADDR_THLDS
+ *
+ * This option allows us to fetch the partially failed threshold for one or all
+ * transports in an association.  See Section 6.1 of:
+ * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+ */
+static int sctp_getsockopt_paddr_thresholds(struct sock *sk,
+					    char __user *optval,
+					    int optlen)
+{
+	struct sctp_paddrthlds val;
+	struct sctp_transport *trans;
+	struct sctp_association *asoc;
+
+	if (optlen < sizeof(struct sctp_paddrthlds))
+		return -EINVAL;
+	optlen = sizeof(struct sctp_paddrthlds);
+	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval, optlen))
+		return -EFAULT;
+
+	if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
+		asoc = sctp_id2assoc(sk, val.spt_assoc_id);
+		if (!asoc)
+			return -ENOENT;
+
+		val.spt_pathpfthld = asoc->pf_retrans;
+		val.spt_pathmaxrxt = asoc->pathmaxrxt;
+	} else {
+		trans = sctp_addr_id2transport(sk, &val.spt_address,
+					       val.spt_assoc_id);
+		if (!trans)
+			return -ENOENT;
+
+		val.spt_pathmaxrxt = trans->pathmaxrxt;
+		val.spt_pathpfthld = trans->pf_retrans;
+	}
+
+	if (copy_to_user(optval, &val, optlen))
+		return -EFAULT;
+

getsockopt typically returns the length of the option data that was 
written to the user.

+	return 0;
+}
+
  SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
  				char __user *optval, int __user *optlen)
  {
@@ -5628,6 +5725,9 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
  	case SCTP_AUTO_ASCONF:
  		retval = sctp_getsockopt_auto_asconf(sk, len, optval, optlen);
  		break;
+	case SCTP_PEER_ADDR_THLDS:
+		retval = sctp_getsockopt_paddr_thresholds(sk, optval, len);
+		break;

You are passing the len.  The user may have passed in a bigger buffer 
and is expecting back the length of the option.

-vlad

  	default:
  		retval = -ENOPROTOOPT;
  		break;
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index e5fe639..2b2bfe9 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -141,6 +141,15 @@ static ctl_table sctp_table[] = {
  		.extra2		= &int_max
  	},
  	{
+		.procname	= "pf_retrans",
+		.data		= &sctp_pf_retrans,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &int_max
+	},
+	{
  		.procname	= "max_init_retransmits",
  		.data		= &sctp_max_retrans_init,
  		.maxlen		= sizeof(int),
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index b026ba0..194d0f3 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -85,6 +85,7 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,

  	/* Initialize the default path max_retrans.  */
  	peer->pathmaxrxt  = sctp_max_retrans_path;
+	peer->pf_retrans  = sctp_pf_retrans;

  	INIT_LIST_HEAD(&peer->transmitted);
  	INIT_LIST_HEAD(&peer->send_ready);
@@ -585,7 +586,8 @@ unsigned long sctp_transport_timeout(struct sctp_transport *t)
  {
  	unsigned long timeout;
  	timeout = t->rto + sctp_jitter(t->rto);
-	if (t->state != SCTP_UNCONFIRMED)
+	if ((t->state != SCTP_UNCONFIRMED) &&
+	    (t->state != SCTP_PF))
  		timeout += t->hbinterval;
  	timeout += jiffies;
  	return timeout;



--
To unsubscribe from this list: send the line "unsubscribe linux-sctp" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html