> -----Original Message----- > From: Florian Westphal <fw@xxxxxxxxx> > Sent: Wednesday, 4 January 2023 13:41 > To: Sriram Yagnaraman <sriram.yagnaraman@xxxxxxxx> > Cc: netfilter-devel@xxxxxxxxxxxxxxx; Florian Westphal <fw@xxxxxxxxx>; > Marcelo Ricardo Leitner <mleitner@xxxxxxxxxx>; Long Xin > <lxin@xxxxxxxxxx> > Subject: Re: [RFC PATCH] netfilter: conntrack: simplify sctp state machine > > Sriram Yagnaraman <sriram.yagnaraman@xxxxxxxx> wrote: > > All the paths in an SCTP connection are kept alive either by actual > > DATA/SACK running through the connection or by HEARTBEAT. This patch > > proposes a simple state machine with only two states OPEN_WAIT and > > ESTABLISHED (similar to UDP). The reason for this change is a full > > stateful approach to SCTP is difficult when the association is > > multihomed since the endpoints could use different paths in the > > network during the lifetime of an association. > > > > Default timeouts are: > > OPEN_WAIT: 3 seconds (rto_initial) > > ESTABLISHED: 210 seconds (rto_max + hb_interval * path_max_retrans) > > > > Important changes/notes > > - Timeout is used to clean up conntrack entries > > - VTAG checks are kept as is (can be moved to a conntrack extension if > > desired) > > - SCTP chunks are parsed only once, and a map is populated with the > > information on the chunks present in the packet > > - ASSURED bit is NOT set in this version of the patch, need help > > understanding when to set it > > > > Note that this patch has changed uapi headers. > > Don't do that please, this will cause trouble. Yes, I understand, will remove the changes and reuse COOKIE_WAIT value for OPEN_WAIT. > > > Signed-off-by: Sriram Yagnaraman <sriram.yagnaraman@xxxxxxxx> > > --- > > .../uapi/linux/netfilter/nf_conntrack_sctp.h | 10 +- > > .../linux/netfilter/nfnetlink_cttimeout.h | 10 +- > > net/netfilter/nf_conntrack_proto_sctp.c | 589 ++++-------------- > > net/netfilter/nf_conntrack_standalone.c | 72 +-- > > 4 files changed, 143 insertions(+), 538 deletions(-) > > > > diff --git a/include/uapi/linux/netfilter/nf_conntrack_sctp.h > > b/include/uapi/linux/netfilter/nf_conntrack_sctp.h > > index c742469afe21..89381a57021a 100644 > > --- a/include/uapi/linux/netfilter/nf_conntrack_sctp.h > > +++ b/include/uapi/linux/netfilter/nf_conntrack_sctp.h > > @@ -7,16 +7,8 @@ > > > > enum sctp_conntrack { > > SCTP_CONNTRACK_NONE, > > - SCTP_CONNTRACK_CLOSED, > > - SCTP_CONNTRACK_COOKIE_WAIT, > > - SCTP_CONNTRACK_COOKIE_ECHOED, > > + SCTP_CONNTRACK_OPEN_WAIT, > > SCTP_CONNTRACK_ESTABLISHED, > > - SCTP_CONNTRACK_SHUTDOWN_SENT, > > - SCTP_CONNTRACK_SHUTDOWN_RECD, > > - SCTP_CONNTRACK_SHUTDOWN_ACK_SENT, > > - SCTP_CONNTRACK_HEARTBEAT_SENT, > > - SCTP_CONNTRACK_HEARTBEAT_ACKED, > > - SCTP_CONNTRACK_DATA_SENT, > > SCTP_CONNTRACK_MAX > > Please keep all as-is. > > You might want to add a /* no loner used */ or similar. > > You could hijack an existing enum to avoid adding a new one: > > SCTP_CONNTRACK_OPEN_WAIT = SCTP_CONNTRACK_COOKIE_WAIT, > > > diff --git a/include/uapi/linux/netfilter/nfnetlink_cttimeout.h > > b/include/uapi/linux/netfilter/nfnetlink_cttimeout.h > > index 94e74034706d..372dfe7c07ed 100644 > > --- a/include/uapi/linux/netfilter/nfnetlink_cttimeout.h > > +++ b/include/uapi/linux/netfilter/nfnetlink_cttimeout.h > > @@ -86,16 +86,8 @@ enum ctattr_timeout_dccp { > > > > enum ctattr_timeout_sctp { > > CTA_TIMEOUT_SCTP_UNSPEC, > > - CTA_TIMEOUT_SCTP_CLOSED, > > - CTA_TIMEOUT_SCTP_COOKIE_WAIT, > > - CTA_TIMEOUT_SCTP_COOKIE_ECHOED, > > + CTA_TIMEOUT_SCTP_OPEN_WAIT, > > CTA_TIMEOUT_SCTP_ESTABLISHED, > > - CTA_TIMEOUT_SCTP_SHUTDOWN_SENT, > > - CTA_TIMEOUT_SCTP_SHUTDOWN_RECD, > > - CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT, > > - CTA_TIMEOUT_SCTP_HEARTBEAT_SENT, > > - CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED, > > - CTA_TIMEOUT_SCTP_DATA_SENT, > > __CTA_TIMEOUT_SCTP_MAX > > Same, this is frozen, you can add to it but you cannot remove this. > > You can add a kernel internal enum if you like, to replace the existing ones, > with kernel mapping the new ones to old (and ignoring the old ones on input > from userspace). > > This would allow to shrink struct nf_sctp_net size for example. > > > #define CTA_TIMEOUT_SCTP_MAX (__CTA_TIMEOUT_SCTP_MAX - 1) diff -- > git > > a/net/netfilter/nf_conntrack_proto_sctp.c > > b/net/netfilter/nf_conntrack_proto_sctp.c > > index d88b92a8ffca..d79ed476b764 100644 > > --- a/net/netfilter/nf_conntrack_proto_sctp.c > > +++ b/net/netfilter/nf_conntrack_proto_sctp.c > > @@ -5,12 +5,13 @@ > > * Copyright (c) 2004 Kiran Kumar Immidi <immidi_kiran@xxxxxxxxx> > > * Copyright (c) 2004-2012 Patrick McHardy <kaber@xxxxxxxxx> > > * > > - * SCTP is defined in RFC 2960. References to various sections in > > this code > > + * SCTP is defined in RFC 4960. References to various sections in > > + this code > > * are to this RFC. > > */ > > > > #include <linux/types.h> > > #include <linux/timer.h> > > +#include <linux/jiffies.h> > > #include <linux/netfilter.h> > > #include <linux/in.h> > > #include <linux/ip.h> > > @@ -27,127 +28,19 @@ > > #include <net/netfilter/nf_conntrack_ecache.h> > > #include <net/netfilter/nf_conntrack_timeout.h> > > > > -/* FIXME: Examine ipfilter's timeouts and conntrack transitions more > > - closely. They're more complex. --RR > > - > > - And so for me for SCTP :D -Kiran */ > > +#define SCTP_FLAG_HEARTBEAT_VTAG_FAILED 1 > > > > static const char *const sctp_conntrack_names[] = { > > "NONE", > > - "CLOSED", > > - "COOKIE_WAIT", > > - "COOKIE_ECHOED", > > + "OPEN_WAIT", > > "ESTABLISHED", > > - "SHUTDOWN_SENT", > > - "SHUTDOWN_RECD", > > - "SHUTDOWN_ACK_SENT", > > - "HEARTBEAT_SENT", > > - "HEARTBEAT_ACKED", > > }; > > > - } > > + [SCTP_CONNTRACK_OPEN_WAIT] = 3 SECS, > > + [SCTP_CONNTRACK_ESTABLISHED] = 210 SECS, > > }; > > > > > +#define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) \ > > +for ((offset) = (dataoff) + sizeof(struct sctphdr), (count) = 0; \ > > + (offset) < (skb)->len && \ > > I think skb_header_pointer() will return NULL if offset + sizeof(_sch) exceeds > skb->len, so this offset < skb->len test is redundant. > > > + (offset) += (ntohs((sch)->length) + 3) & ~3, (count)++) > > What if sch->length == 0? Oops, infinite loop. I will fix it. > > > + for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { > > + pr_debug("Chunk Num: %d Type: %d\n", count, sch->type); > > Is this pr_debug() needed? Its pretty useless because it would print for every > packet (its not an error path). Removed now. > > > + set_bit(sch->type, map); > > > > - if (do_basic_checks(ct, skb, dataoff, map) != 0) > > - goto out; > > + if (sch->type == SCTP_CID_INIT || > > + sch->type == SCTP_CID_INIT_ACK) { > > + struct sctp_inithdr _inith, *inith; > > + inith = skb_header_pointer(skb, offset + sizeof(_sch), > > + sizeof(_inith), &_inith); > > + if (inith) > > + init_vtag = inith->init_tag; > > + else > > + goto out_drop; > > if (!inith) > goto out_drop; > > init_vtag = inith->init_tag; > > Also, please run your patch through scripts/checkpatch.pl script, I'm sure > there are several coding style warnings here. > > > + spin_lock_bh(&ct->lock); > > Why is this spinlock needed? I have reworked the code a bit now, and will use the spinlock only for the parts that write to ct. > > > if (!nf_ct_is_confirmed(ct)) { > > /* If an OOTB packet has any of these chunks discard (Sec 8.4) > */ > > if (test_bit(SCTP_CID_ABORT, map) || > > test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) || > > test_bit(SCTP_CID_COOKIE_ACK, map)) > > - return -NF_ACCEPT; > > + goto out_unlock; > > > > - if (!sctp_new(ct, skb, sh, dataoff)) > > - return -NF_ACCEPT; > > Any reason for deleting sctp_new()? > It makes this body a lot larger, the lines below could have been done in > sctp_new(). I will bring back sctp_new() and move vtag checks into its own function sctp_vtag_check(). > > > + memset(&ct->proto.sctp, 0, sizeof(ct->proto.sctp)); > > + ct->proto.sctp.state = SCTP_CONNTRACK_OPEN_WAIT; > > + nf_conntrack_event_cache(IPCT_PROTOINFO, ct); > > + > > + if (test_bit(SCTP_CID_INIT, map)) > > + ct->proto.sctp.vtag[!dir] = init_vtag; > > + else if (test_bit(SCTP_CID_SHUTDOWN_ACK, map)) > > + /* If it is a shutdown ack OOTB packet, we expect a > return > > + shutdown complete, otherwise an ABORT Sec 8.4 (5) > and (8) */ > > + ct->proto.sctp.vtag[!dir] = sctph->vtag; > > + else > > + ct->proto.sctp.vtag[dir] = sctph->vtag; > > Maybe the else branch below can be elided by adding a goto here? > > AFAICS the spinlock is only needed for some parts of the else branch, so the > spin_lock_bh can be moved. > > + /* we have seen traffic both ways, go to established */ > > + if (dir == IP_CT_DIR_REPLY && > > + ct->proto.sctp.state == > SCTP_CONNTRACK_OPEN_WAIT) { > > + ct->proto.sctp.state = > SCTP_CONNTRACK_ESTABLISHED; > > + nf_conntrack_event_cache(IPCT_PROTOINFO, ct); > > > + /* Check the verification tag (Sec 8.5) */ > > + if (!test_bit(SCTP_CID_INIT, map) && > > + !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) && > > + !test_bit(SCTP_CID_COOKIE_ECHO, map) && > > + !test_bit(SCTP_CID_ABORT, map) && > > + !test_bit(SCTP_CID_SHUTDOWN_ACK, map) && > > + !test_bit(SCTP_CID_HEARTBEAT, map) && > > + !test_bit(SCTP_CID_HEARTBEAT_ACK, map) && > > + sctph->vtag != ct->proto.sctp.vtag[dir]) { > > + pr_debug("Verification tag check failed\n"); > > Please have a look at > https://patchwork.ozlabs.org/project/netfilter- > devel/patch/20230102114612.15860-2-fw@xxxxxxxxx/ > > I hope it will be applied shortly so you can rebase. > I don't have any other sctp patches. > > This should be > nf_ct_l4proto_log_invalid(skb, ct, state, > "verification tag check failed %x vs %x for dir %d", > sh->vtag, ct->proto.sctp.vtag[dir], dir); > > instead of pr_debug(). Yes, will do. Will also remove other pr_debug() if any. > > > + /* Special cases of Verification tag check (Sec 8.5.1) */ > > Please extend the comments a bit so I don't have to look at the RFC while > reviewing, just quote the relevant part, i.e. > > > + if (test_bit(SCTP_CID_INIT, map)) { > > + /* Sec 8.5.1 (A) */ > > + if (sctph->vtag != 0) > > goto out_unlock; > > - } > > if (sctph->vtag != 0) /* A) init vtag MUST be 0 */ > goto out_unlock; > ö > > + else if (nf_ct_is_confirmed(ct)) > > No need to 'else if', just use 'if'. > > > + /* Need some thought on how to set the assured bit */ > > + // if (dir == IP_CT_DIR_REPLY && > > + // !(test_bit(IPS_ASSURED_BIT, &ct->status))) { > > + // set_bit(IPS_ASSURED_BIT, &ct->status); > > + // nf_conntrack_event_cache(IPCT_ASSURED, ct); > > Probably do a test_and_set_bit() when the connection switches to > ESTABLISHED? Ok, makes sense. > > > sctp_timeout_nla_policy[CTA_TIMEOUT_SCTP_MAX+1] = { > > - [CTA_TIMEOUT_SCTP_CLOSED] = { .type = NLA_U32 }, > > - [CTA_TIMEOUT_SCTP_COOKIE_WAIT] = { .type = NLA_U32 }, > > - [CTA_TIMEOUT_SCTP_COOKIE_ECHOED] = { .type = NLA_U32 }, > > + [CTA_TIMEOUT_SCTP_OPEN_WAIT] = { .type = NLA_U32 }, > > [CTA_TIMEOUT_SCTP_ESTABLISHED] = { .type = NLA_U32 }, > > - [CTA_TIMEOUT_SCTP_SHUTDOWN_SENT] = { .type = NLA_U32 }, > > - [CTA_TIMEOUT_SCTP_SHUTDOWN_RECD] = { .type = NLA_U32 }, > > - [CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT] = { .type = NLA_U32 }, > > - [CTA_TIMEOUT_SCTP_HEARTBEAT_SENT] = { .type = NLA_U32 }, > > - [CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED] = { .type = NLA_U32 }, > > - [CTA_TIMEOUT_SCTP_DATA_SENT] = { .type = NLA_U32 }, > > Please retain this as-is for now. > > I'm fine with removing the sysctls though.