Re: [PATCH nft] src: mnl: fix --echo buffer size -- again

Pablo Neira Ayuso <pablo@xxxxxxxxxxxxx> · Tue, 10 Sep 2019 13:22:54 +0200

On Tue, Sep 10, 2019 at 12:52:42PM +0200, Florian Westphal wrote:
> Pablo Neira Ayuso <pablo@xxxxxxxxxxxxx> wrote:
> > On Tue, Sep 10, 2019 at 12:19:18AM +0200, Florian Westphal wrote:
> > [...]
> > > diff --git a/src/mnl.c b/src/mnl.c
> > > index 9c1f5356c9b9..d664564e16af 100644
> > > --- a/src/mnl.c
> > > +++ b/src/mnl.c
> > > @@ -311,8 +311,6 @@ int mnl_batch_talk(struct netlink_ctx *ctx, struct list_head *err_list,
> > >  	int ret, fd = mnl_socket_get_fd(nl), portid = mnl_socket_get_portid(nl);
> > >  	uint32_t iov_len = nftnl_batch_iovec_len(ctx->batch);
> > >  	char rcv_buf[MNL_SOCKET_BUFFER_SIZE];
> > > -	unsigned int enobuf_restarts = 0;
> > > -	size_t avg_msg_size, batch_size;
> > >  	const struct sockaddr_nl snl = {
> > >  		.nl_family = AF_NETLINK
> > >  	};
> > > @@ -321,17 +319,22 @@ int mnl_batch_talk(struct netlink_ctx *ctx, struct list_head *err_list,
> > >  		.tv_usec	= 0
> > >  	};
> > >  	struct iovec iov[iov_len];
> > > -	unsigned int scale = 4;
> > >  	struct msghdr msg = {};
> > >  	fd_set readfds;
> > >  
> > >  	mnl_set_sndbuffer(ctx->nft->nf_sock, ctx->batch);
> > >  
> > > -	batch_size = mnl_nft_batch_to_msg(ctx, &msg, &snl, iov, iov_len);
> > > -	avg_msg_size = div_round_up(batch_size, num_cmds);
> > > +	mnl_nft_batch_to_msg(ctx, &msg, &snl, iov, iov_len);
> > >  
> > > -restart:
> > > -	mnl_set_rcvbuffer(ctx->nft->nf_sock, num_cmds * avg_msg_size * scale);
> > > +	if (nft_output_echo(&ctx->nft->output)) {
> > > +		size_t buffer_size = MNL_SOCKET_BUFFER_SIZE * 1024;
> > > +		size_t new_buffer_size = num_cmds * 1024;
> > 
> > Probably all simplify this to?
> > 
> > 		mnl_set_rcvbuffer(ctx->nft->nf_sock, (1 << 10) * num_cmds);
> 
> Reason for above patch was to avoid any risk for normal operations by
> restricting the recvbuffer tuning to echo-mode and also adding a
> lower thresh.
> 
> For some reason I don't like the idea of setting only 1k recvbuf by
> default in the extreme case.

I'd still like to keep setting the receive buffer for the non-echo
case: a ruleset with lots of acknowledgments (lots of errors) might hit
ENOBUFS; I remember that was reproducible.

Probably this? It's based on your patch.

diff --git a/src/mnl.c b/src/mnl.c
index 9c1f5356c9b9..8031bd6add80 100644
--- a/src/mnl.c
+++ b/src/mnl.c
@@ -304,6 +304,8 @@ static ssize_t mnl_nft_socket_sendmsg(struct netlink_ctx *ctx,
 	return sendmsg(mnl_socket_get_fd(ctx->nft->nf_sock), msg, 0);
 }
 
+#define NFT_MNL_ECHO_RCVBUFF_DEFAULT	(MNL_SOCKET_BUFFER_SIZE * 1024)
+
 int mnl_batch_talk(struct netlink_ctx *ctx, struct list_head *err_list,
 		   uint32_t num_cmds)
 {
@@ -311,8 +313,6 @@ int mnl_batch_talk(struct netlink_ctx *ctx, struct list_head *err_list,
 	int ret, fd = mnl_socket_get_fd(nl), portid = mnl_socket_get_portid(nl);
 	uint32_t iov_len = nftnl_batch_iovec_len(ctx->batch);
 	char rcv_buf[MNL_SOCKET_BUFFER_SIZE];
-	unsigned int enobuf_restarts = 0;
-	size_t avg_msg_size, batch_size;
 	const struct sockaddr_nl snl = {
 		.nl_family = AF_NETLINK
 	};
@@ -321,17 +321,24 @@ int mnl_batch_talk(struct netlink_ctx *ctx, struct list_head *err_list,
 		.tv_usec	= 0
 	};
 	struct iovec iov[iov_len];
-	unsigned int scale = 4;
 	struct msghdr msg = {};
+	unsigned int rcvbufsiz;
+	size_t batch_size;
 	fd_set readfds;
 
 	mnl_set_sndbuffer(ctx->nft->nf_sock, ctx->batch);
 
 	batch_size = mnl_nft_batch_to_msg(ctx, &msg, &snl, iov, iov_len);
-	avg_msg_size = div_round_up(batch_size, num_cmds);
 
-restart:
-	mnl_set_rcvbuffer(ctx->nft->nf_sock, num_cmds * avg_msg_size * scale);
+	if (nft_output_echo(&ctx->nft->output)) {
+		rcvbufsiz = num_cmds * 1024;
+		if (rcvbufsiz < NFT_MNL_ECHO_RCVBUFF_DEFAULT)
+			rcvbufsiz = NFT_MNL_ECHO_RCVBUFF_DEFAULT;
+	} else {
+		rcvbufsiz = num_cmds * div_round_up(batch_size, num_cmds) * 4;
+	}
+
+	mnl_set_rcvbuffer(ctx->nft->nf_sock, rcvbufsiz);
 
 	ret = mnl_nft_socket_sendmsg(ctx, &msg);
 	if (ret == -1)
@@ -350,13 +357,8 @@ restart:
 			break;
 
 		ret = mnl_socket_recvfrom(nl, rcv_buf, sizeof(rcv_buf));
-		if (ret == -1) {
-			if (errno == ENOBUFS && enobuf_restarts++ < 3) {
-				scale *= 2;
-				goto restart;
-			}
+		if (ret == -1)
 			return -1;
-		}
 
 		ret = mnl_cb_run(rcv_buf, ret, 0, portid, &netlink_echo_callback, ctx);
 		/* Continue on error, make sure we get all acknowledgments */