On Tue, Sep 10, 2019 at 12:52:42PM +0200, Florian Westphal wrote: > Pablo Neira Ayuso <pablo@xxxxxxxxxxxxx> wrote: > > On Tue, Sep 10, 2019 at 12:19:18AM +0200, Florian Westphal wrote: > > [...] > > > diff --git a/src/mnl.c b/src/mnl.c > > > index 9c1f5356c9b9..d664564e16af 100644 > > > --- a/src/mnl.c > > > +++ b/src/mnl.c > > > @@ -311,8 +311,6 @@ int mnl_batch_talk(struct netlink_ctx *ctx, struct list_head *err_list, > > > int ret, fd = mnl_socket_get_fd(nl), portid = mnl_socket_get_portid(nl); > > > uint32_t iov_len = nftnl_batch_iovec_len(ctx->batch); > > > char rcv_buf[MNL_SOCKET_BUFFER_SIZE]; > > > - unsigned int enobuf_restarts = 0; > > > - size_t avg_msg_size, batch_size; > > > const struct sockaddr_nl snl = { > > > .nl_family = AF_NETLINK > > > }; > > > @@ -321,17 +319,22 @@ int mnl_batch_talk(struct netlink_ctx *ctx, struct list_head *err_list, > > > .tv_usec = 0 > > > }; > > > struct iovec iov[iov_len]; > > > - unsigned int scale = 4; > > > struct msghdr msg = {}; > > > fd_set readfds; > > > > > > mnl_set_sndbuffer(ctx->nft->nf_sock, ctx->batch); > > > > > > - batch_size = mnl_nft_batch_to_msg(ctx, &msg, &snl, iov, iov_len); > > > - avg_msg_size = div_round_up(batch_size, num_cmds); > > > + mnl_nft_batch_to_msg(ctx, &msg, &snl, iov, iov_len); > > > > > > -restart: > > > - mnl_set_rcvbuffer(ctx->nft->nf_sock, num_cmds * avg_msg_size * scale); > > > + if (nft_output_echo(&ctx->nft->output)) { > > > + size_t buffer_size = MNL_SOCKET_BUFFER_SIZE * 1024; > > > + size_t new_buffer_size = num_cmds * 1024; > > > > Probably all simplify this to? > > > > mnl_set_rcvbuffer(ctx->nft->nf_sock, (1 << 10) * num_cmds); > > Reason for above patch was to avoid any risk for normal operations by > restricting the recvbuffer tuning to echo-mode and also adding a > lower thresh. > > For some reason I don't like the idea of setting only 1k recvbuf by > default in the extreme case. 
I'd still like to keep setting the receive buffer for the non-echo case: a ruleset with lots of acknowledgments (lots of errors) might hit ENOBUFS; I remember that was reproducible. Probably this? It's based on your patch.
diff --git a/src/mnl.c b/src/mnl.c index 9c1f5356c9b9..8031bd6add80 100644 --- a/src/mnl.c +++ b/src/mnl.c @@ -304,6 +304,8 @@ static ssize_t mnl_nft_socket_sendmsg(struct netlink_ctx *ctx, return sendmsg(mnl_socket_get_fd(ctx->nft->nf_sock), msg, 0); } +#define NFT_MNL_ECHO_RCVBUFF_DEFAULT (MNL_SOCKET_BUFFER_SIZE * 1024) + int mnl_batch_talk(struct netlink_ctx *ctx, struct list_head *err_list, uint32_t num_cmds) { @@ -311,8 +313,6 @@ int mnl_batch_talk(struct netlink_ctx *ctx, struct list_head *err_list, int ret, fd = mnl_socket_get_fd(nl), portid = mnl_socket_get_portid(nl); uint32_t iov_len = nftnl_batch_iovec_len(ctx->batch); char rcv_buf[MNL_SOCKET_BUFFER_SIZE]; - unsigned int enobuf_restarts = 0; - size_t avg_msg_size, batch_size; const struct sockaddr_nl snl = { .nl_family = AF_NETLINK }; @@ -321,17 +321,24 @@ int mnl_batch_talk(struct netlink_ctx *ctx, struct list_head *err_list, .tv_usec = 0 }; struct iovec iov[iov_len]; - unsigned int scale = 4; struct msghdr msg = {}; + unsigned int rcvbufsiz; + size_t batch_size; fd_set readfds; mnl_set_sndbuffer(ctx->nft->nf_sock, ctx->batch); batch_size = mnl_nft_batch_to_msg(ctx, &msg, &snl, iov, iov_len); - avg_msg_size = div_round_up(batch_size, num_cmds); -restart: - mnl_set_rcvbuffer(ctx->nft->nf_sock, num_cmds * avg_msg_size * scale); + if (nft_output_echo(&ctx->nft->output)) { + rcvbufsiz = num_cmds * 1024; + if (rcvbufsiz < NFT_MNL_ECHO_RCVBUFF_DEFAULT) + rcvbufsiz = NFT_MNL_ECHO_RCVBUFF_DEFAULT; + } else { + rcvbufsiz = num_cmds * div_round_up(batch_size, num_cmds) * 4; + } + + mnl_set_rcvbuffer(ctx->nft->nf_sock, rcvbufsiz); ret = mnl_nft_socket_sendmsg(ctx, &msg); if (ret == -1) @@ -350,13 +357,8 @@ restart: break; ret = mnl_socket_recvfrom(nl, rcv_buf, sizeof(rcv_buf)); - if (ret == -1) { - if (errno == ENOBUFS && enobuf_restarts++ < 3) { - scale *= 2; - goto restart; - } + if (ret == -1) return -1; - } ret = mnl_cb_run(rcv_buf, ret, 0, portid, &netlink_echo_callback, ctx); /* Continue on error, make sure 
we get all acknowledgments */