Set SO_SNDBUF before SO_{SND,RCV}BUFFORCE: Unpriviledged namespace do not have CAP_NET_ADMIN on the host (user_init_ns) namespace. SO_SNDBUF always succeeds in Linux, always try SO_{SND,RCV}BUFFORCE after it. Moreover, suggest the user to bump socket limits if EMSGSIZE is hit. Provide a hint to the user too: # nft -f test.nft netlink: Error: Could not process rule: Message too long Please, rise /proc/sys/net/core/wmem_max on the host namespace. Hint: 4194304 bytes Dave Pfike says: Prior to this patch, nft inside a systemd-nspawn container was failing to install my ruleset (which includes a large-ish map), with the error netlink: Error: Could not process rule: Message too long strace reveals: setsockopt(3, SOL_SOCKET, SO_SNDBUFFORCE, [524288], 4) = -1 EPERM (Operation not permitted) This is despite the nspawn process supposedly having CAP_NET_ADMIN, and despite /proc/sys/net/core/wmem_max (in the main host namespace) being set larger than the requested size: net.core.wmem_max = 16777216 A web search reveals at least one other user having the same issue: https://old.reddit.com/r/Proxmox/comments/scnoav/lxc_container_debian_11_nftables_geoblocking/ Reported-by: Dave Pifke <dave@xxxxxxxxx> Signed-off-by: Pablo Neira Ayuso <pablo@xxxxxxxxxxxxx> --- include/netlink.h | 1 + include/utils.h | 1 + src/libnftables.c | 7 +++++++ src/mnl.c | 17 ++++++++++++----- src/utils.c | 12 ++++++++++++ 5 files changed, 33 insertions(+), 5 deletions(-) diff --git a/include/netlink.h b/include/netlink.h index 0d97f71ccff3..d52434c72bc2 100644 --- a/include/netlink.h +++ b/include/netlink.h @@ -84,6 +84,7 @@ struct netlink_ctx { const void *data; uint32_t seqnum; struct nftnl_batch *batch; + int maybe_emsgsize; }; extern struct nftnl_expr *alloc_nft_expr(const char *name); diff --git a/include/utils.h b/include/utils.h index ffbe2cbb75be..7782640888c9 100644 --- a/include/utils.h +++ b/include/utils.h @@ -136,5 +136,6 @@ extern void *xzalloc(size_t size); extern void *xzalloc_array(size_t nmemb, size_t size); extern char *xstrdup(const char *s); extern void xstrunescape(const char *in, char *out); +extern int round_pow_2(int value); #endif /* NFTABLES_UTILS_H */ diff --git a/src/libnftables.c b/src/libnftables.c index 4f538c44b998..de16d203a017 100644 --- a/src/libnftables.c +++ b/src/libnftables.c @@ -55,6 +55,13 @@ static int nft_netlink(struct nft_ctx *nft, ret = mnl_batch_talk(&ctx, &err_list, num_cmds); if (ret < 0) { + if (ctx.maybe_emsgsize && errno == EMSGSIZE) { + netlink_io_error(&ctx, NULL, + "Could not process rule: %s\n" + "Please, rise /proc/sys/net/core/wmem_max on the host namespace. Hint: %d bytes", + strerror(errno), round_pow_2(ctx.maybe_emsgsize)); + goto out; + } netlink_io_error(&ctx, NULL, "Could not process rule: %s", strerror(errno)); goto out; diff --git a/src/mnl.c b/src/mnl.c index 26f943dbb4c8..ce9e4ee1c059 100644 --- a/src/mnl.c +++ b/src/mnl.c @@ -245,9 +245,10 @@ void mnl_err_list_free(struct mnl_err *err) xfree(err); } -static void mnl_set_sndbuffer(const struct mnl_socket *nl, - struct nftnl_batch *batch) +static void mnl_set_sndbuffer(struct netlink_ctx *ctx) { + struct mnl_socket *nl = ctx->nft->nf_sock; + struct nftnl_batch *batch = ctx->batch; socklen_t len = sizeof(int); int sndnlbuffsiz = 0; int newbuffsiz; @@ -260,9 +261,15 @@ static void mnl_set_sndbuffer(const struct mnl_socket *nl, return; /* Rise sender buffer length to avoid hitting -EMSGSIZE */ + setsockopt(mnl_socket_get_fd(nl), SOL_SOCKET, SO_SNDBUF, + &newbuffsiz, sizeof(socklen_t)); + + /* unpriviledged containers check for CAP_NET_ADMIN on the init_user_ns. */ if (setsockopt(mnl_socket_get_fd(nl), SOL_SOCKET, SO_SNDBUFFORCE, - &newbuffsiz, sizeof(socklen_t)) < 0) - return; + &newbuffsiz, sizeof(socklen_t)) < 0) { + if (errno == EPERM) + ctx->maybe_emsgsize = newbuffsiz; + } } static unsigned int nlsndbufsiz; @@ -409,7 +416,7 @@ int mnl_batch_talk(struct netlink_ctx *ctx, struct list_head *err_list, .nl_ctx = ctx, }; - mnl_set_sndbuffer(ctx->nft->nf_sock, ctx->batch); + mnl_set_sndbuffer(ctx); mnl_nft_batch_to_msg(ctx, &msg, &snl, iov, iov_len); diff --git a/src/utils.c b/src/utils.c index 925841c571f5..c32ac220b127 100644 --- a/src/utils.c +++ b/src/utils.c @@ -100,3 +100,15 @@ void xstrunescape(const char *in, char *out) } out[k++] = '\0'; } + +int round_pow_2(int value) +{ + value--; + value |= value >> 1; + value |= value >> 2; + value |= value >> 4; + value |= value >> 8; + value |= value >> 16; + + return value + 1; +} -- 2.30.2