On Fri, Jan 08, 2021 at 01:02:22PM -0800, Stanislav Fomichev wrote: > When we attach a bpf program to cgroup/getsockopt any other getsockopt() > syscall starts incurring kzalloc/kfree cost. > > Let add a small buffer on the stack and use it for small (majority) > {s,g}etsockopt values. The buffer is small enough to fit into > the cache line and cover the majority of simple options (most > of them are 4 byte ints). > > It seems natural to do the same for setsockopt, but it's a bit more > involved when the BPF program modifies the data (where we have to > kmalloc). The assumption is that for the majority of setsockopt > calls (which are doing pure BPF options or apply policy) this > will bring some benefit as well. > > Without this patch (we remove about 1% __kmalloc): > 3.38% 0.07% tcp_mmap [kernel.kallsyms] [k] __cgroup_bpf_run_filter_getsockopt > | > --3.30%--__cgroup_bpf_run_filter_getsockopt > | > --0.81%--__kmalloc > > Signed-off-by: Stanislav Fomichev <sdf@xxxxxxxxxx> > Cc: Martin KaFai Lau <kafai@xxxxxx> > Cc: Song Liu <songliubraving@xxxxxx> > --- > include/linux/filter.h | 5 ++++ > kernel/bpf/cgroup.c | 52 ++++++++++++++++++++++++++++++++++++------ > 2 files changed, 50 insertions(+), 7 deletions(-) > > diff --git a/include/linux/filter.h b/include/linux/filter.h > index 29c27656165b..8739f1d4cac4 100644 > --- a/include/linux/filter.h > +++ b/include/linux/filter.h > @@ -1281,6 +1281,11 @@ struct bpf_sysctl_kern { > u64 tmp_reg; > }; > > +#define BPF_SOCKOPT_KERN_BUF_SIZE 32 > +struct bpf_sockopt_buf { > + u8 data[BPF_SOCKOPT_KERN_BUF_SIZE]; > +}; > + > struct bpf_sockopt_kern { > struct sock *sk; > u8 *optval; > diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c > index c41bb2f34013..a9aad9c419e1 100644 > --- a/kernel/bpf/cgroup.c > +++ b/kernel/bpf/cgroup.c > @@ -1298,7 +1298,8 @@ static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, > return empty; > } > > -static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) > 
+static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen, > + struct bpf_sockopt_buf *buf) > { > if (unlikely(max_optlen < 0)) > return -EINVAL; > @@ -1310,6 +1311,15 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) > max_optlen = PAGE_SIZE; > } > > + if (max_optlen <= sizeof(buf->data)) { > + /* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE > + * bytes avoid the cost of kzalloc. > + */ > + ctx->optval = buf->data; > + ctx->optval_end = ctx->optval + max_optlen; > + return max_optlen; > + } > + > ctx->optval = kzalloc(max_optlen, GFP_USER); > if (!ctx->optval) > return -ENOMEM; > @@ -1319,16 +1329,26 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) > return max_optlen; > } > > -static void sockopt_free_buf(struct bpf_sockopt_kern *ctx) > +static void sockopt_free_buf(struct bpf_sockopt_kern *ctx, > + struct bpf_sockopt_buf *buf) > { > + if (ctx->optval == buf->data) > + return; > kfree(ctx->optval); > } > > +static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx, > + struct bpf_sockopt_buf *buf) > +{ > + return ctx->optval != buf->data; > +} > + > int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, > int *optname, char __user *optval, > int *optlen, char **kernel_optval) > { > struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); > + struct bpf_sockopt_buf buf = {}; > struct bpf_sockopt_kern ctx = { > .sk = sk, > .level = *level, > @@ -1350,7 +1370,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, > */ > max_optlen = max_t(int, 16, *optlen); > > - max_optlen = sockopt_alloc_buf(&ctx, max_optlen); > + max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf); > if (max_optlen < 0) > return max_optlen; > > @@ -1390,13 +1410,30 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, > */ > if (ctx.optlen != 0) {
When ctx.optlen == 0, is sockopt_free_buf() ever called? The "out:" path below only frees the buffer when ret is non-zero, and ctx.optval is only handed back through *kernel_optval when ctx.optlen != 0. So if ret is 0 and ctx.optlen == 0, a kzalloc'ed buffer looks like it is neither freed nor passed to the caller, i.e. leaked. Did I miss something?
> *optlen = ctx.optlen; > - *kernel_optval = ctx.optval; > + /* We've used bpf_sockopt_kern->buf as an intermediary > + * storage, but the BPF program indicates that we need > + * to pass this data to the kernel setsockopt handler. > + * No way to export on-stack buf, have to allocate a > + * new buffer. > + */ > + if (!sockopt_buf_allocated(&ctx, &buf)) { > + void *p = kzalloc(ctx.optlen, GFP_USER); > + > + if (!p) { > + ret = -ENOMEM; > + goto out; > + } > + memcpy(p, ctx.optval, ctx.optlen); > + *kernel_optval = p; > + } else { > + *kernel_optval = ctx.optval; > + } > } > } > > out: > if (ret) > - sockopt_free_buf(&ctx); > + sockopt_free_buf(&ctx, &buf); > return ret; > } >