Re: [RFC bpf-next v3 5/5] selftests/bpf: Add test cases for sleepable BPF programs of the CGROUP_SOCKOPT type

Kui-Feng Lee <sinquersw@xxxxxxxxx> · Thu, 17 Aug 2023 11:36:30 -0700






On 8/16/23 18:13, Martin KaFai Lau wrote:
On 8/15/23 5:03 PM, Kui-Feng Lee wrote:
+SEC("cgroup/getsockopt.s")
+int _getsockopt_s(struct bpf_sockopt *ctx)
+{
+    struct tcp_zerocopy_receive *zcvr;
+    struct bpf_dynptr optval_dynptr;
+    struct sockopt_sk *storage;
+    __u8 *optval, *optval_end;
+    struct bpf_sock *sk;
+    char buf[1];
+    __u64 addr;
+    int ret;
+
+    if (skip_sleepable)
+        return 1;
+
+    /* Bypass AF_NETLINK. */
+    sk = ctx->sk;
+    if (sk && sk->family == AF_NETLINK)
+        return 1;
+
+    optval = ctx->optval;
+    optval_end = ctx->optval_end;
+
+    /* Make sure bpf_get_netns_cookie is callable.
+     */
+    if (bpf_get_netns_cookie(NULL) == 0)
+        return 0;
+
+    if (bpf_get_netns_cookie(ctx) == 0)
+        return 0;
+
+    if (ctx->level == SOL_IP && ctx->optname == IP_TOS) {
+        /* Not interested in SOL_IP:IP_TOS;
+         * let next BPF program in the cgroup chain or kernel
+         * handle it.
+         */
+        return 1;
+    }
+
+    if (ctx->level == SOL_SOCKET && ctx->optname == SO_SNDBUF) {
+        /* Not interested in SOL_SOCKET:SO_SNDBUF;
+         * let next BPF program in the cgroup chain or kernel
+         * handle it.
+         */
+        return 1;
+    }
+
+    if (ctx->level == SOL_TCP && ctx->optname == TCP_CONGESTION) {
+        /* Not interested in SOL_TCP:TCP_CONGESTION;
+         * let next BPF program in the cgroup chain or kernel
+         * handle it.
+         */
+        return 1;
+    }
+
+    if (ctx->level == SOL_TCP && ctx->optname == 
TCP_ZEROCOPY_RECEIVE) {
+        /* Verify that TCP_ZEROCOPY_RECEIVE triggers.
+         * It has a custom implementation for performance
+         * reasons.
+         */
+
+        bpf_sockopt_dynptr_from(ctx, &optval_dynptr, sizeof(*zcvr));
+        zcvr = bpf_dynptr_data(&optval_dynptr, 0, sizeof(*zcvr));
+        addr = zcvr ? zcvr->address : 0;
+        bpf_sockopt_dynptr_release(ctx, &optval_dynptr);

This starts to look more usable, thank you for the changes!
Let me poke the api a bit more, I'm not super familiar with the 
dynptrs.

here: bpf_sockopt_dynptr_from should probably be called
bpf_dynptr_from_sockopt to match bpf_dynptr_from_mem?

agree!


+
+        return addr != 0 ? 0 : 1;
+    }
+
+    if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) {
+        if (optval + 1 > optval_end)
+            return 0; /* bounds check */
+
+        ctx->retval = 0; /* Reset system call return value to zero */
+
+        /* Always export 0x55 */
+        buf[0] = 0x55;
+        ret = bpf_sockopt_dynptr_alloc(ctx, 1, &optval_dynptr);
+        if (ret >= 0) {
+            bpf_dynptr_write(&optval_dynptr, 0, buf, 1, 0);
+            ret = bpf_sockopt_dynptr_copy_to(ctx, &optval_dynptr);
+        }
+        bpf_sockopt_dynptr_release(ctx, &optval_dynptr);

Does bpf_sockopt_dynptr_alloc and bpf_sockopt_dynptr_release need to be
sockopt specific? Seems like we should provide, instead, some generic
bpf_dynptr_alloc/bpf_dynptr_release and make
bpf_sockopt_dynptr_copy_to/install work with them? WDYT?

I found that kmalloc can not be called in the context of nmi, slab or
page alloc path. It is why we don't have functions like
bpf_dynptr_alloc/bpf_dynptr_release yet. That means we need someone
to implement an allocator for BPF programs. And, then, we can have
bpf_dynptr_alloc unpon it. (There is an implementation of
bpf_dynptr_alloc() in the early versions of the patchset of dynptr.
But, be removed before landing.)



+        if (ret < 0)
+            return 0;
+        ctx->optlen = 1;
+
+        /* Userspace buffer is PAGE_SIZE * 2, but BPF
+         * program can only see the first PAGE_SIZE
+         * bytes of data.
+         */
+        if (optval_end - optval != page_size && 0)
+            return 0; /* unexpected data size */
+
+        return 1;
+    }
+
+    if (ctx->level != SOL_CUSTOM)
+        return 0; /* deny everything except custom level */
+
+    if (optval + 1 > optval_end)
+        return 0; /* bounds check */
+
+    storage = bpf_sk_storage_get(&socket_storage_map, ctx->sk, 0,
+                     BPF_SK_STORAGE_GET_F_CREATE);
+    if (!storage)
+        return 0; /* couldn't get sk storage */
+
+    if (!ctx->retval)
+        return 0; /* kernel should not have handled
+               * SOL_CUSTOM, something is wrong!
+               */
+    ctx->retval = 0; /* Reset system call return value to zero */
+
+    buf[0] = storage->val;
+    ret = bpf_sockopt_dynptr_alloc(ctx, 1, &optval_dynptr);
+    if (ret >= 0) {
+        bpf_dynptr_write(&optval_dynptr, 0, buf, 1, 0);
+        ret = bpf_sockopt_dynptr_copy_to(ctx, &optval_dynptr);
+    }
+    bpf_sockopt_dynptr_release(ctx, &optval_dynptr);
+    if (ret < 0)
+        return 0;
+    ctx->optlen = 1;
+
+    return 1;
+}
+
  SEC("cgroup/setsockopt")
  int _setsockopt(struct bpf_sockopt *ctx)
  {
@@ -144,6 +281,9 @@ int _setsockopt(struct bpf_sockopt *ctx)
      struct sockopt_sk *storage;
      struct bpf_sock *sk;
+    if (skip_nonsleepable)
+        return 1;
+
      /* Bypass AF_NETLINK. */
      sk = ctx->sk;
      if (sk && sk->family == AF_NETLINK)
@@ -236,3 +376,120 @@ int _setsockopt(struct bpf_sockopt *ctx)
          ctx->optlen = 0;
      return 1;
  }
+
+SEC("cgroup/setsockopt.s")
+int _setsockopt_s(struct bpf_sockopt *ctx)
+{
+    struct bpf_dynptr optval_buf;
+    struct sockopt_sk *storage;
+    __u8 *optval, *optval_end;
+    struct bpf_sock *sk;
+    __u8 tmp_u8;
+    __u32 tmp;
+    int ret;
+
+    if (skip_sleepable)
+        return 1;
+
+    optval = ctx->optval;
+    optval_end = ctx->optval_end;
+
+    /* Bypass AF_NETLINK. */
+    sk = ctx->sk;
+    if (sk && sk->family == AF_NETLINK)
+        return -1;
+
+    /* Make sure bpf_get_netns_cookie is callable.
+     */
+    if (bpf_get_netns_cookie(NULL) == 0)
+        return 0;
+
+    if (bpf_get_netns_cookie(ctx) == 0)
+        return 0;
+
+    if (ctx->level == SOL_IP && ctx->optname == IP_TOS) {
+        /* Not interested in SOL_IP:IP_TOS;
+         * let next BPF program in the cgroup chain or kernel
+         * handle it.
+         */
+        ctx->optlen = 0; /* bypass optval>PAGE_SIZE */
+        return 1;
+    }
+
+    if (ctx->level == SOL_SOCKET && ctx->optname == SO_SNDBUF) {
+        /* Overwrite SO_SNDBUF value */
+
+        ret = bpf_sockopt_dynptr_alloc(ctx, sizeof(__u32),
+                           &optval_buf);
+        if (ret < 0)
+            bpf_sockopt_dynptr_release(ctx, &optval_buf);
+        else {
+            tmp = 0x55AA;
+            bpf_dynptr_write(&optval_buf, 0, &tmp, sizeof(tmp), 0);
+            ret = bpf_sockopt_dynptr_install(ctx, &optval_buf);

One thing I'm still slightly confused about is
bpf_sockopt_dynptr_install vs bpf_sockopt_dynptr_copy_to. I do
understand that it comes from getsockopt vs setsockopt (and the 
ability,
in setsockopt, to allocate larger buffers), but I wonder whether
we can hide everything under single bpf_sockopt_dynptr_copy_to call?

For getsockopt, it stays as is. For setsockopt, it would work like
_install currently does. Would that work? From user perspective,
if we provide a simple call that does the right thing, seems a bit
more usable? The only problem is probably the fact that _install
explicitly moves the ownership, but I don't see why copy_to can't
have the same "consume" semantics?

Sorry for missing this part!
This overloading is counterintuitive for me.
*_copy_to() will not copy/overwrite the buffer, but replace the buffer
instead. And, it will change its side-effects according to its context.
I would prefer a different name instead of reusing *_copy_to().

We probably need a better name, instead of copy_to, in order to merge
these two functions if we want to.

It also took me some time to realize the alloc/install/copy_to/release 
usages. iiuc, to change optval in getsockopt, it is 
alloc=>write=>copy_to=>release.
       to change optval in setsockopt, it is alloc=>write=>install.

Can this alloc and user-or-kernel memory details be done in the 
bpf_dynptr_write() instead such that both BPF_CGROUP_GETSOCKOPT and 
BPF_CGROUP_SETSOCKOPT program can just call one bpf_dynptr_write() to 
update the optval? I meant bpf_dynptr_write() should have the needed 
context to decide if it can directly write to the __user ptr or it needs 
to write to a kernel memory and if kmalloc is needed/allowed.

For v3, you can just call  bpf_dynptr_write() to update the buffer
without installing it if optval is already a in kernel buffer.
But, you can still call *_install() even if it is unnecessary.
The code in the test case try to make it less complicated. So,
it always call *_install() without any check.

Same for reading. bpf_dynptr_read() should know it should read from user 
or kernel memory. bpf_dynptr_data() may not work well if the underlying 
sockopt memory is still backed by user memory, so probably don't support 
bpf_dynptr_data() for sockopt. Support the bpf_dynptr_slice() and 
bpf_dynptr_slice_rdwr() instead. Take a look at how the "buffer__opt" 
arg is used by the xdp dynptr. If the underlying sockopt is still backed 
by user memory, the bpf_dynptr_slice() can do a copy_from_user to the 
"buffer__opt".


bpf_dynptr_from_cgrp_sockopt() should be the only function to initalize 
a dynptr from the 'struct bpf_sockopt *ctx'. Probably don't need to 
allocate any memory at the init time.

Is there something specific to cgrp sockopt that is difficult and any 
downside of the above?

WDYT?


This is doable.