On Mon, Oct 16, 2023 at 12:08 PM -07, John Fastabend wrote: > This adds a test where both pairs of a af_unix paired socket are put into > a BPF map. This ensures that when we tear down the af_unix pair we don't > have any issues on sockmap side with ordering and reference counting. > > Signed-off-by: John Fastabend <john.fastabend@xxxxxxxxx> > --- > .../selftests/bpf/prog_tests/sockmap_listen.c | 39 ++++++++++++++++--- > .../selftests/bpf/progs/test_sockmap_listen.c | 7 ++++ > 2 files changed, 40 insertions(+), 6 deletions(-) > > diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c > index 8df8cbb447f1..90e97907c1c1 100644 > --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c > +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c > @@ -1824,8 +1824,10 @@ static void inet_unix_skb_redir_to_connected(struct test_sockmap_listen *skel, > xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_VERDICT); > } > > -static void unix_inet_redir_to_connected(int family, int type, int sock_mapfd, > - int verd_mapfd, enum redir_mode mode) > +static void unix_inet_redir_to_connected(int family, int type, > + int sock_mapfd, int nop_mapfd, > + int verd_mapfd, > + enum redir_mode mode) > { > const char *log_prefix = redir_mode_str(mode); > int c0, c1, p0, p1; > @@ -1849,6 +1851,12 @@ static void unix_inet_redir_to_connected(int family, int type, int sock_mapfd, > if (err) > goto close; > > + if (nop_mapfd >= 0) { > + err = add_to_sockmap(nop_mapfd, c0, c1); > + if (err) > + goto close; > + } > + > n = write(c1, "a", 1); > if (n < 0) > FAIL_ERRNO("%s: write", log_prefix); > @@ -1883,6 +1891,7 @@ static void unix_inet_skb_redir_to_connected(struct test_sockmap_listen *skel, > struct bpf_map *inner_map, int family) > { > int verdict = bpf_program__fd(skel->progs.prog_skb_verdict); > + int nop_map = bpf_map__fd(skel->maps.nop_map); > int verdict_map = bpf_map__fd(skel->maps.verdict_map); > int sock_map = bpf_map__fd(inner_map); > int err; > @@ -1892,14 +1901,32 @@ static void unix_inet_skb_redir_to_connected(struct test_sockmap_listen *skel, > return; > > skel->bss->test_ingress = false; > - unix_inet_redir_to_connected(family, SOCK_DGRAM, sock_map, verdict_map, > + unix_inet_redir_to_connected(family, SOCK_DGRAM, > + sock_map, -1, verdict_map, > + REDIR_EGRESS); > + unix_inet_redir_to_connected(family, SOCK_DGRAM, > + sock_map, -1, verdict_map, > REDIR_EGRESS); > - unix_inet_redir_to_connected(family, SOCK_STREAM, sock_map, verdict_map, > + > + unix_inet_redir_to_connected(family, SOCK_DGRAM, > + sock_map, nop_map, verdict_map, > + REDIR_EGRESS); > + unix_inet_redir_to_connected(family, SOCK_STREAM, > + sock_map, nop_map, verdict_map, > REDIR_EGRESS); > skel->bss->test_ingress = true; > - unix_inet_redir_to_connected(family, SOCK_DGRAM, sock_map, verdict_map, > + unix_inet_redir_to_connected(family, SOCK_DGRAM, > + sock_map, -1, verdict_map, > + REDIR_INGRESS); > + unix_inet_redir_to_connected(family, SOCK_STREAM, > + sock_map, -1, verdict_map, > + REDIR_INGRESS); > + > + unix_inet_redir_to_connected(family, SOCK_DGRAM, > + sock_map, nop_map, verdict_map, > REDIR_INGRESS); > - unix_inet_redir_to_connected(family, SOCK_STREAM, sock_map, verdict_map, > + unix_inet_redir_to_connected(family, SOCK_STREAM, > + sock_map, nop_map, verdict_map, > REDIR_INGRESS); > > xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_VERDICT); > diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_listen.c b/tools/testing/selftests/bpf/progs/test_sockmap_listen.c > index 464d35bd57c7..b7250eb9c30c 100644 > --- a/tools/testing/selftests/bpf/progs/test_sockmap_listen.c > +++ b/tools/testing/selftests/bpf/progs/test_sockmap_listen.c > @@ -14,6 +14,13 @@ struct { > __type(value, __u64); > } sock_map SEC(".maps"); > > +struct { > + __uint(type, BPF_MAP_TYPE_SOCKMAP); > + __uint(max_entries, 2); > + __type(key, __u32); > + __type(value, __u64); > +} nop_map SEC(".maps"); > + > struct { > __uint(type, BPF_MAP_TYPE_SOCKHASH); > __uint(max_entries, 2); So... we have a bug in unix_inet_redir_to_connected() - it happily ignores the passed socket type, which is currently hardcoded to SOCK_DGRAM :-) Which means these tests don't exercise unix_stream paths where the added psock->skpair is actually needed. But I'm able to reproduce the bug by running the VSOCK redir test: bash-5.2# ./test_progs -n 212/79 [ 23.232282] ================================================================== [ 23.232634] BUG: KASAN: slab-use-after-free in sock_def_readable+0xe3/0x400 [ 23.232942] Read of size 8 at addr ffff8881013f9860 by task kworker/1:2/220 [ 23.233253] [ 23.233326] CPU: 1 PID: 220 Comm: kworker/1:2 Tainted: G OE 6.6.0 #30 [ 23.233697] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-1.fc38 04/01/2014 [ 23.234074] Workqueue: events sk_psock_backlog [ 23.234271] Call Trace: [ 23.234381] <TASK> [ 23.234477] dump_stack_lvl+0x4a/0x90 [ 23.234640] print_address_description.constprop.0+0x33/0x400 [ 23.234888] ? preempt_count_sub+0x13/0xc0 [ 23.235071] print_report+0xb6/0x260 [ 23.235228] ? kasan_complete_mode_report_info+0x7c/0x1f0 [ 23.235462] kasan_report+0xd0/0x110 [ 23.235619] ? sock_def_readable+0xe3/0x400 [ 23.235801] ? sock_def_readable+0xe3/0x400 [ 23.235989] kasan_check_range+0xf7/0x1b0 [ 23.236164] __kasan_check_read+0x11/0x20 [ 23.236340] sock_def_readable+0xe3/0x400 [ 23.236514] unix_stream_sendmsg+0x3c5/0x7d0 [ 23.236704] ? queue_oob+0x300/0x300 [ 23.236865] sock_sendmsg+0x229/0x250 [ 23.237030] ? sock_write_iter+0x320/0x320 [ 23.237211] ? __this_cpu_preempt_check+0x13/0x20 [ 23.237416] ? lock_acquire+0x191/0x410 [ 23.237607] ? lock_sync+0x110/0x110 [ 23.237766] ? lock_is_held_type+0xd0/0x130 [ 23.237948] ? __asan_storeN+0x12/0x20 [ 23.238115] __skb_send_sock+0x4fa/0x670 [ 23.238288] ? preempt_count_sub+0x13/0xc0 [ 23.238494] ? sendmsg_locked+0x90/0x90 [ 23.238721] ? sendmsg_unlocked+0x40/0x40 [ 23.238975] ? __lock_acquire+0x765/0xf00 [ 23.239252] ? __this_cpu_preempt_check+0x13/0x20 [ 23.239570] ? lock_acquire+0x191/0x410 [ 23.239831] skb_send_sock+0x10/0x20 [ 23.240079] sk_psock_backlog+0x141/0x5e0 [ 23.240340] ? __this_cpu_preempt_check+0x13/0x20 [ 23.240638] process_one_work+0x49d/0x970 [ 23.240900] ? drain_workqueue+0x1c0/0x1c0 [ 23.241173] ? assign_work+0xe1/0x120 [ 23.241404] worker_thread+0x380/0x680 [ 23.241660] ? trace_hardirqs_on+0x22/0x100 [ 23.241933] ? preempt_count_sub+0x13/0xc0 [ 23.242213] ? process_one_work+0x970/0x970 [ 23.242491] kthread+0x1ba/0x200 [ 23.242687] ? kthread+0xfe/0x200 [ 23.242890] ? kthread_complete_and_exit+0x20/0x20 [ 23.243193] ret_from_fork+0x35/0x60 [ 23.243418] ? kthread_complete_and_exit+0x20/0x20 [ 23.243718] ret_from_fork_asm+0x11/0x20 [ 23.243995] </TASK> [ 23.244145] [ 23.244227] Allocated by task 227: [ 23.244446] kasan_save_stack+0x26/0x50 [ 23.244709] kasan_set_track+0x25/0x40 [ 23.244951] kasan_save_alloc_info+0x1e/0x30 [ 23.245220] __kasan_slab_alloc+0x72/0x80 [ 23.245491] kmem_cache_alloc+0x182/0x360 [ 23.245758] sk_prot_alloc+0x43/0x160 [ 23.246007] sk_alloc+0x2c/0x3a0 [ 23.246216] unix_create1+0x86/0x440 [ 23.246462] unix_create+0x7d/0x100 [ 23.246701] __sock_create+0x1bc/0x420 [ 23.246960] __sys_socketpair+0x1ac/0x3a0 [ 23.247237] __x64_sys_socketpair+0x4f/0x60 [ 23.247521] do_syscall_64+0x38/0x90 [ 23.247769] entry_SYSCALL_64_after_hwframe+0x63/0xcd [ 23.248113] [ 23.248225] Freed by task 227: [ 23.248444] kasan_save_stack+0x26/0x50 [ 23.248707] kasan_set_track+0x25/0x40 [ 23.248963] kasan_save_free_info+0x2b/0x50 [ 23.249249] ____kasan_slab_free+0x154/0x1c0 [ 23.249541] __kasan_slab_free+0x12/0x20 [ 23.249810] kmem_cache_free+0x1e7/0x480 [ 23.250084] __sk_destruct+0x270/0x3f0 [ 23.250342] sk_destruct+0x78/0x90 [ 23.250577] __sk_free+0x51/0x160 [ 23.250807] sk_free+0x45/0x70 [ 23.251025] unix_release_sock+0x5cc/0x700 [ 23.251301] unix_release+0x50/0x70 [ 23.251536] __sock_release+0x5f/0x120 [ 23.251754] sock_close+0x13/0x20 [ 23.252109] __fput+0x1f3/0x470 [ 23.252451] __fput_sync+0x2f/0x40 [ 23.252811] __x64_sys_close+0x51/0x90 [ 23.253169] do_syscall_64+0x38/0x90 [ 23.253480] entry_SYSCALL_64_after_hwframe+0x63/0xcd [ 23.253940] [ 23.254097] The buggy address belongs to the object at ffff8881013f9800 [ 23.254097] which belongs to the cache UNIX-STREAM of size 1920 [ 23.254936] The buggy address is located 96 bytes inside of [ 23.254936] freed 1920-byte region [ffff8881013f9800, ffff8881013f9f80) [ 23.255731] [ 23.255844] The buggy address belongs to the physical page: [ 23.256225] page:00000000c005ecb3 refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff8881013f8000 pfn:0x1013f8 [ 23.256905] head:00000000c005ecb3 order:3 entire_mapcount:0 nr_pages_mapped:0 pincount:0 [ 23.257382] flags: 0x2fffe000000840(slab|head|node=0|zone=2|lastcpupid=0x7fff) [ 23.257791] page_type: 0xffffffff() [ 23.257988] raw: 002fffe000000840 ffff888100961a40 dead000000000122 0000000000000000 [ 23.258418] raw: ffff8881013f8000 000000008010000e 00000001ffffffff 0000000000000000 [ 23.258817] page dumped because: kasan: bad access detected [ 23.259131] [ 23.259205] Memory state around the buggy address: [ 23.259469] ffff8881013f9700: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 23.259871] ffff8881013f9780: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 23.260290] >ffff8881013f9800: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 23.260704] ^ [ 23.261056] ffff8881013f9880: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 23.261469] ffff8881013f9900: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 23.261854] ================================================================== [ 23.262453] Disabling lock debugging due to kernel taint #212/79 sockmap_listen/sockmap VSOCK test_vsock_redir:OK #212 sockmap_listen:OK Summary: 1/1 PASSED, 0 SKIPPED, 0 FAILED bash-5.2# If I modify the test to use (AF_UNIX, SOCK_DGRAM) instead of SOCK_STREAM, the bug no longer reproduces. Which confirms my thinking that unix_dgram_sendmsg is safe to use from sockmap because it grabs a ref to skpair.