Re: [PATCH bpf-next 2/2] selftest/bpf: Implement sample UNIX domain

Kuniyuki Iwashima <kuniyu@xxxxxxxxxxxx> · Sat, 31 Jul 2021 07:55:09 +0900

From:   Yonghong Song <yhs@xxxxxx>
Date:   Fri, 30 Jul 2021 09:22:32 -0700
> On 7/30/21 12:58 AM, Kuniyuki Iwashima wrote:
> > From:   Yonghong Song <yhs@xxxxxx>
> > Date:   Thu, 29 Jul 2021 23:54:26 -0700
> >> On 7/29/21 4:36 PM, Kuniyuki Iwashima wrote:
> >>> If there are no abstract sockets, this prog can output the same result
> >>> compared to /proc/net/unix.
> >>>
> >>>     # cat /sys/fs/bpf/unix | head -n 2
> >>>     Num       RefCount Protocol Flags    Type St Inode Path
> >>>     ffff9ab7122db000: 00000002 00000000 00010000 0001 01 10623 private/defer
> >>>
> >>>     # cat /proc/net/unix | head -n 2
> >>>     Num       RefCount Protocol Flags    Type St Inode Path
> >>>     ffff9ab7122db000: 00000002 00000000 00010000 0001 01 10623 private/defer
> >>>
> >>> Signed-off-by: Kuniyuki Iwashima <kuniyu@xxxxxxxxxxxx>
> >>> ---
> >>>    .../selftests/bpf/prog_tests/bpf_iter.c       | 17 +++++
> >>>    .../selftests/bpf/progs/bpf_iter_unix.c       | 75 +++++++++++++++++++
> >>>    2 files changed, 92 insertions(+)
> >>>    create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_unix.c
> >>>
> >>> diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
> >>> index 1f1aade56504..4746bac68d36 100644
> >>> --- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
> >>> +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
> >>> @@ -13,6 +13,7 @@
> >>>    #include "bpf_iter_tcp6.skel.h"
> >>>    #include "bpf_iter_udp4.skel.h"
> >>>    #include "bpf_iter_udp6.skel.h"
> >>> +#include "bpf_iter_unix.skel.h"
> >>>    #include "bpf_iter_test_kern1.skel.h"
> >>>    #include "bpf_iter_test_kern2.skel.h"
> >>>    #include "bpf_iter_test_kern3.skel.h"
> >>> @@ -313,6 +314,20 @@ static void test_udp6(void)
> >>>    	bpf_iter_udp6__destroy(skel);
> >>>    }
> >>>    
> >>> +static void test_unix(void)
> >>> +{
> >>> +	struct bpf_iter_unix *skel;
> >>> +
> >>> +	skel = bpf_iter_unix__open_and_load();
> >>> +	if (CHECK(!skel, "bpf_iter_unix__open_and_load",
> >>> +		  "skeleton open_and_load failed\n"))
> >>> +		return;
> >>> +
> >>> +	do_dummy_read(skel->progs.dump_unix);
> >>> +
> >>> +	bpf_iter_unix__destroy(skel);
> >>> +}
> >>> +
> >>>    /* The expected string is less than 16 bytes */
> >>>    static int do_read_with_fd(int iter_fd, const char *expected,
> >>>    			   bool read_one_char)
> >>> @@ -1255,6 +1270,8 @@ void test_bpf_iter(void)
> >>>    		test_udp4();
> >>>    	if (test__start_subtest("udp6"))
> >>>    		test_udp6();
> >>> +	if (test__start_subtest("unix"))
> >>> +		test_unix();
> >>>    	if (test__start_subtest("anon"))
> >>>    		test_anon_iter(false);
> >>>    	if (test__start_subtest("anon-read-one-char"))
> >>> diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_unix.c b/tools/testing/selftests/bpf/progs/bpf_iter_unix.c
> >>> new file mode 100644
> >>> index 000000000000..285ec2f7944d
> >>> --- /dev/null
> >>> +++ b/tools/testing/selftests/bpf/progs/bpf_iter_unix.c
> >>> @@ -0,0 +1,75 @@
> >>> +// SPDX-License-Identifier: GPL-2.0
> >>> +/* Copyright Amazon.com Inc. or its affiliates. */
> >>> +#include "bpf_iter.h"
> >>
> >> Could you add bpf_iter__unix to bpf_iter.h similar to bpf_iter__sockmap?
> >> The main purpose is to make test tolerating with old vmlinux.h.
> > 
> > Thank you for explanation!
> > I've understood why it is needed even when the same struct is defined.
> > I'll add it in the next spin.
> > 
> > 
> >>
> >>> +#include "bpf_tracing_net.h"
> >>> +#include <bpf/bpf_helpers.h>
> >>> +#include <bpf/bpf_endian.h>
> >>> +
> >>> +char _license[] SEC("license") = "GPL";
> >>> +
> >>> +#define __SO_ACCEPTCON		(1 << 16)
> >>> +#define UNIX_HASH_SIZE		256
> >>> +#define UNIX_ABSTRACT(unix_sk)	(unix_sk->addr->hash < UNIX_HASH_SIZE)
> >>
> >> Could you add the above three define's in bpf_tracing_net.h?
> >> We try to keep all these common defines in a common header for
> >> potential reusability.
> > 
> > Will do.
> > 
> > 
> >>
> >>> +
> >>> +static long sock_i_ino(const struct sock *sk)
> >>> +{
> >>> +	const struct socket *sk_socket = sk->sk_socket;
> >>> +	const struct inode *inode;
> >>> +	unsigned long ino;
> >>> +
> >>> +	if (!sk_socket)
> >>> +		return 0;
> >>> +
> >>> +	inode = &container_of(sk_socket, struct socket_alloc, socket)->vfs_inode;
> >>> +	bpf_probe_read_kernel(&ino, sizeof(ino), &inode->i_ino);
> >>> +	return ino;
> >>> +}
> >>> +
> >>> +SEC("iter/unix")
> >>> +int dump_unix(struct bpf_iter__unix *ctx)
> >>> +{
> >>> +	struct unix_sock *unix_sk = ctx->unix_sk;
> >>> +	struct sock *sk = (struct sock *)unix_sk;
> >>> +	struct seq_file *seq;
> >>> +	__u32 seq_num;
> >>> +
> >>> +	if (!unix_sk)
> >>> +		return 0;
> >>> +
> >>> +	seq = ctx->meta->seq;
> >>> +	seq_num = ctx->meta->seq_num;
> >>> +	if (seq_num == 0)
> >>> +		BPF_SEQ_PRINTF(seq, "Num       RefCount Protocol Flags    "
> >>> +			       "Type St Inode Path\n");
> >>> +
> >>> +	BPF_SEQ_PRINTF(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
> >>> +		       unix_sk,
> >>> +		       sk->sk_refcnt.refs.counter,
> >>> +		       0,
> >>> +		       sk->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
> >>> +		       sk->sk_type,
> >>> +		       sk->sk_socket ?
> >>> +		       (sk->sk_state == TCP_ESTABLISHED ?
> >>> +			SS_CONNECTED : SS_UNCONNECTED) :
> >>> +		       (sk->sk_state == TCP_ESTABLISHED ?
> >>> +			SS_CONNECTING : SS_DISCONNECTING),
> >>> +		       sock_i_ino(sk));
> >>> +
> >>> +	if (unix_sk->addr) {
> >>> +		if (UNIX_ABSTRACT(unix_sk))
> >>> +			/* Abstract UNIX domain socket can contain '\0' in
> >>> +			 * the path, and it should be escaped.  However, it
> >>> +			 * requires loops and the BPF verifier rejects it.
> >>> +			 * So here, print only the escaped first byte to
> >>> +			 * indicate it is an abstract UNIX domain socket.
> >>> +			 * (See: unix_seq_show() and commit e7947ea770d0d)
> >>> +			 */
> >>> +			BPF_SEQ_PRINTF(seq, " @");
> >>> +		else
> >>> +			BPF_SEQ_PRINTF(seq, " %s", unix_sk->addr->name->sun_path);
> >>> +	}
> >>
> >> I looked at af_unix.c, for the above "if (unix_sk->addr) { ... }" code,
> >> the following is the kernel source code,
> >>
> >>                   if (u->addr) {  // under unix_table_lock here
> >>                           int i, len;
> >>                           seq_putc(seq, ' ');
> >>
> >>                           i = 0;
> >>                           len = u->addr->len - sizeof(short);
> >>                           if (!UNIX_ABSTRACT(s))
> >>                                   len--;
> >>                           else {
> >>                                   seq_putc(seq, '@');
> >>                                   i++;
> >>                           }
> >>                           for ( ; i < len; i++)
> >>                                   seq_putc(seq, u->addr->name->sun_path[i] ?:
> >>                                            '@');
> >>                   }
> >>
> >> It does not seem to match bpf program non UNIX_ABSTRACT case.
> >> I am not familiar with unix socket so it would be good if you can
> >> explain a little more.
> > 
> > There is three kinds of unix sockets: pathname, unnamed, abstract.  The
> > first two terminate the addr with `\0`, but abstract must start with `\0`
> > and can contain `\0` anywhere in addr.  The `\0` in addr of abstract socket
> > does not have special meaning. [1]
> > 
> > They are inserted into the same hash table in unix_bind(), so the bpf prog
> > matches all of them.
> > 
> > ``` net/unix/af_unix.c
> >    1114		if (sun_path[0])
> >    1115			err = unix_bind_bsd(sk, addr);
> >    1116		else
> >    1117			err = unix_bind_abstract(sk, addr);
> > ```
> > 
> > [1]: https://man7.org/linux/man-pages/man7/unix.7.html
> > 
> > 
> >>
> >> For verifier issue with loops, do we have a maximum upper bound for
> >> u->addr->len? If yes, does bounded loop work?
> > 
> > That has a maximum length in unix_mkname(): sizeof(struct sockaddr_un).
> > 
> > ``` net/unix/af_unix.c
> >     223	/*
> >     224	 *	Check unix socket name:
> >     225	 *		- should be not zero length.
> >     226	 *	        - if started by not zero, should be NULL terminated (FS object)
> >     227	 *		- if started by zero, it is abstract name.
> >     228	 */
> >     229	
> >     230	static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
> >     231	{
> > ...
> >     234		if (len <= sizeof(short) || len > sizeof(*sunaddr))
> >     235			return -EINVAL;
> > ...
> >     253	}
> > ```
> > 
> > So, I rewrote the test like this, but it still causes an error.
> > 
> > ```
> > 	if (unix_sk->addr) {
> > 		int i, len;
> > 
> > 		len = unix_sk->addr->len - sizeof(short);
> > 
> > 		if (!UNIX_ABSTRACT(unix_sk)) {
> > 			BPF_SEQ_PRINTF(seq, " %s", unix_sk->addr->name->sun_path);
> > 		} else {
> > 			BPF_SEQ_PRINTF(seq, " @");
> > 			i++;
> 
> i++ is not useful here and "i" is not initialized earlier.

Good catch.
I'll remove this.

> 
> > 
> > 			if (len < sizeof(struct sockaddr_un)) {
> > 				for (i = 1 ; i < len; i++)
> > 					BPF_SEQ_PRINTF(seq, "%c",
> > 						       unix_sk->addr->name->sun_path[i] ?:
> > 						       '@');
> > 			}
> > 		}
> > 	}
> > ```
> > 
> > ```
> > processed 196505 insns (limit 1000000) max_states_per_insn 4 total_states 1830 peak_states 1830 mark_read 3
> > ```
> 
> I did some debugging, the main reason is that llvm compiler used "!=" 
> instead of "<" for "i < len" comparison.
> 
>       107:       b4 05 00 00 08 00 00 00 w5 = 8
>       108:       85 00 00 00 7e 00 00 00 call 126
> ;                               for (i = 1 ; i < len; i++)
>       109:       07 09 00 00 01 00 00 00 r9 += 1
>       110:       5d 98 09 00 00 00 00 00 if r8 != r9 goto +9 <LBB0_18>
> 
> 
> Considering "len" is not a constant, for verifier, r8 will never be 
> equal to r9 in the above.
> 
> Digging into llvm compilation, it is llvm pass Induction Variable 
> simplication pass made this code change. I will try to dig more and
> find a solution. In your next revision, could you add the above code
> as a comment so once llvm code gen is improved, we can have proper
> implementation to match /proc/net/unix?

Thanks for debugging!
Ok, I'll include the code and add some note and this thread link in the
commit log.

> 
> > 
> >>
> >>> +
> >>> +	BPF_SEQ_PRINTF(seq, "\n");
> >>> +
> >>> +	return 0;
> >>> +}