Re: [bpf-next PATCH 1/3] bpf: refactor sockmap redirect code so its easy to reuse

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Fri, 29 May 2020 23:06:41 +0000
John Fastabend <john.fastabend@xxxxxxxxx> wrote:

> We will need this block of code called from tls context shortly
> lets refactor the redirect logic so its easy to use. This also
> cleans up the switch stmt so we have fewer fallthrough cases.
> 
> No logic changes are intended.
> 
> Fixes: d829e9c4112b5 ("tls: convert to generic sk_msg interface")
> Signed-off-by: John Fastabend <john.fastabend@xxxxxxxxx>
> ---
>  net/core/skmsg.c |   55 +++++++++++++++++++++++++++++++++---------------------
>  1 file changed, 34 insertions(+), 21 deletions(-)
> 
> diff --git a/net/core/skmsg.c b/net/core/skmsg.c
> index c479372..9d72f71 100644
> --- a/net/core/skmsg.c
> +++ b/net/core/skmsg.c
> @@ -682,13 +682,43 @@ static struct sk_psock *sk_psock_from_strp(struct strparser *strp)
>  	return container_of(parser, struct sk_psock, parser);
>  }
>  
> -static void sk_psock_verdict_apply(struct sk_psock *psock,
> -				   struct sk_buff *skb, int verdict)
> +static void sk_psock_skb_redirect(struct sk_psock *psock, struct sk_buff *skb)
>  {
>  	struct sk_psock *psock_other;
>  	struct sock *sk_other;
>  	bool ingress;
>  
> +	sk_other = tcp_skb_bpf_redirect_fetch(skb);
> +	if (unlikely(!sk_other)) {
> +		kfree_skb(skb);
> +		return;
> +	}
> +	psock_other = sk_psock(sk_other);

I think we're not in RCU read-side critical section and so lockdep-RCU
[0] is complaining about accessing sk_user_data with rcu_dereference:

bash-5.0# ./test_sockmap
# 1/ 6  sockmap::txmsg test passthrough:OK
# 2/ 6  sockmap::txmsg test redirect:OK
# 3/ 6  sockmap::txmsg test drop:OK
# 4/ 6  sockmap::txmsg test ingress redirect:OK
[   96.791996]
[   96.792211] =============================
[   96.792763] WARNING: suspicious RCU usage
[   96.793297] 5.7.0-rc7-02964-g615b5749876a-dirty #692 Not tainted
[   96.794032] -----------------------------
[   96.794480] include/linux/skmsg.h:284 suspicious rcu_dereference_check() usage!
[   96.795154]
[   96.795154] other info that might help us debug this:
[   96.795154]
[   96.795926]
[   96.795926] rcu_scheduler_active = 2, debug_locks = 1
[   96.796556] 1 lock held by test_sockmap/1060:
[   96.796970]  #0: ffff8880a0d35f20 (sk_lock-AF_INET){+.+.}-{0:0}, at: tls_sw_recvmsg+0x238/0xc10
[   96.797813]
[   96.797813] stack backtrace:
[   96.798224] CPU: 1 PID: 1060 Comm: test_sockmap Not tainted 5.7.0-rc7-02964-g615b5749876a-dirty #692
[   96.799071] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014
[   96.800294] Call Trace:
[   96.800543]  dump_stack+0x97/0xe0
[   96.800864]  sk_psock_skb_redirect.isra.0+0xa6/0x1b0
[   96.801338]  sk_psock_tls_strp_read+0x298/0x310
[   96.801769]  tls_sw_recvmsg+0xa47/0xc10
[   96.802144]  ? decrypt_skb+0x80/0x80
[   96.802491]  ? lock_downgrade+0x330/0x330
[   96.802887]  inet_recvmsg+0xae/0x2a0
[   96.803224]  ? rw_copy_check_uvector+0x15e/0x1b0
[   96.803669]  ? inet_sendpage+0xc0/0xc0
[   96.804029]  ____sys_recvmsg+0x110/0x210
[   96.804417]  ? kernel_recvmsg+0x60/0x60
[   96.804785]  ? copy_msghdr_from_user+0x91/0xd0
[   96.805200]  ? __copy_msghdr_from_user+0x230/0x230
[   96.805665]  ? lock_acquire+0x120/0x4b0
[   96.806025]  ? match_held_lock+0x1b/0x230
[   96.806417]  ___sys_recvmsg+0xb8/0x100
[   96.806778]  ? ___sys_sendmsg+0x110/0x110
[   96.807155]  ? lock_downgrade+0x330/0x330
[   96.807555]  ? __fget_light+0xad/0x110
[   96.807917]  ? sockfd_lookup_light+0x91/0xb0
[   96.808334]  __sys_recvmsg+0x87/0xe0
[   96.808674]  ? __sys_recvmsg_sock+0x70/0x70
[   96.809064]  ? rcu_read_lock_sched_held+0x81/0xb0
[   96.809540]  ? switch_fpu_return+0x1/0x250
[   96.809948]  ? do_syscall_64+0x5f/0x9a0
[   96.810325]  do_syscall_64+0xad/0x9a0
[   96.810676]  ? handle_mm_fault+0x21e/0x3d0
[   96.811060]  ? syscall_return_slowpath+0x530/0x530
[   96.811524]  ? trace_hardirqs_on_thunk+0x1a/0x1c
[   96.811955]  ? lockdep_hardirqs_off+0xb5/0x100
[   96.812383]  ? trace_hardirqs_off_thunk+0x1a/0x1c
[   96.812871]  entry_SYSCALL_64_after_hwframe+0x49/0xb3
[   96.813425] RIP: 0033:0x7f92ff15c737
[   96.813762] Code: 02 b8 ff ff ff ff eb bd 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 2f 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 89 54 24 1c 48 89 74 24 10
[   96.815477] RSP: 002b:00007ffd80734b68 EFLAGS: 00000246 ORIG_RAX: 000000000000002f
[   96.816177] RAX: ffffffffffffffda RBX: 000000000000001c RCX: 00007f92ff15c737
[   96.816847] RDX: 0000000000004000 RSI: 00007ffd80734bd0 RDI: 000000000000001c
[   96.817602] RBP: 00007ffd80734c50 R08: 00007ffd80734bc0 R09: 0000000000000060
[   96.818351] R10: fffffffffffffb15 R11: 0000000000000246 R12: 0000000000000000
[   96.819107] R13: 0000000000004000 R14: 00007f92fef7a6b8 R15: 00007ffd80734d50

[0] https://www.kernel.org/doc/Documentation/RCU/lockdep-splat.txt

> +	if (!psock_other || sock_flag(sk_other, SOCK_DEAD) ||
> +	    !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) {
> +		kfree_skb(skb);
> +		return;
> +	}
> +
> +	ingress = tcp_skb_bpf_ingress(skb);
> +	if ((!ingress && sock_writeable(sk_other)) ||
> +	    (ingress &&
> +	     atomic_read(&sk_other->sk_rmem_alloc) <=
> +	     sk_other->sk_rcvbuf)) {
> +		if (!ingress)
> +			skb_set_owner_w(skb, sk_other);
> +		skb_queue_tail(&psock_other->ingress_skb, skb);
> +		schedule_work(&psock_other->work);
> +	} else {
> +		kfree_skb(skb);
> +	}
> +}
> +
> +static void sk_psock_verdict_apply(struct sk_psock *psock,
> +				   struct sk_buff *skb, int verdict)
> +{
> +	struct sock *sk_other;
> +
>  	switch (verdict) {
>  	case __SK_PASS:
>  		sk_other = psock->sk;
> @@ -707,25 +737,8 @@ static void sk_psock_verdict_apply(struct sk_psock *psock,
>  		}
>  		goto out_free;
>  	case __SK_REDIRECT:
> -		sk_other = tcp_skb_bpf_redirect_fetch(skb);
> -		if (unlikely(!sk_other))
> -			goto out_free;
> -		psock_other = sk_psock(sk_other);
> -		if (!psock_other || sock_flag(sk_other, SOCK_DEAD) ||
> -		    !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED))
> -			goto out_free;
> -		ingress = tcp_skb_bpf_ingress(skb);
> -		if ((!ingress && sock_writeable(sk_other)) ||
> -		    (ingress &&
> -		     atomic_read(&sk_other->sk_rmem_alloc) <=
> -		     sk_other->sk_rcvbuf)) {
> -			if (!ingress)
> -				skb_set_owner_w(skb, sk_other);
> -			skb_queue_tail(&psock_other->ingress_skb, skb);
> -			schedule_work(&psock_other->work);
> -			break;
> -		}
> -		/* fall-through */
> +		sk_psock_skb_redirect(psock, skb);
> +		break;
>  	case __SK_DROP:
>  		/* fall-through */
>  	default:
> 



[Index of Archives]     [Linux Samsung SoC]     [Linux Rockchip SoC]     [Linux Actions SoC]     [Linux for Synopsys ARC Processors]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]


  Powered by Linux