Jakub Sitnicki wrote: > On Wed, Nov 23, 2022 at 02:01 PM +08, Pengcheng Yang wrote: > > John Fastabend <john.fastabend@xxxxxxxxx> wrote: > >> > >> Pengcheng Yang wrote: > >> > When redirecting, we use sk_msg_to_ingress() to get the BPF_F_INGRESS > >> > flag from the msg->flags. If apply_bytes is used and it is larger than > >> > the current data being processed, sk_psock_msg_verdict() will not be > >> > called when sendmsg() is called again. At this time, the msg->flags is 0, > >> > and we lost the BPF_F_INGRESS flag. > >> > > >> > So we need to save the BPF_F_INGRESS flag in sk_psock and assign it to > >> > msg->flags before redirection. > >> > > >> > Fixes: 8934ce2fd081 ("bpf: sockmap redirect ingress support") > >> > Signed-off-by: Pengcheng Yang <yangpc@xxxxxxxxxx> > >> > --- > >> > include/linux/skmsg.h | 1 + > >> > net/core/skmsg.c | 1 + > >> > net/ipv4/tcp_bpf.c | 1 + > >> > net/tls/tls_sw.c | 1 + > >> > 4 files changed, 4 insertions(+) > >> > > >> > diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h > >> > index 48f4b64..e1d463f 100644 > >> > --- a/include/linux/skmsg.h > >> > +++ b/include/linux/skmsg.h > >> > @@ -82,6 +82,7 @@ struct sk_psock { > >> > u32 apply_bytes; > >> > u32 cork_bytes; > >> > u32 eval; > >> > + u32 flags; > >> > struct sk_msg *cork; > >> > struct sk_psock_progs progs; > >> > #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) > >> > diff --git a/net/core/skmsg.c b/net/core/skmsg.c > >> > index 188f855..ab2f8f3 100644 > >> > --- a/net/core/skmsg.c > >> > +++ b/net/core/skmsg.c > >> > @@ -888,6 +888,7 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, > >> > if (psock->sk_redir) > >> > sock_put(psock->sk_redir); > >> > psock->sk_redir = msg->sk_redir; > >> > + psock->flags = msg->flags; > >> > if (!psock->sk_redir) { > >> > ret = __SK_DROP; > >> > goto out; > >> > diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c > >> > index ef5de4f..1390d72 100644 > >> > --- a/net/ipv4/tcp_bpf.c > >> > +++ b/net/ipv4/tcp_bpf.c > >> > @@ -323,6 +323,7 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, > >> > break; > >> > case __SK_REDIRECT: > >> > sk_redir = psock->sk_redir; > >> > + msg->flags = psock->flags; > >> > sk_msg_apply_bytes(psock, tosend); > >> > if (!psock->apply_bytes) { > >> > /* Clean up before releasing the sock lock. */ > >> ^^^^^^^^^^^^^^^ > >> In this block reposted here with the rest of the block > >> > >> > >> if (!psock->apply_bytes) { > >> /* Clean up before releasing the sock lock. */ > >> eval = psock->eval; > >> psock->eval = __SK_NONE; > >> psock->sk_redir = NULL; > >> } > >> > >> Now that we have a psock->flags we should clera that as > >> well right? > > > > According to my understanding, it is not necessary (but can) to clear > > psock->flags here, because psock->flags will be overwritten by msg->flags > > at the beginning of each redirection (in sk_psock_msg_verdict()). > > 1. We should at least document that psock->flags value can be garbage > (undefined) if psock->sk_redir is null. Per v2 I think we should not have garbage flags. Just zero the flags field no point in saving a single insn here IMO. > > 2. 'flags' is amiguous (flags for what?). I'd suggest to rename to > something like redir_flags. > > Also, since we don't care about all flags, but just the ingress bit, > we should store just that. It's not about size. Less state passed > around is easier to reason about. See patch below. rename makes sense to me. > > 3. Alternatively, I'd turn psock->sk_redir into a tagged pointer, like > skb->_sk_redir is. This way all state (pointer & flags) is bundled > and managed together. It would be a bigger change. Would have to be > split out from this patch set. > > --8<-- > > diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h > index 70d6cb94e580..84f787416a54 100644 > --- a/include/linux/skmsg.h > +++ b/include/linux/skmsg.h > @@ -82,6 +82,7 @@ struct sk_psock { > u32 apply_bytes; > u32 cork_bytes; > u32 eval; > + bool redir_ingress; /* undefined if sk_redir is null */ > struct sk_msg *cork; > struct sk_psock_progs progs; > #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) > diff --git a/include/net/tcp.h b/include/net/tcp.h > index 14d45661a84d..5b70b241ce71 100644 > --- a/include/net/tcp.h > +++ b/include/net/tcp.h > @@ -2291,8 +2291,8 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); > void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); > #endif /* CONFIG_BPF_SYSCALL */ > > -int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes, > - int flags); > +int tcp_bpf_sendmsg_redir(struct sock *sk, bool ingress, > + struct sk_msg *msg, u32 bytes, int flags); > #endif /* CONFIG_NET_SOCK_MSG */ > > #if !defined(CONFIG_BPF_SYSCALL) || !defined(CONFIG_NET_SOCK_MSG) > diff --git a/net/core/skmsg.c b/net/core/skmsg.c > index e6b9ced3eda8..53d0251788aa 100644 > --- a/net/core/skmsg.c > +++ b/net/core/skmsg.c > @@ -886,13 +886,16 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, > ret = sk_psock_map_verd(ret, msg->sk_redir); > psock->apply_bytes = msg->apply_bytes; > if (ret == __SK_REDIRECT) { > - if (psock->sk_redir) > + if (psock->sk_redir) { > sock_put(psock->sk_redir); > - psock->sk_redir = msg->sk_redir; > - if (!psock->sk_redir) { > + psock->sk_redir = NULL; > + } > + if (!msg->sk_redir) { > ret = __SK_DROP; > goto out; > } > + psock->redir_ingress = sk_msg_to_ingress(msg); > + psock->sk_redir = msg->sk_redir; > sock_hold(psock->sk_redir); > } > out: > diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c > index cf9c3e8f7ccb..490b359dc814 100644 > --- a/net/ipv4/tcp_bpf.c > +++ b/net/ipv4/tcp_bpf.c > @@ -131,10 +131,9 @@ static int tcp_bpf_push_locked(struct sock *sk, struct sk_msg *msg, > return ret; > } > > -int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, > - u32 bytes, int flags) > +int tcp_bpf_sendmsg_redir(struct sock *sk, bool ingress, > + struct sk_msg *msg, u32 bytes, int flags) > { > - bool ingress = sk_msg_to_ingress(msg); > struct sk_psock *psock = sk_psock_get(sk); > int ret; > > @@ -337,7 +336,8 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, > release_sock(sk); > > origsize = msg->sg.size; > - ret = tcp_bpf_sendmsg_redir(sk_redir, msg, tosend, flags); > + ret = tcp_bpf_sendmsg_redir(sk_redir, psock->redir_ingress, > + msg, tosend, flags); > sent = origsize - msg->sg.size; > > if (eval == __SK_REDIRECT) > diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c > index 264cf367e265..b22d97610b9a 100644 > --- a/net/tls/tls_sw.c > +++ b/net/tls/tls_sw.c > @@ -846,7 +846,8 @@ static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk, > sk_msg_return_zero(sk, msg, send); > msg->sg.size -= send; > release_sock(sk); > - err = tcp_bpf_sendmsg_redir(sk_redir, &msg_redir, send, flags); > + err = tcp_bpf_sendmsg_redir(sk_redir, psock->redir_ingress, > + &msg_redir, send, flags); > lock_sock(sk); > if (err < 0) { > *copied -= sk_msg_free_nocharge(sk, &msg_redir);