On Thu, Jan 13, 2022 at 08:02 AM CET, menglong8.dong@xxxxxxxxx wrote: > From: Menglong Dong <imagedong@xxxxxxxxxxx> > > The description of 'dst_port' in 'struct bpf_sock' is not accurate. > In fact, 'dst_port' is not in network byte order, it is 'partly' in > network byte order. > > We can see it in bpf_sock_convert_ctx_access(): > >> case offsetof(struct bpf_sock, dst_port): >> *insn++ = BPF_LDX_MEM( >> BPF_FIELD_SIZEOF(struct sock_common, skc_dport), >> si->dst_reg, si->src_reg, >> bpf_target_off(struct sock_common, skc_dport, >> sizeof_field(struct sock_common, >> skc_dport), >> target_size)); > > It simply passes 'sock_common->skc_dport' to 'bpf_sock->dst_port', > which means that the low 16 bits of 'dst_port' are equal to 'skc_dport' > and are in network byte order, but the high 16 bits of 'dst_port' are > 0. So the actual port is 'bpf_ntohs((__u16)dst_port)', and > 'bpf_ntohl(dst_port)' does not yield the right port. > > This is different from 'remote_port' in 'struct bpf_sock_ops' or > 'struct __sk_buff': > >> case offsetof(struct __sk_buff, remote_port): >> BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2); >> >> *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), >> si->dst_reg, si->src_reg, >> offsetof(struct sk_buff, sk)); >> *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, >> bpf_target_off(struct sock_common, >> skc_dport, >> 2, target_size)); >> #ifndef __BIG_ENDIAN_BITFIELD >> *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); >> #endif > > We can see that the value is shifted left by 16 bits on little endian, which puts > the whole 'remote_port' in network byte order, so the actual port > is bpf_ntohl(remote_port). > > Note this in the documentation of 'dst_port'. ( Maybe this should be unified > in the code? 
) > > Signed-off-by: Menglong Dong <imagedong@xxxxxxxxxxx> > --- > include/uapi/linux/bpf.h | 6 +++++- > 1 file changed, 5 insertions(+), 1 deletion(-) > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index b0383d371b9a..891a182a749a 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -5500,7 +5500,11 @@ struct bpf_sock { > __u32 src_ip4; > __u32 src_ip6[4]; > __u32 src_port; /* host byte order */ > - __u32 dst_port; /* network byte order */ > + __u32 dst_port; /* low 16-bits are in network byte order, > + * and high 16-bits are filled by 0. > + * So the real port in host byte order is > + * bpf_ntohs((__u16)dst_port). > + */ > __u32 dst_ip4; > __u32 dst_ip6[4]; > __u32 state; I'm probably missing something obvious, but is there anything stopping us from splitting the field, so that dst_port is 16 bits wide? I quickly checked the change below, and it seems to pass verifier checks and sock_field tests. IDK, just an idea. I haven't given it deeper thought. 
--8<-- diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4a2f7041ebae..344d62ccafba 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5574,7 +5574,8 @@ struct bpf_sock { __u32 src_ip4; __u32 src_ip6[4]; __u32 src_port; /* host byte order */ - __u32 dst_port; /* network byte order */ + __u16 unused; + __u16 dst_port; /* network byte order */ __u32 dst_ip4; __u32 dst_ip6[4]; __u32 state; diff --git a/net/core/filter.c b/net/core/filter.c index a06931c27eeb..c56b8ba82de5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8276,7 +8276,6 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, case offsetof(struct bpf_sock, family): case offsetof(struct bpf_sock, type): case offsetof(struct bpf_sock, protocol): - case offsetof(struct bpf_sock, dst_port): case offsetof(struct bpf_sock, src_port): case offsetof(struct bpf_sock, rx_queue_mapping): case bpf_ctx_range(struct bpf_sock, src_ip4): @@ -8285,6 +8284,9 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]): bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); + case offsetof(struct bpf_sock, dst_port): + bpf_ctx_record_field_size(info, sizeof(__u16)); + return bpf_ctx_narrow_access_ok(off, size, sizeof(__u16)); } return size == size_default; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 4a2f7041ebae..344d62ccafba 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5574,7 +5574,8 @@ struct bpf_sock { __u32 src_ip4; __u32 src_ip6[4]; __u32 src_port; /* host byte order */ - __u32 dst_port; /* network byte order */ + __u16 unused; + __u16 dst_port; /* network byte order */ __u32 dst_ip4; __u32 dst_ip6[4]; __u32 state;