On 12/13/19 4:47 PM, Martin KaFai Lau wrote: > This patch makes "struct tcp_congestion_ops" to be the first user > of BPF STRUCT_OPS. It allows implementing a tcp_congestion_ops > in bpf. > > The BPF implemented tcp_congestion_ops can be used like > regular kernel tcp-cc through sysctl and setsockopt. e.g. > [root@arch-fb-vm1 bpf]# sysctl -a | egrep congestion > net.ipv4.tcp_allowed_congestion_control = reno cubic bpf_cubic > net.ipv4.tcp_available_congestion_control = reno bic cubic bpf_cubic > net.ipv4.tcp_congestion_control = bpf_cubic > > There has been attempt to move the TCP CC to the user space > (e.g. CCP in TCP). The common arguments are faster turn around, > get away from long-tail kernel versions in production...etc, > which are legit points. > > BPF has been the continuous effort to join both kernel and > userspace upsides together (e.g. XDP to gain the performance > advantage without bypassing the kernel). The recent BPF > advancements (in particular BTF-aware verifier, BPF trampoline, > BPF CO-RE...) made implementing kernel struct ops (e.g. tcp cc) > possible in BPF. It allows a faster turnaround for testing algorithm > in the production while leveraging the existing (and continue growing) > BPF feature/framework instead of building one specifically for > userspace TCP CC. > > This patch allows write access to a few fields in tcp-sock > (in bpf_tcp_ca_btf_struct_access()). > > The optional "get_info" is unsupported now. It can be added > later. One possible way is to output the info with a btf-id > to describe the content. > > Signed-off-by: Martin KaFai Lau <kafai@xxxxxx> > --- > include/linux/filter.h | 2 + > include/net/tcp.h | 1 + > kernel/bpf/bpf_struct_ops_types.h | 7 +- > net/core/filter.c | 2 +- > net/ipv4/Makefile | 4 + > net/ipv4/bpf_tcp_ca.c | 225 ++++++++++++++++++++++++++++++ > net/ipv4/tcp_cong.c | 14 +- > net/ipv4/tcp_ipv4.c | 6 +- > net/ipv4/tcp_minisocks.c | 4 +- > net/ipv4/tcp_output.c | 4 +- > 10 files changed, 254 insertions(+), 15 deletions(-) > create mode 100644 net/ipv4/bpf_tcp_ca.c > > diff --git a/include/linux/filter.h b/include/linux/filter.h > index 37ac7025031d..7c22c5e6528d 100644 > --- a/include/linux/filter.h > +++ b/include/linux/filter.h > @@ -844,6 +844,8 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog); > int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, > bpf_aux_classic_check_t trans, bool save_orig); > void bpf_prog_destroy(struct bpf_prog *fp); > +const struct bpf_func_proto * > +bpf_base_func_proto(enum bpf_func_id func_id); > > int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); > int sk_attach_bpf(u32 ufd, struct sock *sk); > diff --git a/include/net/tcp.h b/include/net/tcp.h > index 86b9a8766648..fd87fa1df603 100644 > --- a/include/net/tcp.h > +++ b/include/net/tcp.h > @@ -1007,6 +1007,7 @@ enum tcp_ca_ack_event_flags { > #define TCP_CONG_NON_RESTRICTED 0x1 > /* Requires ECN/ECT set on all packets */ > #define TCP_CONG_NEEDS_ECN 0x2 > +#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) > > union tcp_cc_info; > > diff --git a/kernel/bpf/bpf_struct_ops_types.h b/kernel/bpf/bpf_struct_ops_types.h > index 7bb13ff49ec2..066d83ea1c99 100644 > --- a/kernel/bpf/bpf_struct_ops_types.h > +++ b/kernel/bpf/bpf_struct_ops_types.h > @@ -1,4 +1,9 @@ > /* SPDX-License-Identifier: GPL-2.0 */ > /* internal file - do not include directly */ > > -/* To be filled in a later patch */ > +#ifdef CONFIG_BPF_JIT > +#ifdef CONFIG_INET > +#include <net/tcp.h> > +BPF_STRUCT_OPS_TYPE(tcp_congestion_ops) > +#endif > +#endif > diff --git a/net/core/filter.c b/net/core/filter.c > index a411f7835dee..fbb3698026bd 100644 > --- a/net/core/filter.c > +++ b/net/core/filter.c > @@ -5975,7 +5975,7 @@ bool bpf_helper_changes_pkt_data(void *func) > return false; > } > > -static const struct bpf_func_proto * > +const struct bpf_func_proto * > bpf_base_func_proto(enum bpf_func_id func_id) > { > switch (func_id) { > diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile > index d57ecfaf89d4..7360d9b3eaad 100644 > --- a/net/ipv4/Makefile > +++ b/net/ipv4/Makefile > @@ -65,3 +65,7 @@ obj-$(CONFIG_NETLABEL) += cipso_ipv4.o > > obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ > xfrm4_output.o xfrm4_protocol.o > + > +ifeq ($(CONFIG_BPF_SYSCALL),y) > +obj-$(CONFIG_BPF_JIT) += bpf_tcp_ca.o > +endif > diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c > new file mode 100644 > index 000000000000..967af987bc26 > --- /dev/null > +++ b/net/ipv4/bpf_tcp_ca.c > @@ -0,0 +1,225 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* Copyright (c) 2019 Facebook */ > + > +#include <linux/types.h> > +#include <linux/bpf_verifier.h> > +#include <linux/bpf.h> > +#include <linux/btf.h> > +#include <linux/filter.h> > +#include <net/tcp.h> > + > +static u32 optional_ops[] = { > + offsetof(struct tcp_congestion_ops, init), > + offsetof(struct tcp_congestion_ops, release), > + offsetof(struct tcp_congestion_ops, set_state), > + offsetof(struct tcp_congestion_ops, cwnd_event), > + offsetof(struct tcp_congestion_ops, in_ack_event), > + offsetof(struct tcp_congestion_ops, pkts_acked), > + offsetof(struct tcp_congestion_ops, min_tso_segs), > + offsetof(struct tcp_congestion_ops, sndbuf_expand), > + offsetof(struct tcp_congestion_ops, cong_control), > +}; > + > +static u32 unsupported_ops[] = { > + offsetof(struct tcp_congestion_ops, get_info), > +}; Could you elaborate by adding some comments at least how required fields are handled? > + > +static const struct btf_type *tcp_sock_type; > +static u32 tcp_sock_id, sock_id; > + > +static int bpf_tcp_ca_init(struct btf *_btf_vmlinux) > +{ > + s32 type_id; > + > + type_id = btf_find_by_name_kind(_btf_vmlinux, "sock", BTF_KIND_STRUCT); > + if (type_id < 0) > + return -EINVAL; > + sock_id = type_id; > + > + type_id = btf_find_by_name_kind(_btf_vmlinux, "tcp_sock", > + BTF_KIND_STRUCT); > + if (type_id < 0) > + return -EINVAL; > + tcp_sock_id = type_id; > + tcp_sock_type = btf_type_by_id(_btf_vmlinux, tcp_sock_id); > + > + return 0; > +} > + > +static bool check_optional(u32 member_offset) > +{ > + unsigned int i; > + > + for (i = 0; i < ARRAY_SIZE(optional_ops); i++) { > + if (member_offset == optional_ops[i]) > + return true; > + } > + > + return false; > +} > + > +static bool check_unsupported(u32 member_offset) > +{ > + unsigned int i; > + > + for (i = 0; i < ARRAY_SIZE(unsupported_ops); i++) { > + if (member_offset == unsupported_ops[i]) > + return true; > + } > + > + return false; > +} > + [...] > + > +static const struct bpf_verifier_ops bpf_tcp_ca_verifier_ops = { > + .get_func_proto = bpf_tcp_ca_get_func_proto, > + .is_valid_access = bpf_tcp_ca_is_valid_access, > + .btf_struct_access = bpf_tcp_ca_btf_struct_access, > +}; > + > +static int bpf_tcp_ca_init_member(const struct btf_type *t, > + const struct btf_member *member, > + void *kdata, const void *udata) > +{ > + const struct tcp_congestion_ops *utcp_ca; > + struct tcp_congestion_ops *tcp_ca; > + size_t tcp_ca_name_len; > + int prog_fd; > + u32 moff; > + > + utcp_ca = (const struct tcp_congestion_ops *)udata; > + tcp_ca = (struct tcp_congestion_ops *)kdata; > + > + moff = btf_member_bit_offset(t, member) / 8; > + switch (moff) { > + case offsetof(struct tcp_congestion_ops, flags): > + if (utcp_ca->flags & ~TCP_CONG_MASK) > + return -EINVAL; > + tcp_ca->flags = utcp_ca->flags; > + return 1; > + case offsetof(struct tcp_congestion_ops, name): > + tcp_ca_name_len = strnlen(utcp_ca->name, sizeof(utcp_ca->name)); > + if (!tcp_ca_name_len || > + tcp_ca_name_len == sizeof(utcp_ca->name)) > + return -EINVAL; > + memcpy(tcp_ca->name, utcp_ca->name, sizeof(tcp_ca->name)); > + return 1; > + } > + > + if (!btf_type_resolve_func_ptr(btf_vmlinux, member->type, NULL)) > + return 0; > + > + prog_fd = (int)(*(unsigned long *)(udata + moff)); > + if (!prog_fd && !check_optional(moff) && !check_unsupported(moff)) > + return -EINVAL; So if a member is option or unsupported, we will return -EINVAL? I probably miss something here. > + > + return 0; > +} > + > +static int bpf_tcp_ca_check_member(const struct btf_type *t, > + const struct btf_member *member) > +{ > + if (check_unsupported(btf_member_bit_offset(t, member) / 8)) > + return -ENOTSUPP; > + return 0; > +} > + > +static int bpf_tcp_ca_reg(void *kdata) > +{ > + return tcp_register_congestion_control(kdata); > +} > + > +static void bpf_tcp_ca_unreg(void *kdata) > +{ > + tcp_unregister_congestion_control(kdata); > +} > + [...]