On Tue, Jan 23, 2024 at 3:51 PM Martin KaFai Lau <martin.lau@xxxxxxxxx> wrote:
>
> On 1/17/24 1:56 PM, Amery Hung wrote:
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index 0bb92414c036..df280bbb7c0d 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -997,6 +997,7 @@ enum bpf_prog_type {
> >   	BPF_PROG_TYPE_SK_LOOKUP,
> >   	BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
> >   	BPF_PROG_TYPE_NETFILTER,
> > +	BPF_PROG_TYPE_QDISC,
> >   };
> >
> >   enum bpf_attach_type {
> > @@ -1056,6 +1057,8 @@ enum bpf_attach_type {
> >   	BPF_CGROUP_UNIX_GETSOCKNAME,
> >   	BPF_NETKIT_PRIMARY,
> >   	BPF_NETKIT_PEER,
> > +	BPF_QDISC_ENQUEUE,
> > +	BPF_QDISC_DEQUEUE,
> >   	__MAX_BPF_ATTACH_TYPE
> >   };
> >
> > @@ -7357,4 +7360,22 @@ struct bpf_iter_num {
> >   	__u64 __opaque[1];
> >   } __attribute__((aligned(8)));
> >
> > +struct bpf_qdisc_ctx {
> > +	__bpf_md_ptr(struct sk_buff *, skb);
> > +	__u32 classid;
> > +	__u64 expire;
> > +	__u64 delta_ns;
> > +};
> > +
> > +enum {
> > +	SCH_BPF_QUEUED,
> > +	SCH_BPF_DEQUEUED = SCH_BPF_QUEUED,
> > +	SCH_BPF_DROP,
> > +	SCH_BPF_CN,
> > +	SCH_BPF_THROTTLE,
> > +	SCH_BPF_PASS,
> > +	SCH_BPF_BYPASS,
> > +	SCH_BPF_STOLEN,
> > +};
> > +
> >   #endif /* _UAPI__LINUX_BPF_H__ */
>
> [ ... ]
>
> > +static bool tc_qdisc_is_valid_access(int off, int size,
> > +				     enum bpf_access_type type,
> > +				     const struct bpf_prog *prog,
> > +				     struct bpf_insn_access_aux *info)
> > +{
> > +	struct btf *btf;
> > +
> > +	if (off < 0 || off >= sizeof(struct bpf_qdisc_ctx))
> > +		return false;
> > +
> > +	switch (off) {
> > +	case offsetof(struct bpf_qdisc_ctx, skb):
> > +		if (type == BPF_WRITE ||
> > +		    size != sizeof_field(struct bpf_qdisc_ctx, skb))
> > +			return false;
> > +
> > +		if (prog->expected_attach_type != BPF_QDISC_ENQUEUE)
> > +			return false;
> > +
> > +		btf = bpf_get_btf_vmlinux();
> > +		if (IS_ERR_OR_NULL(btf))
> > +			return false;
> > +
> > +		info->btf = btf;
> > +		info->btf_id = tc_qdisc_ctx_access_btf_ids[0];
> > +		info->reg_type = PTR_TO_BTF_ID | PTR_TRUSTED;
> > +		return true;
> > +	case bpf_ctx_range(struct bpf_qdisc_ctx, classid):
> > +		return size == sizeof_field(struct bpf_qdisc_ctx, classid);
> > +	case bpf_ctx_range(struct bpf_qdisc_ctx, expire):
> > +		return size == sizeof_field(struct bpf_qdisc_ctx, expire);
> > +	case bpf_ctx_range(struct bpf_qdisc_ctx, delta_ns):
> > +		return size == sizeof_field(struct bpf_qdisc_ctx, delta_ns);
> > +	default:
> > +		return false;
> > +	}
> > +
> > +	return false;
> > +}
> > +
>
> [ ... ]
>
> > +static int sch_bpf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
> > +			   struct sk_buff **to_free)
> > +{
> > +	struct bpf_sched_data *q = qdisc_priv(sch);
> > +	unsigned int len = qdisc_pkt_len(skb);
> > +	struct bpf_qdisc_ctx ctx = {};
> > +	int res = NET_XMIT_SUCCESS;
> > +	struct sch_bpf_class *cl;
> > +	struct bpf_prog *enqueue;
> > +
> > +	enqueue = rcu_dereference(q->enqueue_prog.prog);
> > +	if (!enqueue)
> > +		return NET_XMIT_DROP;
> > +
> > +	ctx.skb = skb;
> > +	ctx.classid = sch->handle;
> > +	res = bpf_prog_run(enqueue, &ctx);
> > +	switch (res) {
> > +	case SCH_BPF_THROTTLE:
> > +		qdisc_watchdog_schedule_range_ns(&q->watchdog, ctx.expire, ctx.delta_ns);
> > +		qdisc_qstats_overlimit(sch);
> > +		fallthrough;
> > +	case SCH_BPF_QUEUED:
> > +		qdisc_qstats_backlog_inc(sch, skb);
> > +		return NET_XMIT_SUCCESS;
> > +	case SCH_BPF_BYPASS:
> > +		qdisc_qstats_drop(sch);
> > +		__qdisc_drop(skb, to_free);
> > +		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
> > +	case SCH_BPF_STOLEN:
> > +		__qdisc_drop(skb, to_free);
> > +		return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
> > +	case SCH_BPF_CN:
> > +		return NET_XMIT_CN;
> > +	case SCH_BPF_PASS:
> > +		break;
> > +	default:
> > +		return qdisc_drop(skb, sch, to_free);
> > +	}
> > +
> > +	cl = sch_bpf_find(sch, ctx.classid);
> > +	if (!cl || !cl->qdisc)
> > +		return qdisc_drop(skb, sch, to_free);
> > +
> > +	res = qdisc_enqueue(skb, cl->qdisc, to_free);
> > +	if (res != NET_XMIT_SUCCESS) {
> > +		if (net_xmit_drop_count(res)) {
> > +			qdisc_qstats_drop(sch);
> > +			cl->drops++;
> > +		}
> > +		return res;
> > +	}
> > +
> > +	sch->qstats.backlog += len;
> > +	sch->q.qlen++;
> > +	return res;
> > +}
> > +
> > +DEFINE_PER_CPU(struct sk_buff*, bpf_skb_dequeue);
> > +
> > +static struct sk_buff *sch_bpf_dequeue(struct Qdisc *sch)
> > +{
> > +	struct bpf_sched_data *q = qdisc_priv(sch);
> > +	struct bpf_qdisc_ctx ctx = {};
> > +	struct sk_buff *skb = NULL;
> > +	struct bpf_prog *dequeue;
> > +	struct sch_bpf_class *cl;
> > +	int res;
> > +
> > +	dequeue = rcu_dereference(q->dequeue_prog.prog);
> > +	if (!dequeue)
> > +		return NULL;
> > +
> > +	__this_cpu_write(bpf_skb_dequeue, NULL);
> > +	ctx.classid = sch->handle;
> > +	res = bpf_prog_run(dequeue, &ctx);
> > +	switch (res) {
> > +	case SCH_BPF_DEQUEUED:
> > +		skb = __this_cpu_read(bpf_skb_dequeue);
> > +		qdisc_bstats_update(sch, skb);
> > +		qdisc_qstats_backlog_dec(sch, skb);
> > +		break;
> > +	case SCH_BPF_THROTTLE:
> > +		qdisc_watchdog_schedule_range_ns(&q->watchdog, ctx.expire, ctx.delta_ns);
> > +		qdisc_qstats_overlimit(sch);
> > +		cl = sch_bpf_find(sch, ctx.classid);
> > +		if (cl)
> > +			cl->overlimits++;
> > +		return NULL;
> > +	case SCH_BPF_PASS:
> > +		cl = sch_bpf_find(sch, ctx.classid);
> > +		if (!cl || !cl->qdisc)
> > +			return NULL;
> > +		skb = qdisc_dequeue_peeked(cl->qdisc);
> > +		if (skb) {
> > +			bstats_update(&cl->bstats, skb);
> > +			qdisc_bstats_update(sch, skb);
> > +			qdisc_qstats_backlog_dec(sch, skb);
> > +			sch->q.qlen--;
> > +		}
> > +		break;
> > +	}
> > +
> > +	return skb;
> > +}
>
> [ ... ]
>
> > +static int sch_bpf_init(struct Qdisc *sch, struct nlattr *opt,
> > +			struct netlink_ext_ack *extack)
> > +{
> > +	struct bpf_sched_data *q = qdisc_priv(sch);
> > +	int err;
> > +
> > +	qdisc_watchdog_init(&q->watchdog, sch);
> > +	if (opt) {
> > +		err = sch_bpf_change(sch, opt, extack);
> > +		if (err)
> > +			return err;
> > +	}
> > +
> > +	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
> > +	if (err)
> > +		return err;
> > +
> > +	return qdisc_class_hash_init(&q->clhash);
> > +}
> > +
> > +static void sch_bpf_reset(struct Qdisc *sch)
> > +{
> > +	struct bpf_sched_data *q = qdisc_priv(sch);
> > +	struct sch_bpf_class *cl;
> > +	unsigned int i;
> > +
> > +	for (i = 0; i < q->clhash.hashsize; i++) {
> > +		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
> > +			if (cl->qdisc)
> > +				qdisc_reset(cl->qdisc);
> > +		}
> > +	}
> > +
> > +	qdisc_watchdog_cancel(&q->watchdog);
> > +}
> > +
>
> [ ... ]
>
> > +static const struct Qdisc_class_ops sch_bpf_class_ops = {
> > +	.graft = sch_bpf_graft,
> > +	.leaf = sch_bpf_leaf,
> > +	.find = sch_bpf_search,
> > +	.change = sch_bpf_change_class,
> > +	.delete = sch_bpf_delete,
> > +	.tcf_block = sch_bpf_tcf_block,
> > +	.bind_tcf = sch_bpf_bind,
> > +	.unbind_tcf = sch_bpf_unbind,
> > +	.dump = sch_bpf_dump_class,
> > +	.dump_stats = sch_bpf_dump_class_stats,
> > +	.walk = sch_bpf_walk,
> > +};
> > +
> > +static struct Qdisc_ops sch_bpf_qdisc_ops __read_mostly = {
> > +	.cl_ops = &sch_bpf_class_ops,
> > +	.id = "bpf",
> > +	.priv_size = sizeof(struct bpf_sched_data),
> > +	.enqueue = sch_bpf_enqueue,
> > +	.dequeue = sch_bpf_dequeue,
>
> I looked at the high level of the patchset. The major ops that it wants to be
> programmable in bpf are the ".enqueue" and ".dequeue" (+ ".init" and ".reset" in
> patch 4 and patch 5).
>
> This patch adds a new prog type BPF_PROG_TYPE_QDISC, four attach types (each for
> ".enqueue", ".dequeue", ".init", and ".reset"), and a new "bpf_qdisc_ctx" in the
> uapi. It is no longer an acceptable way to add new bpf extensions.
>
> Can the ".enqueue", ".dequeue", ".init", and ".reset" be completely implemented
> in bpf (with the help of new kfuncs if needed)? Then a struct_ops for Qdisc_ops
> can be created. The bpf Qdisc_ops can be loaded through the existing struct_ops api.
>

Partially. If using struct_ops, I think we'll need another structure like the
following in the bpf qdisc, to be implemented with struct_ops bpf:

struct bpf_qdisc_ops {
	int (*enqueue)(struct sk_buff *skb);
	void (*dequeue)(void);
	void (*init)(void);
	void (*reset)(void);
};

Then, Qdisc_ops will wrap around them to handle things that cannot be
implemented with bpf (e.g., sch_tree_lock, returning a skb ptr).

> If other ops (like ".dump", ".dump_stats"...) do not have a good use case to be
> programmable in bpf, they can stay with the kernel implementation for now and
> only allow the userspace to load a bpf Qdisc_ops with .enqueue/.dequeue/.init/
> .reset implemented.
>
> You mentioned in the cover letter that:
> "Current struct_ops attachment model does not seem to support replacing only
> functions of a specific instance of a module, but I might be wrong."
>
> I assumed you meant allowing bpf to replace only "some" ops of the Qdisc_ops? Yes,
> it can be done through the struct_ops's ".init_member". Take a look at
> bpf_tcp_ca_init_member. The kernel can assign the kernel implementation for
> ".dump" (for example) when loading the bpf Qdisc_ops.
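
For concreteness, such an ".init_member" fallback might look roughly like the
sketch below, modeled on bpf_tcp_ca_init_member(). The function name, the
members kept in the kernel, and the defaults chosen here are placeholders for
illustration, not part of the posted patchset:

static int bpf_qdisc_init_member(const struct btf_type *t,
				 const struct btf_member *member,
				 void *kdata, const void *udata)
{
	struct Qdisc_ops *ops = kdata;
	u32 moff = __btf_member_bit_offset(t, member) / 8;

	switch (moff) {
	case offsetof(struct Qdisc_ops, peek):
		/* Keep the stock kernel helper instead of a bpf program. */
		ops->peek = qdisc_peek_dequeued;
		return 1;
	case offsetof(struct Qdisc_ops, dump):
		/* Not programmable in bpf for now: keep the built-in dump. */
		ops->dump = sch_bpf_dump;
		return 1;
	}

	/* Returning 0 leaves the member to the generic struct_ops code,
	 * which wires up the bpf program for function pointer members.
	 */
	return 0;
}
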
I have no problem with partially replacing a struct, which, like you
mentioned, has been demonstrated by congestion control and sched_ext. What I
am not sure about is the ability to create multiple bpf qdiscs, where each
has a different ".enqueue", ".dequeue", and so on. I like the struct_ops
approach and would love to try it if struct_ops supports this.

Thanks,
Amery

> > +	.peek = qdisc_peek_dequeued,
> > +	.init = sch_bpf_init,
> > +	.reset = sch_bpf_reset,
> > +	.destroy = sch_bpf_destroy,
> > +	.change = sch_bpf_change,
> > +	.dump = sch_bpf_dump,
> > +	.dump_stats = sch_bpf_dump_stats,
> > +	.owner = THIS_MODULE,
> > +};
> > +
> > +static int __init sch_bpf_mod_init(void)
> > +{
> > +	return register_qdisc(&sch_bpf_qdisc_ops);
> > +}
> > +
> > +static void __exit sch_bpf_mod_exit(void)
> > +{
> > +	unregister_qdisc(&sch_bpf_qdisc_ops);
> > +}
>
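
For what it's worth, under the struct_ops direction the bpf side of one qdisc
could end up looking roughly like the sketch below, following the
bpf_qdisc_ops wrapper idea above. Everything here is an assumption: the
section names, the bpf_qdisc_skb_stash() kfunc (a stand-in for "hand the
kernel wrapper an skb to return from .dequeue"), and especially whether a
second such map would really register a second, independent qdisc, which is
the open multiple-qdisc question:

/* fifo.bpf.c - hypothetical bpf-side half of one qdisc instance */
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

/* Assumed kfunc: stash an skb for the kernel Qdisc_ops wrapper to return. */
extern void bpf_qdisc_skb_stash(struct sk_buff *skb) __ksym;

SEC("struct_ops/fifo_enqueue")
int BPF_PROG(fifo_enqueue, struct sk_buff *skb)
{
	/* Classify and queue the skb in a bpf-side queue (not shown). */
	return 0;	/* e.g. "queued"; the return convention is open */
}

SEC("struct_ops/fifo_dequeue")
void BPF_PROG(fifo_dequeue)
{
	struct sk_buff *skb;

	/* Pop the next skb from the bpf-side queue (not shown), then stash
	 * it so the kernel wrapper can return it from .dequeue.
	 */
	skb = NULL;	/* placeholder for the real lookup */
	if (skb)
		bpf_qdisc_skb_stash(skb);
}

SEC(".struct_ops.link")
struct bpf_qdisc_ops fifo = {
	.enqueue = (void *)fifo_enqueue,
	.dequeue = (void *)fifo_dequeue,
};

Loading a second ".struct_ops.link" map with its own programs would, in
principle, be the second qdisc; whether the current struct_ops infrastructure
actually supports that is exactly the part that still needs confirming.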