Allow developers to implement customized reset logic through an optional reset program. The program also takes a bpf_qdisc_ctx as its context, but currently cannot access any of its fields. To release skbs, the program can release all references to the bpf lists or rbtrees serving as skb queues. The destructor kfunc bpf_skb_destroy() will be called by bpf_map_free_deferred(). Deferring the skb destruction this way prevents the qdisc from holding the sch_tree_lock for too long when there are many packets in the qdisc. Signed-off-by: Amery Hung <amery.hung@xxxxxxxxxxxxx> --- include/uapi/linux/bpf.h | 1 + include/uapi/linux/pkt_sched.h | 4 ++++ kernel/bpf/syscall.c | 1 + net/core/filter.c | 3 +++ net/sched/sch_bpf.c | 30 ++++++++++++++++++++++++++---- tools/include/uapi/linux/bpf.h | 1 + 6 files changed, 36 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index df280bbb7c0d..84669886a493 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1059,6 +1059,7 @@ enum bpf_attach_type { BPF_NETKIT_PEER, BPF_QDISC_ENQUEUE, BPF_QDISC_DEQUEUE, + BPF_QDISC_RESET, __MAX_BPF_ATTACH_TYPE }; diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index d05462309f5a..e9e1a83c22f7 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -1328,6 +1328,10 @@ enum { TCA_SCH_BPF_DEQUEUE_PROG_FD, /* u32 */ TCA_SCH_BPF_DEQUEUE_PROG_ID, /* u32 */ TCA_SCH_BPF_DEQUEUE_PROG_TAG, /* data */ + TCA_SCH_BPF_RESET_PROG_NAME, /* string */ + TCA_SCH_BPF_RESET_PROG_FD, /* u32 */ + TCA_SCH_BPF_RESET_PROG_ID, /* u32 */ + TCA_SCH_BPF_RESET_PROG_TAG, /* data */ __TCA_SCH_BPF_MAX, }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1838bddd8526..9af6fa542f2e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2506,6 +2506,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, switch (expected_attach_type) { case BPF_QDISC_ENQUEUE: case BPF_QDISC_DEQUEUE: + case BPF_QDISC_RESET: return 0; default: return -EINVAL; 
diff --git a/net/core/filter.c b/net/core/filter.c index f25a0b6b5d56..f8e17465377f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8905,6 +8905,9 @@ static bool tc_qdisc_is_valid_access(int off, int size, { struct btf *btf; + if (prog->expected_attach_type == BPF_QDISC_RESET) + return false; + if (off < 0 || off >= sizeof(struct bpf_qdisc_ctx)) return false; diff --git a/net/sched/sch_bpf.c b/net/sched/sch_bpf.c index 1910a58a3352..3f0f809dced6 100644 --- a/net/sched/sch_bpf.c +++ b/net/sched/sch_bpf.c @@ -42,6 +42,7 @@ struct bpf_sched_data { struct Qdisc_class_hash clhash; struct sch_bpf_prog __rcu enqueue_prog; struct sch_bpf_prog __rcu dequeue_prog; + struct sch_bpf_prog __rcu reset_prog; struct qdisc_watchdog watchdog; }; @@ -51,6 +52,9 @@ static int sch_bpf_dump_prog(const struct sch_bpf_prog *prog, struct sk_buff *sk { struct nlattr *nla; + if (!prog->prog) + return 0; + if (prog->name && nla_put_string(skb, name, prog->name)) return -EMSGSIZE; @@ -81,6 +85,9 @@ static int sch_bpf_dump(struct Qdisc *sch, struct sk_buff *skb) if (sch_bpf_dump_prog(&q->dequeue_prog, skb, TCA_SCH_BPF_DEQUEUE_PROG_NAME, TCA_SCH_BPF_DEQUEUE_PROG_ID, TCA_SCH_BPF_DEQUEUE_PROG_TAG)) goto nla_put_failure; + if (sch_bpf_dump_prog(&q->reset_prog, skb, TCA_SCH_BPF_RESET_PROG_NAME, + TCA_SCH_BPF_RESET_PROG_ID, TCA_SCH_BPF_RESET_PROG_TAG)) + goto nla_put_failure; return nla_nest_end(skb, opts); @@ -259,16 +266,21 @@ static const struct nla_policy sch_bpf_policy[TCA_SCH_BPF_MAX + 1] = { [TCA_SCH_BPF_DEQUEUE_PROG_FD] = { .type = NLA_U32 }, [TCA_SCH_BPF_DEQUEUE_PROG_NAME] = { .type = NLA_NUL_STRING, .len = ACT_BPF_NAME_LEN }, + [TCA_SCH_BPF_RESET_PROG_FD] = { .type = NLA_U32 }, + [TCA_SCH_BPF_RESET_PROG_NAME] = { .type = NLA_NUL_STRING, + .len = ACT_BPF_NAME_LEN }, }; -static int bpf_init_prog(struct nlattr *fd, struct nlattr *name, struct sch_bpf_prog *prog) +static int bpf_init_prog(struct nlattr *fd, struct nlattr *name, + struct sch_bpf_prog *prog, bool optional) { struct 
bpf_prog *fp, *old_fp; char *prog_name = NULL; u32 bpf_fd; if (!fd) - return -EINVAL; + return optional ? 0 : -EINVAL; + bpf_fd = nla_get_u32(fd); fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_QDISC); @@ -327,11 +339,15 @@ static int sch_bpf_change(struct Qdisc *sch, struct nlattr *opt, sch_tree_lock(sch); err = bpf_init_prog(tb[TCA_SCH_BPF_ENQUEUE_PROG_FD], - tb[TCA_SCH_BPF_ENQUEUE_PROG_NAME], &q->enqueue_prog); + tb[TCA_SCH_BPF_ENQUEUE_PROG_NAME], &q->enqueue_prog, false); if (err) goto failure; err = bpf_init_prog(tb[TCA_SCH_BPF_DEQUEUE_PROG_FD], - tb[TCA_SCH_BPF_DEQUEUE_PROG_NAME], &q->dequeue_prog); + tb[TCA_SCH_BPF_DEQUEUE_PROG_NAME], &q->dequeue_prog, false); + if (err) + goto failure; + err = bpf_init_prog(tb[TCA_SCH_BPF_RESET_PROG_FD], + tb[TCA_SCH_BPF_RESET_PROG_NAME], &q->reset_prog, true); failure: sch_tree_unlock(sch); return err; @@ -360,7 +376,9 @@ static int sch_bpf_init(struct Qdisc *sch, struct nlattr *opt, static void sch_bpf_reset(struct Qdisc *sch) { struct bpf_sched_data *q = qdisc_priv(sch); + struct bpf_qdisc_ctx ctx = {}; struct sch_bpf_class *cl; + struct bpf_prog *reset; unsigned int i; for (i = 0; i < q->clhash.hashsize; i++) { @@ -371,6 +389,9 @@ static void sch_bpf_reset(struct Qdisc *sch) } qdisc_watchdog_cancel(&q->watchdog); + reset = rcu_dereference(q->reset_prog.prog); + if (reset) + bpf_prog_run(reset, &ctx); } static void sch_bpf_destroy_class(struct Qdisc *sch, struct sch_bpf_class *cl) @@ -398,6 +419,7 @@ static void sch_bpf_destroy(struct Qdisc *sch) sch_tree_lock(sch); bpf_cleanup_prog(&q->enqueue_prog); bpf_cleanup_prog(&q->dequeue_prog); + bpf_cleanup_prog(&q->reset_prog); sch_tree_unlock(sch); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index df280bbb7c0d..84669886a493 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1059,6 +1059,7 @@ enum bpf_attach_type { BPF_NETKIT_PEER, BPF_QDISC_ENQUEUE, BPF_QDISC_DEQUEUE, + BPF_QDISC_RESET, __MAX_BPF_ATTACH_TYPE 
}; -- 2.20.1