On Fri, Sep 22, 2023 at 4:12 AM Daniel Borkmann <daniel@xxxxxxxxxxxxx> wrote: > > On 9/20/23 1:20 AM, Jamal Hadi Salim wrote: > > On Tue, Sep 19, 2023 at 6:15 PM Daniel Borkmann <daniel@xxxxxxxxxxxxx> wrote: > >> > >> [ +Martin, bpf ] > >> > >> On 9/19/23 4:59 PM, Victor Nogueira wrote: > >>> Currently there is no way to distinguish between an error and a > >>> classification verdict. This patch adds the verdict field as a part of > >>> struct tcf_result. That way, tcf_classify can return a proper > >>> error number when it fails, and we keep the classification result > >>> information encapsulated in struct tcf_result. > >>> > >>> Also add values SKB_DROP_REASON_TC_EGRESS_ERROR and > >>> SKB_DROP_REASON_TC_INGRESS_ERROR to enum skb_drop_reason. > >>> With that we can distinguish between a drop from a processing error versus > >>> a drop from classification. > >>> > >>> Signed-off-by: Victor Nogueira <victor@xxxxxxxxxxxx> > >>> --- > >>> include/net/dropreason-core.h | 6 +++++ > >>> include/net/sch_generic.h | 7 ++++++ > >>> net/core/dev.c | 42 ++++++++++++++++++++++++++--------- > >>> net/sched/cls_api.c | 38 ++++++++++++++++++++----------- > >>> net/sched/sch_cake.c | 32 +++++++++++++------------- > >>> net/sched/sch_drr.c | 33 +++++++++++++-------------- > >>> net/sched/sch_ets.c | 6 +++-- > >>> net/sched/sch_fq_codel.c | 29 ++++++++++++------------ > >>> net/sched/sch_fq_pie.c | 28 +++++++++++------------ > >>> net/sched/sch_hfsc.c | 6 +++-- > >>> net/sched/sch_htb.c | 6 +++-- > >>> net/sched/sch_multiq.c | 6 +++-- > >>> net/sched/sch_prio.c | 7 ++++-- > >>> net/sched/sch_qfq.c | 34 +++++++++++++--------------- > >>> net/sched/sch_sfb.c | 29 ++++++++++++------------ > >>> net/sched/sch_sfq.c | 28 +++++++++++------------ > >>> 16 files changed, 195 insertions(+), 142 deletions(-) > >>> > >>> diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h > >>> index a587e83fc169..b1c069c8e7f2 100644 > >>> --- a/include/net/dropreason-core.h > >>> +++ b/include/net/dropreason-core.h > >>> @@ -80,6 +80,8 @@ > >>> FN(IPV6_NDISC_BAD_OPTIONS) \ > >>> FN(IPV6_NDISC_NS_OTHERHOST) \ > >>> FN(QUEUE_PURGE) \ > >>> + FN(TC_EGRESS_ERROR) \ > >>> + FN(TC_INGRESS_ERROR) \ > >>> FNe(MAX) > >>> > >>> /** > >>> @@ -345,6 +347,10 @@ enum skb_drop_reason { > >>> SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST, > >>> /** @SKB_DROP_REASON_QUEUE_PURGE: bulk free. */ > >>> SKB_DROP_REASON_QUEUE_PURGE, > >>> + /** @SKB_DROP_REASON_TC_EGRESS_ERROR: dropped in TC egress HOOK due to error */ > >>> + SKB_DROP_REASON_TC_EGRESS_ERROR, > >>> + /** @SKB_DROP_REASON_TC_INGRESS_ERROR: dropped in TC ingress HOOK due to error */ > >>> + SKB_DROP_REASON_TC_INGRESS_ERROR, > >>> /** > >>> * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which > >>> * shouldn't be used as a real 'reason' - only for tracing code gen > >>> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h > >>> index f232512505f8..9a3f71d2545e 100644 > >>> --- a/include/net/sch_generic.h > >>> +++ b/include/net/sch_generic.h > >>> @@ -326,6 +326,7 @@ struct Qdisc_ops { > >>> > >>> > >>> struct tcf_result { > >>> + u32 verdict; > >>> union { > >>> struct { > >>> unsigned long class; > >>> @@ -336,6 +337,12 @@ struct tcf_result { > >>> }; > >>> }; > >>> > >>> +static inline void tcf_result_set_verdict(struct tcf_result *res, > >>> + const u32 verdict) > >>> +{ > >>> + res->verdict = verdict; > >>> +} > >>> + > >>> struct tcf_chain; > >>> > >>> struct tcf_proto_ops { > >>> diff --git a/net/core/dev.c b/net/core/dev.c > >>> index ccff2b6ef958..1450f4741d9b 100644 > >>> --- a/net/core/dev.c > >>> +++ b/net/core/dev.c > >>> @@ -3910,31 +3910,39 @@ EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); > >>> #endif /* CONFIG_NET_EGRESS */ > >>> > >>> #ifdef CONFIG_NET_XGRESS > >>> -static int tc_run(struct tcx_entry *entry, struct sk_buff *skb) > >>> +static int tc_run(struct tcx_entry *entry, struct sk_buff *skb, > >>> + struct tcf_result *res) > >>> { > >>> - int ret = TC_ACT_UNSPEC; > >>> + int ret = 0; > >>> #ifdef CONFIG_NET_CLS_ACT > >>> struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq); > >>> - struct tcf_result res; > >>> > >>> - if (!miniq) > >>> + if (!miniq) { > >>> + tcf_result_set_verdict(res, TC_ACT_UNSPEC); > >>> return ret; > >>> + } > >>> > >>> tc_skb_cb(skb)->mru = 0; > >>> tc_skb_cb(skb)->post_ct = false; > >>> > >>> mini_qdisc_bstats_cpu_update(miniq, skb); > >>> - ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false); > >>> + ret = tcf_classify(skb, miniq->block, miniq->filter_list, res, false); > >>> + if (ret < 0) { > >>> + mini_qdisc_qstats_cpu_drop(miniq); > >>> + return ret; > >>> + } > >>> /* Only tcf related quirks below. */ > >>> - switch (ret) { > >>> + switch (res->verdict) { > >>> case TC_ACT_SHOT: > >>> mini_qdisc_qstats_cpu_drop(miniq); > >>> break; > >>> case TC_ACT_OK: > >>> case TC_ACT_RECLASSIFY: > >>> - skb->tc_index = TC_H_MIN(res.classid); > >>> + skb->tc_index = TC_H_MIN(res->classid); > >>> break; > >>> } > >>> +#else > >>> + tcf_result_set_verdict(res, TC_ACT_UNSPEC); > >>> #endif /* CONFIG_NET_CLS_ACT */ > >>> return ret; > >>> } > >>> @@ -3977,6 +3985,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, > >>> struct net_device *orig_dev, bool *another) > >>> { > >>> struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress); > >>> + struct tcf_result res = {0}; > >>> int sch_ret; > >>> > >>> if (!entry) > >>> @@ -3994,9 +4003,14 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, > >>> if (sch_ret != TC_ACT_UNSPEC) > >>> goto ingress_verdict; > >>> } > >>> - sch_ret = tc_run(tcx_entry(entry), skb); > >>> + sch_ret = tc_run(tcx_entry(entry), skb, &res); > >>> + if (sch_ret < 0) { > >>> + kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS_ERROR); > >>> + *ret = NET_RX_DROP; > >>> + return NULL; > >>> + } > >>> ingress_verdict: > >>> - switch (sch_ret) { > >>> + switch (res.verdict) { > >> > >> This breaks tcx, please move all this logic into tc_run(). No changes to sch_handle_ingress() > >> or sch_handle_egress should be necessary, you can then just remap the return code to TC_ACT_SHOT > >> in such case. > > > > I think it is valuable to have a good reason code like > > SKB_DROP_REASON_TC_XXX_ERROR to disambiguate between errors vs > > verdicts in the case of tc_run() variant. > > For tcx_run(), does this look ok (for consistency)?: > > > > if (static_branch_unlikely(&tcx_needed_key)) { > > sch_ret = tcx_run(entry, skb, true); > > if (sch_ret != TC_ACT_UNSPEC) { > > res.verdict = sch_ret; > > goto ingress_verdict; > > } > > } > > In the above case we don't have 'internal' errors which you want to trace, so I would > also love to avoid the cost of zeroing struct tcf_result res which should be 3x 8b for > every packet. We can move the zeroing inside tc_run() but we declare it in the same spot as we do right now. You will still need to set res.verdict as above. Would that work for you? > I was more thinking like something below could be a better choice. I presume your main > goal is to trace where these errors originated in the first place, so it might even be > useful to capture the actual return code as well. The main motivation is a few syzkaller bugs which resulted in not disambiguating between errors being returned and sometimes TC_ACT_SHOT. > Then you can use perf script, bpf and whatnot to gather further insights into what > happened while being less invasive and avoiding the need to extend struct tcf_result. > We could use trace instead - the reason we have the skb reason is being used in the other spots (does this trace require ebpf to be usable?). > This would be quite similar to trace_xdp_exception() as well, and I think you can guarantee > that in fast path all errors are < TC_ACT_UNSPEC anyway. > I am not sure i followed. 0 means success, result codes are returned in res now. cheers, jamal > diff --git a/net/core/dev.c b/net/core/dev.c > index 85df22f05c38..4089d195144d 100644 > --- a/net/core/dev.c > +++ b/net/core/dev.c > @@ -3925,6 +3925,10 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb) > > mini_qdisc_bstats_cpu_update(miniq, skb); > ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false); > + if (unlikely(ret < TC_ACT_UNSPEC)) { > + trace_tc_exception(skb->dev, skb->tc_at_ingress, ret); > + ret = TC_ACT_SHOT; > + } > /* Only tcf related quirks below. */ > switch (ret) { > case TC_ACT_SHOT: > > Best, > Daniel