On 3/14/22 3:15 PM, xiangxia.m.yue@xxxxxxxxx wrote:
[...]
include/linux/netdevice.h | 3 +++
include/linux/rtnetlink.h | 1 +
net/core/dev.c | 31 +++++++++++++++++++++++++++++--
net/sched/act_skbedit.c | 6 +++++-
4 files changed, 38 insertions(+), 3 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0d994710b335..f33fb2d6712a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3065,6 +3065,9 @@ struct softnet_data {
struct {
u16 recursion;
u8 more;
+#ifdef CONFIG_NET_EGRESS
+ u8 skip_txqueue;
+#endif
} xmit;
#ifdef CONFIG_RPS
/* input_queue_head should be written by cpu owning this struct,
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 7f970b16da3a..ae2c6a3cec5d 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -100,6 +100,7 @@ void net_dec_ingress_queue(void);
#ifdef CONFIG_NET_EGRESS
void net_inc_egress_queue(void);
void net_dec_egress_queue(void);
+void netdev_xmit_skip_txqueue(bool skip);
#endif
void rtnetlink_init(void);
diff --git a/net/core/dev.c b/net/core/dev.c
index 75bab5b0dbae..8e83b7099977 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3908,6 +3908,25 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
return skb;
}
+
+static struct netdev_queue *
+netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb)
+{
+ int qm = skb_get_queue_mapping(skb);
+
+ return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm));
+}
+
+static bool netdev_xmit_txqueue_skipped(void)
+{
+ return __this_cpu_read(softnet_data.xmit.skip_txqueue);
+}
+
+void netdev_xmit_skip_txqueue(bool skip)
+{
+ __this_cpu_write(softnet_data.xmit.skip_txqueue, skip);
+}
+EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
#endif /* CONFIG_NET_EGRESS */
#ifdef CONFIG_XPS
@@ -4078,7 +4097,7 @@ struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
{
struct net_device *dev = skb->dev;
- struct netdev_queue *txq;
+ struct netdev_queue *txq = NULL;
struct Qdisc *q;
int rc = -ENOMEM;
bool again = false;
@@ -4106,11 +4125,17 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
if (!skb)
goto out;
}
+
+ netdev_xmit_skip_txqueue(false);
+
nf_skip_egress(skb, true);
skb = sch_handle_egress(skb, &rc, dev);
if (!skb)
goto out;
nf_skip_egress(skb, false);
+
+ if (netdev_xmit_txqueue_skipped())
+ txq = netdev_tx_queue_mapping(dev, skb);
}
#endif
/* If device/qdisc don't need skb->dst, release it right now while
@@ -4121,7 +4146,9 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
else
skb_dst_force(skb);
- txq = netdev_core_pick_tx(dev, skb, sb_dev);
+ if (likely(!txq))
nit: Drop likely(). If the feature is used from sch_handle_egress(), then this would always be the case.
+ txq = netdev_core_pick_tx(dev, skb, sb_dev);
+
q = rcu_dereference_bh(txq->qdisc);
How will the `netdev_xmit_skip_txqueue(true)` be usable from BPF side (see bpf_convert_ctx_access() ->
queue_mapping)?
trace_net_dev_queue(skb);
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index ceba11b198bb..d5799b4fc499 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -58,8 +58,12 @@ static int tcf_skbedit_act(struct sk_buff *skb, const struct tc_action *a,
}
}
if (params->flags & SKBEDIT_F_QUEUE_MAPPING &&
- skb->dev->real_num_tx_queues > params->queue_mapping)
+ skb->dev->real_num_tx_queues > params->queue_mapping) {
+#ifdef CONFIG_NET_EGRESS
+ netdev_xmit_skip_txqueue(true);
+#endif
skb_set_queue_mapping(skb, params->queue_mapping);
+ }
if (params->flags & SKBEDIT_F_MARK) {
skb->mark &= ~params->mask;
skb->mark |= params->mark & params->mask;