From: Peilin Ye <peilin.ye@xxxxxxxxxxxxx>

Traffic redirected by bpf_redirect_peer() (used by recent CNIs like Cilium)
is not accounted for in the RX stats of veth devices, confusing user space
metrics collectors such as cAdvisor [1], as reported by Youlun.

Currently veth devices use the @lstats per-CPU counters, which only cover
TX traffic. veth_get_stats64() actually collects RX stats of a veth device
from its peer's TX (@lstats) counters, based on the assumption that a veth
device can _only_ receive packets from its peer, which is no longer true.

Instead, use @tstats to maintain both per-CPU RX and TX traffic counters
for each veth device, and count bpf_redirect_peer() traffic in
skb_do_redirect().

veth_stats_rx() might need a name change (perhaps to "veth_stats_xdp()")
for less confusion, but let's leave it to a separate patch to keep this fix
minimal.

[1] Specifically, the "container_network_receive_{byte,packet}s_total"
    counters are affected.

Reported-by: Youlun Zhang <zhangyoulun@xxxxxxxxxxxxx>
Fixes: 9aa1206e8f48 ("bpf: Add redirect_peer helper")
Cc: Jiang Wang <jiang.wang@xxxxxxxxxxxxx>
Signed-off-by: Peilin Ye <peilin.ye@xxxxxxxxxxxxx>
---
 drivers/net/veth.c | 36 ++++++++++++++----------------------
 net/core/filter.c  |  1 +
 2 files changed, 15 insertions(+), 22 deletions(-)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 9980517ed8b0..df7a7c21a46d 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -373,7 +373,7 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
 	skb_tx_timestamp(skb);
 	if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) {
 		if (!use_napi)
-			dev_lstats_add(dev, length);
+			dev_sw_netstats_tx_add(dev, 1, length);
 		else
 			__veth_xdp_flush(rq);
 	} else {
@@ -387,14 +387,6 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
 	return ret;
 }
 
-static u64 veth_stats_tx(struct net_device *dev, u64 *packets, u64 *bytes)
-{
-	struct veth_priv *priv = netdev_priv(dev);
-
-	dev_lstats_read(dev, packets, bytes);
-	return atomic64_read(&priv->dropped);
-}
-
 static void veth_stats_rx(struct veth_stats *result, struct net_device *dev)
 {
 	struct veth_priv *priv = netdev_priv(dev);
@@ -432,24 +424,24 @@ static void veth_get_stats64(struct net_device *dev,
 	struct veth_priv *priv = netdev_priv(dev);
 	struct net_device *peer;
 	struct veth_stats rx;
-	u64 packets, bytes;
 
-	tot->tx_dropped = veth_stats_tx(dev, &packets, &bytes);
-	tot->tx_bytes = bytes;
-	tot->tx_packets = packets;
+	tot->tx_dropped = atomic64_read(&priv->dropped);
+	dev_fetch_sw_netstats(tot, dev->tstats);
 
 	veth_stats_rx(&rx, dev);
 	tot->tx_dropped += rx.xdp_tx_err;
 	tot->rx_dropped = rx.rx_drops + rx.peer_tq_xdp_xmit_err;
-	tot->rx_bytes = rx.xdp_bytes;
-	tot->rx_packets = rx.xdp_packets;
+	tot->rx_bytes += rx.xdp_bytes;
+	tot->rx_packets += rx.xdp_packets;
 
 	rcu_read_lock();
 	peer = rcu_dereference(priv->peer);
 	if (peer) {
-		veth_stats_tx(peer, &packets, &bytes);
-		tot->rx_bytes += bytes;
-		tot->rx_packets += packets;
+		struct rtnl_link_stats64 tot_peer = {};
+
+		dev_fetch_sw_netstats(&tot_peer, peer->tstats);
+		tot->rx_bytes += tot_peer.tx_bytes;
+		tot->rx_packets += tot_peer.tx_packets;
 
 		veth_stats_rx(&rx, peer);
 		tot->tx_dropped += rx.peer_tq_xdp_xmit_err;
@@ -1508,13 +1500,13 @@ static int veth_dev_init(struct net_device *dev)
 {
 	int err;
 
-	dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats);
-	if (!dev->lstats)
+	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+	if (!dev->tstats)
 		return -ENOMEM;
 
 	err = veth_alloc_queues(dev);
 	if (err) {
-		free_percpu(dev->lstats);
+		free_percpu(dev->tstats);
 		return err;
 	}
 
@@ -1524,7 +1516,7 @@ static int veth_dev_init(struct net_device *dev)
 static void veth_dev_free(struct net_device *dev)
 {
 	veth_free_queues(dev);
-	free_percpu(dev->lstats);
+	free_percpu(dev->tstats);
 }
 
 #ifdef CONFIG_NET_POLL_CONTROLLER
diff --git a/net/core/filter.c b/net/core/filter.c
index 21d75108c2e9..7aca28b7d0fd 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2492,6 +2492,7 @@ int skb_do_redirect(struct sk_buff *skb)
 				     net_eq(net, dev_net(dev))))
 			goto out_drop;
 		skb->dev = dev;
+		dev_sw_netstats_rx_add(dev, skb->len);
 		return -EAGAIN;
 	}
 	return flags & BPF_F_NEIGH ?
-- 
2.20.1