Handle XDP_REDIRECT action in netvsc driver. Also, transparently pass ndo_xdp_xmit to VF when available. Signed-off-by: Haiyang Zhang <haiyangz@xxxxxxxxxxxxx> --- drivers/net/hyperv/hyperv_net.h | 69 ++++++++++++++- drivers/net/hyperv/netvsc.c | 8 +- drivers/net/hyperv/netvsc_bpf.c | 95 +++++++++++++++++++- drivers/net/hyperv/netvsc_drv.c | 150 +++++++++++++------------------- 4 files changed, 228 insertions(+), 94 deletions(-) diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h index cf69da0e296c..25b38a374e3c 100644 --- a/drivers/net/hyperv/hyperv_net.h +++ b/drivers/net/hyperv/hyperv_net.h @@ -15,6 +15,7 @@ #include <linux/list.h> #include <linux/hyperv.h> #include <linux/rndis.h> +#include <linux/jhash.h> /* RSS related */ #define OID_GEN_RECEIVE_SCALE_CAPABILITIES 0x00010203 /* query only */ @@ -237,6 +238,7 @@ int netvsc_recv_callback(struct net_device *net, void netvsc_channel_cb(void *context); int netvsc_poll(struct napi_struct *napi, int budget); +void netvsc_xdp_xmit(struct sk_buff *skb, struct net_device *ndev); u32 netvsc_run_xdp(struct net_device *ndev, struct netvsc_channel *nvchan, struct xdp_buff *xdp); unsigned int netvsc_xdp_fraglen(unsigned int len); @@ -246,6 +248,8 @@ int netvsc_xdp_set(struct net_device *dev, struct bpf_prog *prog, struct netvsc_device *nvdev); int netvsc_vf_setxdp(struct net_device *vf_netdev, struct bpf_prog *prog); int netvsc_bpf(struct net_device *dev, struct netdev_bpf *bpf); +int netvsc_ndoxdp_xmit(struct net_device *ndev, int n, + struct xdp_frame **frames, u32 flags); int rndis_set_subchannel(struct net_device *ndev, struct netvsc_device *nvdev, @@ -942,12 +946,21 @@ struct nvsc_rsc { #define NVSC_RSC_CSUM_INFO BIT(1) /* valid/present bit for 'csum_info' */ #define NVSC_RSC_HASH_INFO BIT(2) /* valid/present bit for 'hash_info' */ -struct netvsc_stats { +struct netvsc_stats_tx { + u64 packets; + u64 bytes; + u64 xdp_xmit; + struct u64_stats_sync syncp; +}; + +struct netvsc_stats_rx { u64 
packets; u64 bytes; u64 broadcast; u64 multicast; u64 xdp_drop; + u64 xdp_redirect; + u64 xdp_tx; struct u64_stats_sync syncp; }; @@ -1046,6 +1059,55 @@ struct net_device_context { struct netvsc_device_info *saved_netvsc_dev_info; }; +/* Azure hosts don't support non-TCP port numbers in hashing for fragmented + * packets. We can use ethtool to change UDP hash level when necessary. + */ +static inline u32 netvsc_get_hash(struct sk_buff *skb, + const struct net_device_context *ndc) +{ + struct flow_keys flow; + u32 hash, pkt_proto = 0; + static u32 hashrnd __read_mostly; + + net_get_random_once(&hashrnd, sizeof(hashrnd)); + + if (!skb_flow_dissect_flow_keys(skb, &flow, 0)) + return 0; + + switch (flow.basic.ip_proto) { + case IPPROTO_TCP: + if (flow.basic.n_proto == htons(ETH_P_IP)) + pkt_proto = HV_TCP4_L4HASH; + else if (flow.basic.n_proto == htons(ETH_P_IPV6)) + pkt_proto = HV_TCP6_L4HASH; + + break; + + case IPPROTO_UDP: + if (flow.basic.n_proto == htons(ETH_P_IP)) + pkt_proto = HV_UDP4_L4HASH; + else if (flow.basic.n_proto == htons(ETH_P_IPV6)) + pkt_proto = HV_UDP6_L4HASH; + + break; + } + + if (pkt_proto & ndc->l4_hash) { + return skb_get_hash(skb); + } else { + if (flow.basic.n_proto == htons(ETH_P_IP)) + hash = jhash2((u32 *)&flow.addrs.v4addrs, 2, hashrnd); + else if (flow.basic.n_proto == htons(ETH_P_IPV6)) + hash = jhash2((u32 *)&flow.addrs.v6addrs, 8, hashrnd); + else + return 0; + + __skb_set_sw_hash(skb, hash, false); + } + + return hash; +} + /* Per channel data */ struct netvsc_channel { struct vmbus_channel *channel; @@ -1060,9 +1122,10 @@ struct netvsc_channel { struct bpf_prog __rcu *bpf_prog; struct xdp_rxq_info xdp_rxq; + bool xdp_flush; - struct netvsc_stats tx_stats; - struct netvsc_stats rx_stats; + struct netvsc_stats_tx tx_stats; + struct netvsc_stats_rx rx_stats; }; /* Per netvsc device */ diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 9442f751ad3a..240f88d9c520 100644 --- a/drivers/net/hyperv/netvsc.c +++ 
b/drivers/net/hyperv/netvsc.c @@ -20,6 +20,7 @@ #include <linux/vmalloc.h> #include <linux/rtnetlink.h> #include <linux/prefetch.h> +#include <linux/filter.h> #include <asm/sync_bitops.h> #include <asm/mshyperv.h> @@ -805,7 +806,7 @@ static void netvsc_send_tx_complete(struct net_device *ndev, struct hv_netvsc_packet *packet = (struct hv_netvsc_packet *)skb->cb; u32 send_index = packet->send_buf_index; - struct netvsc_stats *tx_stats; + struct netvsc_stats_tx *tx_stats; if (send_index != NETVSC_INVALID_INDEX) netvsc_free_send_slot(net_device, send_index); @@ -1670,12 +1671,17 @@ int netvsc_poll(struct napi_struct *napi, int budget) if (!nvchan->desc) nvchan->desc = hv_pkt_iter_first(channel); + nvchan->xdp_flush = false; + while (nvchan->desc && work_done < budget) { work_done += netvsc_process_raw_pkt(device, nvchan, net_device, ndev, nvchan->desc, budget); nvchan->desc = hv_pkt_iter_next(channel, nvchan->desc); } + if (nvchan->xdp_flush) + xdp_do_flush(); + /* Send any pending receive completions */ ret = send_recv_completions(ndev, net_device, nvchan); diff --git a/drivers/net/hyperv/netvsc_bpf.c b/drivers/net/hyperv/netvsc_bpf.c index 7856905414eb..d0c8e54d4b1f 100644 --- a/drivers/net/hyperv/netvsc_bpf.c +++ b/drivers/net/hyperv/netvsc_bpf.c @@ -10,6 +10,7 @@ #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> +#include <linux/netpoll.h> #include <linux/bpf.h> #include <linux/bpf_trace.h> #include <linux/kernel.h> @@ -23,11 +24,13 @@ u32 netvsc_run_xdp(struct net_device *ndev, struct netvsc_channel *nvchan, struct xdp_buff *xdp) { + struct netvsc_stats_rx *rx_stats = &nvchan->rx_stats; void *data = nvchan->rsc.data[0]; u32 len = nvchan->rsc.len[0]; struct page *page = NULL; struct bpf_prog *prog; u32 act = XDP_PASS; + bool drop = true; xdp->data_hard_start = NULL; @@ -60,9 +63,34 @@ u32 netvsc_run_xdp(struct net_device *ndev, struct netvsc_channel *nvchan, switch (act) { case XDP_PASS: case XDP_TX: + drop = false; + break; + 
case XDP_DROP: break; + case XDP_REDIRECT: + if (!xdp_do_redirect(ndev, xdp, prog)) { + nvchan->xdp_flush = true; + drop = false; + + u64_stats_update_begin(&rx_stats->syncp); + + rx_stats->xdp_redirect++; + rx_stats->packets++; + rx_stats->bytes += nvchan->rsc.pktlen; + + u64_stats_update_end(&rx_stats->syncp); + + break; + } else { + u64_stats_update_begin(&rx_stats->syncp); + rx_stats->xdp_drop++; + u64_stats_update_end(&rx_stats->syncp); + } + + fallthrough; + case XDP_ABORTED: trace_xdp_exception(ndev, prog, act); break; @@ -74,7 +102,7 @@ u32 netvsc_run_xdp(struct net_device *ndev, struct netvsc_channel *nvchan, out: rcu_read_unlock(); - if (page && act != XDP_PASS && act != XDP_TX) { + if (page && drop) { __free_page(page); xdp->data_hard_start = NULL; } @@ -199,3 +227,68 @@ int netvsc_bpf(struct net_device *dev, struct netdev_bpf *bpf) return -EINVAL; } } + +static int netvsc_ndoxdp_xmit_fm(struct net_device *ndev, + struct xdp_frame *frame, u16 q_idx) +{ + struct sk_buff *skb; + + skb = xdp_build_skb_from_frame(frame, ndev); + if (unlikely(!skb)) + return -ENOMEM; + + netvsc_get_hash(skb, netdev_priv(ndev)); + + skb_record_rx_queue(skb, q_idx); + + netvsc_xdp_xmit(skb, ndev); + + return 0; +} + +int netvsc_ndoxdp_xmit(struct net_device *ndev, int n, + struct xdp_frame **frames, u32 flags) +{ + struct net_device_context *ndev_ctx = netdev_priv(ndev); + const struct net_device_ops *vf_ops; + struct netvsc_stats_tx *tx_stats; + struct netvsc_device *nvsc_dev; + struct net_device *vf_netdev; + int i, count = 0; + u16 q_idx; + + /* Don't transmit if netvsc_device is gone */ + nvsc_dev = rcu_dereference_bh(ndev_ctx->nvdev); + if (unlikely(!nvsc_dev || nvsc_dev->destroy)) + return 0; + + /* If VF is present and up then redirect packets to it. + * Skip the VF if it is marked down or has no carrier. + * If netpoll is in use, then VF cannot be used either.
+ */ + vf_netdev = rcu_dereference_bh(ndev_ctx->vf_netdev); + if (vf_netdev && netif_running(vf_netdev) && + netif_carrier_ok(vf_netdev) && !netpoll_tx_running(ndev) && + vf_netdev->netdev_ops->ndo_xdp_xmit && + ndev_ctx->data_path_is_vf) { + vf_ops = vf_netdev->netdev_ops; + return vf_ops->ndo_xdp_xmit(vf_netdev, n, frames, flags); + } + + q_idx = smp_processor_id() % ndev->real_num_tx_queues; + + for (i = 0; i < n; i++) { + if (netvsc_ndoxdp_xmit_fm(ndev, frames[i], q_idx)) + break; + + count++; + } + + tx_stats = &nvsc_dev->chan_table[q_idx].tx_stats; + + u64_stats_update_begin(&tx_stats->syncp); + tx_stats->xdp_xmit += count; + u64_stats_update_end(&tx_stats->syncp); + + return count; +} diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index fde1c492ca02..27f6bbca6619 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -242,56 +242,6 @@ static inline void *init_ppi_data(struct rndis_message *msg, return ppi + 1; } -/* Azure hosts don't support non-TCP port numbers in hashing for fragmented - * packets. We can use ethtool to change UDP hash level when necessary. 
- */ -static inline u32 netvsc_get_hash( - struct sk_buff *skb, - const struct net_device_context *ndc) -{ - struct flow_keys flow; - u32 hash, pkt_proto = 0; - static u32 hashrnd __read_mostly; - - net_get_random_once(&hashrnd, sizeof(hashrnd)); - - if (!skb_flow_dissect_flow_keys(skb, &flow, 0)) - return 0; - - switch (flow.basic.ip_proto) { - case IPPROTO_TCP: - if (flow.basic.n_proto == htons(ETH_P_IP)) - pkt_proto = HV_TCP4_L4HASH; - else if (flow.basic.n_proto == htons(ETH_P_IPV6)) - pkt_proto = HV_TCP6_L4HASH; - - break; - - case IPPROTO_UDP: - if (flow.basic.n_proto == htons(ETH_P_IP)) - pkt_proto = HV_UDP4_L4HASH; - else if (flow.basic.n_proto == htons(ETH_P_IPV6)) - pkt_proto = HV_UDP6_L4HASH; - - break; - } - - if (pkt_proto & ndc->l4_hash) { - return skb_get_hash(skb); - } else { - if (flow.basic.n_proto == htons(ETH_P_IP)) - hash = jhash2((u32 *)&flow.addrs.v4addrs, 2, hashrnd); - else if (flow.basic.n_proto == htons(ETH_P_IPV6)) - hash = jhash2((u32 *)&flow.addrs.v6addrs, 8, hashrnd); - else - return 0; - - __skb_set_sw_hash(skb, hash, false); - } - - return hash; -} - static inline int netvsc_get_tx_queue(struct net_device *ndev, struct sk_buff *skb, int old_idx) { @@ -804,7 +754,7 @@ void netvsc_linkstatus_callback(struct net_device *net, } /* This function should only be called after skb_record_rx_queue() */ -static void netvsc_xdp_xmit(struct sk_buff *skb, struct net_device *ndev) +void netvsc_xdp_xmit(struct sk_buff *skb, struct net_device *ndev) { int rc; @@ -925,7 +875,7 @@ int netvsc_recv_callback(struct net_device *net, struct vmbus_channel *channel = nvchan->channel; u16 q_idx = channel->offermsg.offer.sub_channel_index; struct sk_buff *skb; - struct netvsc_stats *rx_stats = &nvchan->rx_stats; + struct netvsc_stats_rx *rx_stats = &nvchan->rx_stats; struct xdp_buff xdp; u32 act; @@ -934,6 +884,9 @@ int netvsc_recv_callback(struct net_device *net, act = netvsc_run_xdp(net, nvchan, &xdp); + if (act == XDP_REDIRECT) + return NVSP_STAT_SUCCESS; + 
if (act != XDP_PASS && act != XDP_TX) { u64_stats_update_begin(&rx_stats->syncp); rx_stats->xdp_drop++; @@ -958,6 +911,9 @@ int netvsc_recv_callback(struct net_device *net, * statistics will not work correctly. */ u64_stats_update_begin(&rx_stats->syncp); + if (act == XDP_TX) + rx_stats->xdp_tx++; + rx_stats->packets++; rx_stats->bytes += nvchan->rsc.pktlen; @@ -1353,28 +1309,29 @@ static void netvsc_get_pcpu_stats(struct net_device *net, /* fetch percpu stats of netvsc */ for (i = 0; i < nvdev->num_chn; i++) { const struct netvsc_channel *nvchan = &nvdev->chan_table[i]; - const struct netvsc_stats *stats; + const struct netvsc_stats_tx *tx_stats; + const struct netvsc_stats_rx *rx_stats; struct netvsc_ethtool_pcpu_stats *this_tot = &pcpu_tot[nvchan->channel->target_cpu]; u64 packets, bytes; unsigned int start; - stats = &nvchan->tx_stats; + tx_stats = &nvchan->tx_stats; do { - start = u64_stats_fetch_begin_irq(&stats->syncp); - packets = stats->packets; - bytes = stats->bytes; - } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); + start = u64_stats_fetch_begin_irq(&tx_stats->syncp); + packets = tx_stats->packets; + bytes = tx_stats->bytes; + } while (u64_stats_fetch_retry_irq(&tx_stats->syncp, start)); this_tot->tx_bytes += bytes; this_tot->tx_packets += packets; - stats = &nvchan->rx_stats; + rx_stats = &nvchan->rx_stats; do { - start = u64_stats_fetch_begin_irq(&stats->syncp); - packets = stats->packets; - bytes = stats->bytes; - } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); + start = u64_stats_fetch_begin_irq(&rx_stats->syncp); + packets = rx_stats->packets; + bytes = rx_stats->bytes; + } while (u64_stats_fetch_retry_irq(&rx_stats->syncp, start)); this_tot->rx_bytes += bytes; this_tot->rx_packets += packets; @@ -1406,27 +1363,28 @@ static void netvsc_get_stats64(struct net_device *net, for (i = 0; i < nvdev->num_chn; i++) { const struct netvsc_channel *nvchan = &nvdev->chan_table[i]; - const struct netvsc_stats *stats; + const struct 
netvsc_stats_tx *tx_stats; + const struct netvsc_stats_rx *rx_stats; u64 packets, bytes, multicast; unsigned int start; - stats = &nvchan->tx_stats; + tx_stats = &nvchan->tx_stats; do { - start = u64_stats_fetch_begin_irq(&stats->syncp); - packets = stats->packets; - bytes = stats->bytes; - } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); + start = u64_stats_fetch_begin_irq(&tx_stats->syncp); + packets = tx_stats->packets; + bytes = tx_stats->bytes; + } while (u64_stats_fetch_retry_irq(&tx_stats->syncp, start)); t->tx_bytes += bytes; t->tx_packets += packets; - stats = &nvchan->rx_stats; + rx_stats = &nvchan->rx_stats; do { - start = u64_stats_fetch_begin_irq(&stats->syncp); - packets = stats->packets; - bytes = stats->bytes; - multicast = stats->multicast + stats->broadcast; - } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); + start = u64_stats_fetch_begin_irq(&rx_stats->syncp); + packets = rx_stats->packets; + bytes = rx_stats->bytes; + multicast = rx_stats->multicast + rx_stats->broadcast; + } while (u64_stats_fetch_retry_irq(&rx_stats->syncp, start)); t->rx_bytes += bytes; t->rx_packets += packets; @@ -1515,8 +1473,8 @@ static const struct { /* statistics per queue (rx/tx packets/bytes) */ #define NETVSC_PCPU_STATS_LEN (num_present_cpus() * ARRAY_SIZE(pcpu_stats)) -/* 5 statistics per queue (rx/tx packets/bytes, rx xdp_drop) */ -#define NETVSC_QUEUE_STATS_LEN(dev) ((dev)->num_chn * 5) +/* 8 statistics per queue (rx/tx packets/bytes, XDP actions) */ +#define NETVSC_QUEUE_STATS_LEN(dev) ((dev)->num_chn * 8) static int netvsc_get_sset_count(struct net_device *dev, int string_set) { @@ -1543,12 +1501,16 @@ static void netvsc_get_ethtool_stats(struct net_device *dev, struct net_device_context *ndc = netdev_priv(dev); struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev); const void *nds = &ndc->eth_stats; - const struct netvsc_stats *qstats; + const struct netvsc_stats_tx *tx_stats; + const struct netvsc_stats_rx *rx_stats; struct 
netvsc_vf_pcpu_stats sum; struct netvsc_ethtool_pcpu_stats *pcpu_sum; unsigned int start; u64 packets, bytes; u64 xdp_drop; + u64 xdp_redirect; + u64 xdp_tx; + u64 xdp_xmit; int i, j, cpu; if (!nvdev) @@ -1562,26 +1524,32 @@ static void netvsc_get_ethtool_stats(struct net_device *dev, data[i++] = *(u64 *)((void *)&sum + vf_stats[j].offset); for (j = 0; j < nvdev->num_chn; j++) { - qstats = &nvdev->chan_table[j].tx_stats; + tx_stats = &nvdev->chan_table[j].tx_stats; do { - start = u64_stats_fetch_begin_irq(&qstats->syncp); - packets = qstats->packets; - bytes = qstats->bytes; - } while (u64_stats_fetch_retry_irq(&qstats->syncp, start)); + start = u64_stats_fetch_begin_irq(&tx_stats->syncp); + packets = tx_stats->packets; + bytes = tx_stats->bytes; + xdp_xmit = tx_stats->xdp_xmit; + } while (u64_stats_fetch_retry_irq(&tx_stats->syncp, start)); data[i++] = packets; data[i++] = bytes; + data[i++] = xdp_xmit; - qstats = &nvdev->chan_table[j].rx_stats; + rx_stats = &nvdev->chan_table[j].rx_stats; do { - start = u64_stats_fetch_begin_irq(&qstats->syncp); - packets = qstats->packets; - bytes = qstats->bytes; - xdp_drop = qstats->xdp_drop; - } while (u64_stats_fetch_retry_irq(&qstats->syncp, start)); + start = u64_stats_fetch_begin_irq(&rx_stats->syncp); + packets = rx_stats->packets; + bytes = rx_stats->bytes; + xdp_drop = rx_stats->xdp_drop; + xdp_redirect = rx_stats->xdp_redirect; + xdp_tx = rx_stats->xdp_tx; + } while (u64_stats_fetch_retry_irq(&rx_stats->syncp, start)); data[i++] = packets; data[i++] = bytes; data[i++] = xdp_drop; + data[i++] = xdp_redirect; + data[i++] = xdp_tx; } pcpu_sum = kvmalloc_array(num_possible_cpus(), @@ -1622,9 +1590,12 @@ static void netvsc_get_strings(struct net_device *dev, u32 stringset, u8 *data) for (i = 0; i < nvdev->num_chn; i++) { ethtool_sprintf(&p, "tx_queue_%u_packets", i); ethtool_sprintf(&p, "tx_queue_%u_bytes", i); + ethtool_sprintf(&p, "tx_queue_%u_xdp_xmit", i); ethtool_sprintf(&p, "rx_queue_%u_packets", i); 
ethtool_sprintf(&p, "rx_queue_%u_bytes", i); ethtool_sprintf(&p, "rx_queue_%u_xdp_drop", i); + ethtool_sprintf(&p, "rx_queue_%u_xdp_redirect", i); + ethtool_sprintf(&p, "rx_queue_%u_xdp_tx", i); } for_each_present_cpu(cpu) { @@ -2057,6 +2028,7 @@ static const struct net_device_ops device_ops = { .ndo_select_queue = netvsc_select_queue, .ndo_get_stats64 = netvsc_get_stats64, .ndo_bpf = netvsc_bpf, + .ndo_xdp_xmit = netvsc_ndoxdp_xmit, }; /* -- 2.25.1