Add functionality to libvirt that allows querying an interface for the availability of switchdev offloading NIC capabilities. The switchdev mode was introduced in kernel 4.8, and the iproute2 devlink command can be used to retrieve the switchdev NIC feature. Command example: devlink dev eswitch show pci/0000:03:00.0 This feature is needed for OpenStack so that it can make a scheduling decision based on whether the NIC is in hardware offload (switchdev) or regular SR-IOV (legacy) mode, and select the appropriate hypervisors with the requested capability; see [1]. [1] - https://specs.openstack.org/openstack/nova-specs/specs/pike/approved/enable-sriov-nic-features.html --- configure.ac | 13 ++ docs/formatnode.html.in | 1 + src/util/virnetdev.c | 187 +++++++++++++++++++++- src/util/virnetdev.h | 1 + tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml | 1 + tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml | 1 + 6 files changed, 203 insertions(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index b12b7fa..c089798 100644 --- a/configure.ac +++ b/configure.ac @@ -627,6 +627,19 @@ if test "$with_linux" = "yes"; then AC_CHECK_HEADERS([linux/btrfs.h]) fi +dnl +dnl check for kernel headers required by devlink +dnl +if test "$with_linux" = "yes"; then + AC_CHECK_HEADERS([linux/devlink.h]) + AC_CHECK_DECLS([DEVLINK_GENL_VERSION, DEVLINK_GENL_NAME, DEVLINK_ATTR_MAX, DEVLINK_CMD_ESWITCH_GET, DEVLINK_ATTR_BUS_NAME, DEVLINK_ATTR_DEV_NAME, DEVLINK_ATTR_ESWITCH_MODE, DEVLINK_ESWITCH_MODE_SWITCHDEV], + [AC_DEFINE([HAVE_DECL_DEVLINK], + [1], + [whether devlink declarations is available])], + [], + [[#include <linux/devlink.h>]]) +fi + dnl Allow perl/python overrides AC_PATH_PROGS([PYTHON], [python2 python]) if test -z "$PYTHON"; then diff --git a/docs/formatnode.html.in b/docs/formatnode.html.in index 4d935b5..29244a8 100644 --- a/docs/formatnode.html.in +++ b/docs/formatnode.html.in @@ -227,6 +227,7 @@ <dt><code>rxhash</code></dt><dd>receive-hashing</dd> 
<dt><code>rdma</code></dt><dd>remote-direct-memory-access</dd> <dt><code>txudptnl</code></dt><dd>tx-udp-tunnel-segmentation</dd> + <dt><code>switchdev</code></dt><dd>kernel-forward-plane-offload</dd> </dl> </dd> <dt><code>capability</code></dt> diff --git a/src/util/virnetdev.c b/src/util/virnetdev.c index 51a6e42..fc7c961 100644 --- a/src/util/virnetdev.c +++ b/src/util/virnetdev.c @@ -59,6 +59,10 @@ # include <net/if_dl.h> #endif +#if HAVE_DECL_DEVLINK +# include <linux/devlink.h> +#endif + #ifndef IFNAMSIZ # define IFNAMSIZ 16 #endif @@ -2481,7 +2485,8 @@ VIR_ENUM_IMPL(virNetDevFeature, "ntuple", "rxhash", "rdma", - "txudptnl") + "txudptnl", + "switchdev") #ifdef __linux__ int @@ -2936,6 +2941,7 @@ int virNetDevGetRxFilter(const char *ifname, return ret; } + #if defined(SIOCETHTOOL) && defined(HAVE_STRUCT_IFREQ) /** @@ -3115,6 +3121,182 @@ virNetDevGetEthtoolFeatures(virBitmapPtr bitmap, } +#if HAVE_DECL_DEVLINK +/** + * virNetDevPutExtraHeader: + * Reserve and prepare room for an extra header. + * Zeroes NLMSG_ALIGN(@size) bytes at the current end of the message and + * grows nlmsg_len to cover them. No bounds check is done here, so the + * caller must make sure the buffer behind @nlh has room for them. + * + * @nlh: pointer to Netlink header + * @size: size of the extra header that we want to put + * + * Returns a pointer to the start of the zeroed extra header + */ +static void * +virNetDevPutExtraHeader(struct nlmsghdr *nlh, + size_t size) +{ + char *ptr = (char *)nlh + nlh->nlmsg_len; + size_t len = NLMSG_ALIGN(size); + nlh->nlmsg_len += len; + memset(ptr, 0, len); + return ptr; +} + + +/** + * virNetDevGetFamilyId: + * Asks the generic netlink controller (GENL_ID_CTRL) for the id of a family + * + * @family_name: the name of the family to query + * + * Returns family id or 0 on failure. 
+ */ +static uint32_t +virNetDevGetFamilyId(const char *family_name) +{ + struct nl_msg *nl_msg = NULL; + struct nlmsghdr *resp = NULL; + struct genlmsghdr *gmsgh = NULL; + struct nlattr *tb[CTRL_ATTR_MAX + 1] = {NULL, }; + unsigned int recvbuflen; + uint32_t family_id = 0; + + if (!(nl_msg = nlmsg_alloc_simple(GENL_ID_CTRL, + NLM_F_REQUEST | NLM_F_ACK))) { + virReportOOMError(); + goto cleanup; + } + + if (!(gmsgh = virNetDevPutExtraHeader(nlmsg_hdr(nl_msg), sizeof(struct genlmsghdr)))) + goto cleanup; + + gmsgh->cmd = CTRL_CMD_GETFAMILY; + gmsgh->version = DEVLINK_GENL_VERSION; + + if (nla_put_string(nl_msg, CTRL_ATTR_FAMILY_NAME, family_name) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("allocated netlink buffer is too small")); + goto cleanup; + } + + if (virNetlinkCommand(nl_msg, &resp, &recvbuflen, 0, 0, NETLINK_GENERIC, 0) < 0) + goto cleanup; + /* Attributes start after the genl header; tb has CTRL_ATTR_MAX + 1 slots */ + if (nlmsg_parse(resp, sizeof(struct genlmsghdr), tb, CTRL_ATTR_MAX, NULL) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("malformed netlink response message")); + goto cleanup; + } + + if (tb[CTRL_ATTR_FAMILY_ID] == NULL) + goto cleanup; + + family_id = nla_get_u16(tb[CTRL_ATTR_FAMILY_ID]); /* the id attribute is a u16 */ + + cleanup: + nlmsg_free(nl_msg); + VIR_FREE(resp); + return family_id; +} + + +/** + * virNetDevSwitchdevFeature: + * This function checks for the availability of the Switchdev feature + * and adds it to the bitmap + * + * @ifname: name of the interface + * @out: bitmap in which the Switchdev feature bit is set if available + * + * Returns 0 on success, -1 on failure. 
+ */ +static int +virNetDevSwitchdevFeature(const char *ifname, + virBitmapPtr *out) +{ + struct nl_msg *nl_msg = NULL; + struct nlmsghdr *resp = NULL; + unsigned int recvbuflen; + struct nlattr *tb[DEVLINK_ATTR_MAX + 1] = {NULL, }; + virPCIDevicePtr pci_device_ptr = NULL; + struct genlmsghdr *gmsgh = NULL; + const char *pci_name; + char *pfname = NULL; + int is_vf = -1; + int ret = -1; + uint32_t family_id; + + if ((family_id = virNetDevGetFamilyId(DEVLINK_GENL_NAME)) == 0) + return 0; /* devlink family unavailable: no switchdev bit to add, not an error */ + + if ((is_vf = virNetDevIsVirtualFunction(ifname)) < 0) + return ret; + + if (is_vf == 1 && virNetDevGetPhysicalFunction(ifname, &pfname) < 0) + goto cleanup; + + if (!(nl_msg = nlmsg_alloc_simple(family_id, + NLM_F_REQUEST | NLM_F_ACK))) { + virReportOOMError(); + goto cleanup; + } + + if (!(gmsgh = virNetDevPutExtraHeader(nlmsg_hdr(nl_msg), sizeof(struct genlmsghdr)))) + goto cleanup; + + gmsgh->cmd = DEVLINK_CMD_ESWITCH_GET; + gmsgh->version = DEVLINK_GENL_VERSION; + + pci_device_ptr = pfname ? virNetDevGetPCIDevice(pfname) : + virNetDevGetPCIDevice(ifname); + if (pci_device_ptr == NULL) { + ret = 0; /* No PCI device, then no feature bit to check/add */ + goto cleanup; + } + pci_name = virPCIDeviceGetName(pci_device_ptr); + if (nla_put(nl_msg, DEVLINK_ATTR_BUS_NAME, strlen("pci")+1, "pci") < 0 || + nla_put(nl_msg, DEVLINK_ATTR_DEV_NAME, strlen(pci_name)+1, pci_name) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("allocated netlink buffer is too small")); + goto cleanup; + } + + if (virNetlinkCommand(nl_msg, &resp, &recvbuflen, 0, 0, NETLINK_GENERIC, 0) < 0) + goto cleanup; + + if (nlmsg_parse(resp, sizeof(struct genlmsghdr), tb, DEVLINK_ATTR_MAX, NULL) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("malformed netlink response message")); + goto cleanup; + } + + if (tb[DEVLINK_ATTR_ESWITCH_MODE] && + nla_get_u16(tb[DEVLINK_ATTR_ESWITCH_MODE]) == DEVLINK_ESWITCH_MODE_SWITCHDEV) { + ignore_value(virBitmapSetBit(*out, VIR_NET_DEV_FEAT_SWITCHDEV)); + } + + ret = 0; + + cleanup: + nlmsg_free(nl_msg); + 
virPCIDeviceFree(pci_device_ptr); + VIR_FREE(resp); + VIR_FREE(pfname); + return ret; +} +#else +static int +virNetDevSwitchdevFeature(const char *ifname ATTRIBUTE_UNUSED, + virBitmapPtr *out ATTRIBUTE_UNUSED) +{ + return 0; +} +#endif + + # if HAVE_DECL_ETHTOOL_GFEATURES /** * virNetDevGFeatureAvailable @@ -3315,6 +3497,9 @@ virNetDevGetFeatures(const char *ifname, if (virNetDevRDMAFeature(ifname, out) < 0) goto cleanup; + if (virNetDevSwitchdevFeature(ifname, out) < 0) + goto cleanup; + ret = 0; cleanup: VIR_FORCE_CLOSE(fd); diff --git a/src/util/virnetdev.h b/src/util/virnetdev.h index 9205c0e..71eaf45 100644 --- a/src/util/virnetdev.h +++ b/src/util/virnetdev.h @@ -112,6 +112,7 @@ typedef enum { VIR_NET_DEV_FEAT_RXHASH, VIR_NET_DEV_FEAT_RDMA, VIR_NET_DEV_FEAT_TXUDPTNL, + VIR_NET_DEV_FEAT_SWITCHDEV, VIR_NET_DEV_FEAT_LAST } virNetDevFeature; diff --git a/tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml b/tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml index d4c96e8..88252e6 100644 --- a/tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml +++ b/tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml @@ -15,6 +15,7 @@ <feature name='rxhash'/> <feature name='rdma'/> <feature name='txudptnl'/> + <feature name='switchdev'/> <capability type='80211'/> </capability> </device> diff --git a/tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml b/tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml index 71bf90e..f77dfcc 100644 --- a/tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml +++ b/tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml @@ -15,6 +15,7 @@ <feature name='rxhash'/> <feature name='rdma'/> <feature name='txudptnl'/> + <feature name='switchdev'/> <capability type='80203'/> </capability> </device> -- 2.1.4 -- libvir-list mailing list libvir-list@xxxxxxxxxx https://www.redhat.com/mailman/listinfo/libvir-list