From: Leon Romanovsky <leonro@xxxxxxxxxxxx> Generalize the naming scheme for RDMA devices, so users will always see names based on topology/GUID information. Such a naming scheme has the big advantage that the names are fully automatic, fully predictable and they stay fixed even if hardware is added or removed (i.e. no reenumeration takes place) and that broken hardware can be replaced seamlessly. The name is a combination of the link type (Infiniband, RoCE, iWARP or OPA) and the chosen naming policy, like NAME_KERNEL, NAME_PCI, NAME_GUID or NAME_FALLBACK. Those naming policies are controlled by a udev rule and can be overridden by users. * NAME_KERNEL - don't change names and rely on kernel assignment. This will keep RDMA names as before. Example: "mlx5_0". * NAME_PCI - read PCI location and topology as a source for stable names, which won't change on any software event (reset, PCI probe, etc.). Example: "ibp0s12f4". * NAME_GUID - read system image GUID information in a similar manner to the net MAC naming policy. Example: "rocex525400c0fe123455". * NAME_FALLBACK - automatic fallback: NAME_PCI->NAME_GUID->NAME_KERNEL No doubt the new names are harder to read than the "mlx5_0" everybody is used to, but being consistent in scripts is much more important. As part of this change we add a special function to generate and install udev binaries in the proper place. Those files are expected to be one level above the already declared general rules.d location; by default it will be either /usr/lib/udev or /lib/udev for old distributions. This location does not need to be configured by users because they already provide -DCMAKE_INSTALL_UDEV_RULESDIR if they want. 
Signed-off-by: Leon Romanovsky <leonro@xxxxxxxxxxxx>
---
 kernel-boot/CMakeLists.txt               |  24 ++
 kernel-boot/rdma-persistent-naming.rules |  27 +++
 kernel-boot/rdma_rename.c                | 352 +++++++++++++++++++++++
 3 files changed, 403 insertions(+)
 create mode 100644 kernel-boot/rdma-persistent-naming.rules
 create mode 100644 kernel-boot/rdma_rename.c

diff --git a/kernel-boot/CMakeLists.txt b/kernel-boot/CMakeLists.txt
index 936c953e..e40a3169 100644
--- a/kernel-boot/CMakeLists.txt
+++ b/kernel-boot/CMakeLists.txt
@@ -16,6 +16,10 @@ install(FILES
   modules/roce.conf
   DESTINATION "${CMAKE_INSTALL_SYSCONFDIR}/rdma/modules")
 
+install(FILES "rdma-persistent-naming.rules"
+  RENAME "60-rdma-persistent-naming.rules"
+  DESTINATION "${CMAKE_INSTALL_UDEV_RULESDIR}")
+
 install(FILES "rdma-description.rules"
   RENAME "75-rdma-description.rules"
   DESTINATION "${CMAKE_INSTALL_UDEV_RULESDIR}")
@@ -37,3 +41,23 @@ install(FILES "rdma-umad.rules"
 install(FILES "persistent-ipoib.rules"
   RENAME "70-persistent-ipoib.rules"
   DESTINATION "${CMAKE_INSTALL_SYSCONFDIR}/udev/rules.d")
+
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
+
+# Create an installed executable (under /usr/lib/udev)
+function(rdma_udev_executable EXEC)
+  add_executable(${EXEC} ${ARGN})
+  target_link_libraries(${EXEC} LINK_PRIVATE ${COMMON_LIBS})
+  set_target_properties(${EXEC} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_BIN}")
+  install(TARGETS ${EXEC} DESTINATION "${CMAKE_INSTALL_UDEV_RULESDIR}/../")
+endfunction()
+
+if (NOT NL_KIND EQUAL 0)
+  rdma_udev_executable(rdma_rename
+    rdma_rename.c
+  )
+
+  target_link_libraries(rdma_rename LINK_PRIVATE
+    ${NL_LIBRARIES}
+  )
+endif()
diff --git a/kernel-boot/rdma-persistent-naming.rules b/kernel-boot/rdma-persistent-naming.rules
new file mode 100644
index 00000000..b1a92f1c
--- /dev/null
+++ b/kernel-boot/rdma-persistent-naming.rules
@@ -0,0 +1,27 @@
+# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB)
+# Copyright (c) 2019, Mellanox Technologies. All rights reserved. See COPYING file
+#
+# Rename modes:
+# NAME_FALLBACK - Try to name devices in the following order:
+#                 by-pci -> by-guid -> kernel
+# NAME_KERNEL - leave name as kernel provided
+# NAME_PCI - based on PCI/slot/function location
+# NAME_GUID - based on system image GUID
+#
+# The stable names are combination of device type technology and rename mode.
+# Infiniband - ib*
+# RoCE - roce*
+# iWARP - iw*
+# OPA - opa*
+#
+# Example:
+# * NAME_PCI
+#   pci = 0000:00:0c.4
+#   Device type = IB
+#   mlx5_0 -> ibp0s12f4
+# * NAME_GUID
+#   GUID = 5254:00c0:fe12:3455
+#   Device type = RoCE
+#   mlx5_0 -> rocex525400c0fe123455
+#
+ACTION=="add", SUBSYSTEM=="infiniband", PROGRAM="rdma_rename %k NAME_FALLBACK"
diff --git a/kernel-boot/rdma_rename.c b/kernel-boot/rdma_rename.c
new file mode 100644
index 00000000..15be2d3a
--- /dev/null
+++ b/kernel-boot/rdma_rename.c
@@ -0,0 +1,352 @@
+// SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB)
+/* Copyright (c) 2019, Mellanox Technologies. All rights reserved. See COPYING file */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <errno.h>
+#include <unistd.h>
+#include <rdma/rdma_netlink.h>
+#include <netlink/netlink.h>
+#include <netlink/msg.h>
+#include <netlink/attr.h>
+
+/*
+ * Rename modes:
+ * NAME_FALLBACK - Try to name devices in the following order:
+ *                 by-pci -> by-guid -> kernel
+ * NAME_KERNEL - leave name as kernel provided
+ * NAME_PCI - based on PCI/slot/function location
+ * NAME_GUID - based on system image GUID
+ *
+ * The stable names are combination of device type technology and rename mode.
+ * Infiniband - ib*
+ * RoCE - roce*
+ * iWARP - iw*
+ * OPA - opa*
+ *
+ * Example:
+ * NAME_PCI
+ *  pci = 0000:00:0c.4
+ *  Device type = IB
+ *  mlx5_0 -> ibp0s12f4
+ * NAME_GUID
+ *  GUID = 5254:00c0:fe12:3455
+ *  Device type = RoCE
+ *  mlx5_0 -> rocex525400c0fe123455
+ */
+
+/* Attributes read from each device entry of the RDMA_NLDEV_CMD_GET dump */
+static struct nla_policy policy[RDMA_NLDEV_ATTR_MAX] = {
+	[RDMA_NLDEV_ATTR_DEV_INDEX] = { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
+	[RDMA_NLDEV_ATTR_SYS_IMAGE_GUID] = { .type = NLA_U64 },
+	[RDMA_NLDEV_ATTR_LINK_TYPE] = { .type = NLA_NUL_STRING },
+};
+
+/*
+ * State shared by the netlink query, the naming policies and the rename.
+ * curr is borrowed from argv; prefix and name are heap strings owned by this
+ * struct and released in main().
+ */
+struct data {
+	const char *curr;
+	char *prefix;
+	uint64_t sys_image_guid;
+	char *name;
+	int idx;
+};
+
+/* Return the text after the last '/' in path, or path itself if none */
+static const char *last_component(const char *path)
+{
+	const char *slash = strrchr(path, '/');
+
+	return slash ? slash + 1 : path;
+}
+
+/*
+ * Read the symlink /sys/class/infiniband/<dev>/<leaf> into buf.
+ * readlink() does not NUL-terminate its output, so that is done here.
+ * Returns 0 on success, -EINVAL if the path is too long or unreadable.
+ */
+static int read_sysfs_link(const char *dev, const char *leaf, char *buf,
+			   size_t size)
+{
+	char path[PATH_MAX];
+	ssize_t len;
+	int ret;
+
+	ret = snprintf(path, sizeof(path), "/sys/class/infiniband/%s/%s", dev,
+		       leaf);
+	if (ret < 0 || (size_t)ret >= sizeof(path))
+		return -EINVAL;
+
+	len = readlink(path, buf, size - 1);
+	if (len < 0)
+		return -EINVAL;
+	buf[len] = '\0';
+
+	return 0;
+}
+
+/*
+ * NAME_PCI: build "<prefix>p<bus>s<slot>f<function>" from the PCI address.
+ * Returns 0 with d->name allocated, -EINVAL for devices that are not on PCI
+ * or whose address cannot be parsed, -ENOMEM if the allocation fails.
+ */
+static int by_pci(struct data *d)
+{
+	unsigned int bus, slot, func;
+	char link[PATH_MAX];
+	int ret;
+
+	ret = read_sysfs_link(d->curr, "device/subsystem", link, sizeof(link));
+	if (ret)
+		return ret;
+
+	if (strcmp(last_component(link), "pci"))
+		/* Bail out virtual devices */
+		return -EINVAL;
+
+	/* Real devices */
+	ret = read_sysfs_link(d->curr, "device", link, sizeof(link));
+	if (ret)
+		return ret;
+
+	/*
+	 * pci = 0000:00:0c.0 (domain:bus:slot.function); the domain is skipped
+	 */
+	if (sscanf(last_component(link), "%*x:%x:%x.%x", &bus, &slot,
+		   &func) != 3)
+		return -EINVAL;
+
+	ret = asprintf(&d->name, "%sp%us%uf%u", d->prefix, bus, slot, func);
+	if (ret == -1) {
+		d->name = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
+ * NAME_GUID: build "<prefix>x<16 hex digits of the system image GUID>".
+ * Returns 0 with d->name allocated, -EINVAL if the device has no GUID,
+ * -ENOMEM if the allocation fails.
+ */
+static int by_guid(struct data *d)
+{
+	int ret;
+
+	if (!d->sys_image_guid)
+		/* virtual devices start without GUID */
+		return -EINVAL;
+
+	/* Print the value itself, independent of host byte order */
+	ret = asprintf(&d->name, "%sx%016" PRIx64, d->prefix,
+		       d->sys_image_guid);
+	if (ret == -1) {
+		d->name = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
+ * Ask the kernel (RDMA_NLDEV_CMD_SET) to rename device d->idx to d->name.
+ * Returns 0 once the request is sent, a negative error code otherwise; the
+ * kernel's reply is not read.
+ */
+static int device_rename(struct nl_sock *nl, struct data *d)
+{
+	struct nlmsghdr *hdr;
+	struct nl_msg *msg;
+	int ret;
+
+	msg = nlmsg_alloc();
+	if (!msg)
+		return -ENOMEM;
+
+	hdr = nlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ,
+			RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_SET),
+			0, 0);
+	if (!hdr) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, d->idx);
+	if (ret)
+		goto out;
+	ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, d->name);
+	if (ret)
+		goto out;
+	/* Fall through: the message is freed on success and failure alike */
+	ret = nl_send_auto(nl, msg);
+out:
+	nlmsg_free(msg);
+	return (ret < 0) ? ret : 0;
+}
+
+/*
+ * Per-message callback for the device dump: record index, GUID and link type
+ * prefix of the entry whose name equals d->curr. Entries for other devices
+ * are skipped; a malformed entry stops processing.
+ */
+static int get_nldata_cb(struct nl_msg *msg, void *data)
+{
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX] = {};
+	struct nlmsghdr *hdr = nlmsg_hdr(msg);
+	struct data *d = data;
+	char *prefix;
+	int ret;
+
+	ret = nlmsg_parse(hdr, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, policy);
+	if (ret < 0)
+		return NL_STOP;
+
+	if (!tb[RDMA_NLDEV_ATTR_DEV_NAME] || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] ||
+	    !tb[RDMA_NLDEV_ATTR_SYS_IMAGE_GUID] ||
+	    !tb[RDMA_NLDEV_ATTR_LINK_TYPE])
+		return NL_STOP;
+
+	ret = strcmp(d->curr, nla_get_string(tb[RDMA_NLDEV_ATTR_DEV_NAME]));
+	if (ret)
+		return NL_SKIP;
+
+	/*
+	 * The attribute payload belongs to msg, which is not ours to keep
+	 * after this callback returns, so store a private copy of the prefix.
+	 */
+	prefix = strdup(nla_get_string(tb[RDMA_NLDEV_ATTR_LINK_TYPE]));
+	if (!prefix)
+		return NL_STOP;
+
+	free(d->prefix);
+	d->prefix = prefix;
+	d->idx = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+	d->sys_image_guid = nla_get_u64(tb[RDMA_NLDEV_ATTR_SYS_IMAGE_GUID]);
+	return NL_OK;
+}
+
+/*
+ * Dump all RDMA devices and let get_nldata_cb() fill in data for the one
+ * being renamed. Returns 0 on success or a negative error code.
+ */
+static int get_nldata(struct nl_sock *nl, struct data *data)
+{
+	struct nl_cb *cb;
+	int ret;
+
+	ret = nl_send_simple(
+		nl, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET),
+		NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP, NULL, 0);
+	if (ret < 0)
+		return ret;
+
+	cb = nl_cb_alloc(NL_CB_DEFAULT);
+	if (!cb)
+		return -ENOMEM;
+	nl_cb_set(cb, NL_CB_VALID, NL_CB_CUSTOM, get_nldata_cb, data);
+
+	do {
+		ret = nl_recvmsgs(nl, cb);
+	} while (ret > 0);
+
+	nl_cb_put(cb);
+	return ret;
+}
+
+enum name_policy {
+	NAME_KERNEL = 1 << 0,
+	NAME_PCI = 1 << 1,
+	NAME_GUID = 1 << 2,
+	NAME_ERROR = 1 << 8
+};
+
+/* Map a policy name from the command line to a mask of name_policy bits */
+static int str2policy(const char *np)
+{
+	if (!strcmp(np, "NAME_KERNEL"))
+		return NAME_KERNEL;
+	if (!strcmp(np, "NAME_PCI"))
+		return NAME_PCI;
+	if (!strcmp(np, "NAME_GUID"))
+		return NAME_GUID;
+	if (!strcmp(np, "NAME_FALLBACK"))
+		return NAME_PCI | NAME_GUID;
+	return NAME_ERROR;
+}
+
+/*
+ * Usage: rdma_rename <device> <policy>
+ * Exits 0 after printing the new name once the rename request was sent, or
+ * immediately for NAME_KERNEL; exits 1 on any failure.
+ */
+int main(int argc, const char *argv[])
+{
+	struct data d = { .idx = -1 };
+	struct nl_sock *nl;
+	int ret = -1;
+	int np;
+
+	if (argc != 3)
+		goto err;
+
+	np = str2policy(argv[2]);
+	if (np & NAME_ERROR)
+		goto err;
+
+	if (np & NAME_KERNEL)
+		/* Do nothing */
+		exit(0);
+
+	nl = nl_socket_alloc();
+	if (!nl)
+		goto err;
+
+	if (nl_connect(nl, NETLINK_RDMA))
+		goto out;
+
+	d.curr = argv[1];
+	ret = get_nldata(nl, &d);
+	if (ret)
+		goto out;
+	if (d.idx == -1) {
+		/* The dump worked but did not contain the requested device */
+		ret = -ENODEV;
+		goto out;
+	}
+
+	/*
+	 * Start from a failure so that a policy without NAME_PCI still runs
+	 * the GUID step instead of renaming to an unset name.
+	 */
+	ret = -EINVAL;
+	if (np & NAME_PCI)
+		ret = by_pci(&d);
+	if (ret && (np & NAME_GUID))
+		ret = by_guid(&d);
+	if (ret)
+		goto out;
+
+	ret = device_rename(nl, &d);
+	if (ret)
+		goto out;
+
+	printf("%s\n", d.name);
+
+out:
+	free(d.name);
+	free(d.prefix);
+	nl_socket_free(nl);
+err:
+	ret = (ret) ? 1 : 0;
+	exit(ret);
+}
-- 
2.19.1