From: Leon Romanovsky <leonro@xxxxxxxxxxxx>

Generalize the naming scheme for RDMA devices, so users will always see names based on topology/GUID information. Such a naming scheme has the big advantage that the names are fully automatic, fully predictable, and stay fixed even if hardware is added or removed (i.e. no re-enumeration takes place), and that broken hardware can be replaced seamlessly.

The naming policy can be chosen from NAME_KERNEL, NAME_PCI, NAME_GUID or NAME_FALLBACK, and is controlled by a udev rule.

* NAME_KERNEL - don't change names and rely on kernel assignment. This will keep RDMA names as before. Example: "mlx5_0".
* NAME_PCI - read PCI location and topology as a source for stable names, which won't change on any software event (reset, PCI probe, etc.). Example: "mlxp0s12f4".
* NAME_GUID - read system image GUID information in a manner similar to the net MAC naming policy. Example: "mlxx525400c0fe123455".
* NAME_FALLBACK - automatic fallback: NAME_PCI->NAME_GUID->NAME_KERNEL

No doubt the new names are harder to read than the "mlx5_0" everybody is used to, but being consistent in scripts is much more important. As a matter of precaution, we set the default naming policy to NAME_KERNEL, but will change it later to NAME_FALLBACK.
Signed-off-by: Leon Romanovsky <leonro@xxxxxxxxxxxx> --- kernel-boot/CMakeLists.txt | 16 ++ kernel-boot/rdma-persistent-naming.rules | 19 ++ kernel-boot/rdma_rename.c | 286 +++++++++++++++++++++++ 3 files changed, 321 insertions(+) create mode 100644 kernel-boot/rdma-persistent-naming.rules create mode 100644 kernel-boot/rdma_rename.c diff --git a/kernel-boot/CMakeLists.txt b/kernel-boot/CMakeLists.txt index 936c953e..cf440565 100644 --- a/kernel-boot/CMakeLists.txt +++ b/kernel-boot/CMakeLists.txt @@ -20,6 +20,10 @@ install(FILES "rdma-description.rules" RENAME "75-rdma-description.rules" DESTINATION "${CMAKE_INSTALL_UDEV_RULESDIR}") +install(FILES "rdma-persistent-naming.rules" + RENAME "99-rdma-persistent-naming.rules" + DESTINATION "${CMAKE_INSTALL_UDEV_RULESDIR}") + install(FILES "rdma-hw-modules.rules" RENAME "90-rdma-hw-modules.rules" DESTINATION "${CMAKE_INSTALL_UDEV_RULESDIR}") @@ -37,3 +41,15 @@ install(FILES "rdma-umad.rules" install(FILES "persistent-ipoib.rules" RENAME "70-persistent-ipoib.rules" DESTINATION "${CMAKE_INSTALL_SYSCONFDIR}/udev/rules.d") + +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") + +if (NOT NL_KIND EQUAL 0) + rdma_udev_executable(rdma_rename + rdma_rename.c + ) + + target_link_libraries(rdma_rename LINK_PRIVATE + ${NL_LIBRARIES} + ) +endif() diff --git a/kernel-boot/rdma-persistent-naming.rules b/kernel-boot/rdma-persistent-naming.rules new file mode 100644 index 00000000..915328f1 --- /dev/null +++ b/kernel-boot/rdma-persistent-naming.rules @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2019, Mellanox Technologies. All rights reserved. 
See COPYING file +# Rename modes: +# Rename modes: +# NAME_FALLBACK - Try to name devices in the following order: +# by-pci -> by-guid -> kernel +# NAME_KERNEL - leave name as kernel provided +# NAME_PCI - based on PCI/slot/function location +# NAME_GUID - based on system image GUID +# +# Example: +# * NAME_PCI +# pci = 0000:00:0c.4 +# mlx5_0 -> mlxp0s12f4 +# * NAME_GUID +# Example: +# mlx5_0 with GUID 5254:00c0:fe12:3455 -> mlxx525400c0fe123455 +# +ACTION=="add", SUBSYSTEM=="infiniband", PROGRAM="rdma_rename %k NAME_KERNEL" diff --git a/kernel-boot/rdma_rename.c b/kernel-boot/rdma_rename.c new file mode 100644 index 00000000..c7c85d86 --- /dev/null +++ b/kernel-boot/rdma_rename.c @@ -0,0 +1,286 @@ +// SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +/* Copyright (c) 2019, Mellanox Technologies. All rights reserved. See COPYING file */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <unistd.h> +#include <rdma/rdma_netlink.h> +#include <netlink/netlink.h> +#include <netlink/msg.h> +#include <netlink/attr.h> + +/* + * Rename modes: + * NAME_FALLBACK - Try to name devices in the following order: + * by-pci -> by-guid -> kernel + * NAME_KERNEL - leave name as kernel provided + * NAME_PCI - based on PCI/slot/function location + * NAME_GUID - based on system image GUID + * Example: + * mlx5_0 -> mlxp0s3f2 + * NAME_GUID - based on system image GUID + * Example: + * mlx5_0 with GUID 5254:00c0:fe12:3455 -> mlxx525400c0fe123455 + */ + +static struct nla_policy policy[RDMA_NLDEV_ATTR_MAX] = { + [RDMA_NLDEV_ATTR_DEV_INDEX] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, + [RDMA_NLDEV_ATTR_NODE_GUID] = { .type = NLA_U64 }, +}; + +struct data { + const char *curr; + char *prefix; + uint64_t sys_image_guid; + char *name; + int idx; +}; + +static int by_pci(struct data *d) +{ + char *path, *token, *pci; + char buf[256]; + long p, s, f; + ssize_t len; + int ret; + + ret = 
asprintf(&path, "/sys/class/infiniband/%s", d->curr); + if (ret == -1) { + path = NULL; + ret = -ENOMEM; + goto out; + } + + len = readlink(path, buf, sizeof(buf)-1); + if (len == -1) { + ret = -EINVAL; + goto out; + } + pci = buf + strlen("../../devices/pci0000:00/"); + pci[strlen("0000:00:0c0.0")] = '\0'; + /* + * pci = 0000:00:0c.0 + */ + ret = -1; + token = strtok(pci, ":"); + token = strtok(NULL, ":"); + p = strtol(token, NULL, 16); + + token = strtok(NULL, ":"); + s = strtol(token, NULL, 16); + + token = strtok(token, "."); + token = strtok(NULL, "."); + f = strtol(token, NULL, 16); + + ret = asprintf(&d->name, "%sp%lds%ldf%ld", d->prefix, p, s, f); + if (ret == -1) { + ret = -ENOMEM; + d->name = NULL; + goto out; + } + +out: + free(path); + return (ret < 0) ? ret : 0; +} + +static int by_guid(struct data *d) +{ + uint16_t vp[4]; + int ret = -1; + + if (!d->sys_image_guid) + /* virtual devices start without GUID */ + goto out; + + memcpy(vp, &d->sys_image_guid, sizeof(uint64_t)); + ret = asprintf(&d->name, "%sx%04x%04x%04x%04x", d->prefix, vp[3], vp[2], + vp[1], vp[0]); +out: + if (ret == -1) { + d->name = NULL; + return -ENOMEM; + } + + return 0; +} + +/* Cut initial string in device name: + * mlx5_0 -> mlx + * cxgb3_0 -> cxgb + */ +static int cut_prefix(struct data *d) +{ + char cset[] = "1234567890_-"; + char *pos; + + d->prefix = strdup(d->curr); + if (!d->prefix) + return -ENOMEM; + + pos = strpbrk(d->prefix, cset); + if (pos) + *pos = '\0'; + return 0; +} + +static int device_rename(struct nl_sock *nl, struct data *d) +{ + struct nlmsghdr *hdr; + struct nl_msg *msg; + int ret; + + msg = nlmsg_alloc(); + if (!msg) + return -ENOMEM; + + hdr = nlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_SET), + 0, 0); + if (!hdr) { + ret = -ENOMEM; + goto out; + } + + ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, d->idx); + if (ret) + goto out; + ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, d->name); + if (ret) + 
goto out; + ret = nl_send_auto(nl, msg); + if (ret < 0) + return ret; +out: + nlmsg_free(msg); + return (ret < 0) ? ret : 0; +} + + +static int get_nldata_cb(struct nl_msg *msg, void *data) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX] = {}; + struct nlmsghdr *hdr = nlmsg_hdr(msg); + struct data *d = data; + int ret; + + ret = nlmsg_parse(hdr, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, policy); + if (ret < 0) + return NL_STOP; + + if (!tb[RDMA_NLDEV_ATTR_DEV_NAME] || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || + !tb[RDMA_NLDEV_ATTR_SYS_IMAGE_GUID]) + return NL_STOP; + + ret = strcmp(d->curr, nla_get_string(tb[RDMA_NLDEV_ATTR_DEV_NAME])); + if (ret) + return NL_SKIP; + + d->idx = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + d->sys_image_guid = nla_get_u64(tb[RDMA_NLDEV_ATTR_SYS_IMAGE_GUID]); + return NL_OK; +} + +static int get_nldata(struct nl_sock *nl, struct data *data) +{ + struct nl_cb *cb; + int ret; + + ret = nl_send_simple( + nl, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET), + NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP, NULL, 0); + if (ret < 0) + return ret; + + cb = nl_cb_alloc(NL_CB_DEFAULT); + nl_cb_set(cb, NL_CB_VALID, NL_CB_CUSTOM, get_nldata_cb, (void *)data); + + do { + ret = nl_recvmsgs(nl, cb); + } while (ret > 0); + + return ret; +} + +enum name_policy { + NAME_KERNEL = 1 << 0, + NAME_PCI = 1 << 1, + NAME_GUID = 1 << 2, + NAME_ERROR = 1 << 8 +}; + +static int str2policy(const char *np) +{ + if (!strcmp(np, "NAME_KERNEL")) + return NAME_KERNEL; + if (!strcmp(np, "NAME_PCI")) + return NAME_PCI; + if (!strcmp(np, "NAME_GUID")) + return NAME_GUID; + if (!strcmp(np, "NAME_FALLBACK")) + return NAME_PCI | NAME_GUID; + return NAME_ERROR; +}; + +int main(int argc, const char *argv[]) +{ + struct data d = { .idx = -1 }; + struct nl_sock *nl; + int ret = -1; + int np; + + if (argc != 3) + goto err; + + np = str2policy(argv[2]); + if (np & NAME_ERROR) + goto err; + + if (np & NAME_KERNEL) + /* Do nothing */ + exit(0); + + nl = nl_socket_alloc(); + if (!nl) + goto err; + + if 
(nl_connect(nl, NETLINK_RDMA)) + goto sock; + + d.curr = argv[1]; + ret = get_nldata(nl, &d); + if (ret || d.idx == -1) + goto sock; + + ret = cut_prefix(&d); + if (ret) + goto sock; + + if (np & NAME_PCI) + ret = by_pci(&d); + + if (ret && (np & NAME_GUID)) + ret = by_guid(&d); + if (ret) + goto out; + + ret = device_rename(nl, &d); + if (ret) + goto out; + + printf("%s\n", d.name); + free(d.name); + +out: + free(d.prefix); +sock: + nl_socket_free(nl); +err: + ret = (ret) ? 1 : 0; + exit(ret); +} -- 2.19.1