[PATCH rdma-core v2 2/6] kernel-boot: Perform device rename to make stable names

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



From: Leon Romanovsky <leonro@xxxxxxxxxxxx>

Generalize the naming scheme for RDMA devices, so users will always
see names based on topology/GUID information. Such a naming scheme has
the big advantage that the names are fully automatic, fully predictable,
and stay fixed even if hardware is added or removed (i.e. no
re-enumeration takes place), and that broken hardware can be replaced
seamlessly.

The name is a combination of the link type (InfiniBand, RoCE, iWARP or OPA)
and the chosen naming policy: NAME_KERNEL, NAME_PCI, NAME_ONBOARD,
NAME_GUID or NAME_FALLBACK. These naming policies are controlled by a udev
rule and can be overridden by users.

 * NAME_KERNEL - don't change names and rely on the kernel assignment. This
   keeps RDMA names as before. Example: "mlx5_0".
 * NAME_PCI - read the PCI location and topology as a source for stable names,
   which won't change on any software event (reset, PCI probe, etc.).
   Example: "ibp0s12f4".
 * NAME_GUID - read the system image GUID, in a similar manner to the
   net MAC naming policy. Example: "rocex525400c0fe123455".
 * NAME_ONBOARD - read Firmware/BIOS provided index numbers for on-board devices.
 * NAME_FALLBACK - automatic fallback: NAME_ONBOARD->NAME_PCI->NAME_KERNEL

No doubt the new names are harder to read than the "mlx5_0" everybody
is used to, but being consistent in scripts is much more important.

As part of this change we add a special function to build the udev helper
binaries and install them in the proper place. Those files are expected to
be one level above the already declared general rules.d location; by default
that is either /usr/lib/udev or, for old distributions, /lib/udev.

This location does not need to be configured separately by users, because
they already provide -DCMAKE_INSTALL_UDEV_RULESDIR if they want to change it.

Signed-off-by: Leon Romanovsky <leonro@xxxxxxxxxxxx>
---
 kernel-boot/CMakeLists.txt               |  24 ++
 kernel-boot/rdma-persistent-naming.rules |  27 ++
 kernel-boot/rdma_rename.c                | 473 +++++++++++++++++++++++
 3 files changed, 524 insertions(+)
 create mode 100644 kernel-boot/rdma-persistent-naming.rules
 create mode 100644 kernel-boot/rdma_rename.c

diff --git a/kernel-boot/CMakeLists.txt b/kernel-boot/CMakeLists.txt
index 936c953e..e40a3169 100644
--- a/kernel-boot/CMakeLists.txt
+++ b/kernel-boot/CMakeLists.txt
@@ -16,6 +16,10 @@ install(FILES
   modules/roce.conf
   DESTINATION "${CMAKE_INSTALL_SYSCONFDIR}/rdma/modules")
 
+install(FILES "rdma-persistent-naming.rules"
+  RENAME "60-rdma-persistent-naming.rules"
+  DESTINATION "${CMAKE_INSTALL_UDEV_RULESDIR}")
+
 install(FILES "rdma-description.rules"
   RENAME "75-rdma-description.rules"
   DESTINATION "${CMAKE_INSTALL_UDEV_RULESDIR}")
@@ -37,3 +41,23 @@ install(FILES "rdma-umad.rules"
 install(FILES "persistent-ipoib.rules"
   RENAME "70-persistent-ipoib.rules"
   DESTINATION "${CMAKE_INSTALL_SYSCONFDIR}/udev/rules.d")
+
+# NOTE(review): this assigns CMAKE_C_FLAGS its own current value, which looks
+# like a no-op - confirm and drop it if nothing depends on it.
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
+
+# Create an installed executable (under /usr/lib/udev)
+#
+# EXEC is the target name; every remaining argument (ARGN) is a source file.
+# The target is linked against COMMON_LIBS, built into BUILD_BIN and installed
+# one directory above CMAKE_INSTALL_UDEV_RULESDIR.
+function(rdma_udev_executable EXEC)
+  add_executable(${EXEC} ${ARGN})
+  target_link_libraries(${EXEC} LINK_PRIVATE ${COMMON_LIBS})
+  set_target_properties(${EXEC} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_BIN}")
+  install(TARGETS ${EXEC} DESTINATION "${CMAKE_INSTALL_UDEV_RULESDIR}/../")
+endfunction()
+
+# rdma_rename is only built when NL_KIND is not 0; it additionally links
+# against NL_LIBRARIES.
+if (NOT NL_KIND EQUAL 0)
+  rdma_udev_executable(rdma_rename
+    rdma_rename.c
+  )
+
+  target_link_libraries(rdma_rename LINK_PRIVATE
+    ${NL_LIBRARIES}
+  )
+endif()
diff --git a/kernel-boot/rdma-persistent-naming.rules b/kernel-boot/rdma-persistent-naming.rules
new file mode 100644
index 00000000..774b2937
--- /dev/null
+++ b/kernel-boot/rdma-persistent-naming.rules
@@ -0,0 +1,27 @@
+# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB)
+# Copyright (c) 2019, Mellanox Technologies. All rights reserved. See COPYING file
+#
+# Rename modes:
+# NAME_FALLBACK - Try to name devices in the following order:
+#                 by-onboard -> by-pci -> kernel
+# NAME_KERNEL - leave name as kernel provided
+# NAME_PCI - based on PCI/slot/function location
+# NAME_GUID - based on system image GUID
+# NAME_ONBOARD - based on on-board device index
+#
+# The stable names are combination of device type technology and rename mode.
+# Infiniband - ib*
+# RoCE - roce*
+# iWARP - iw*
+# OPA - opa*
+#
+# Example:
+# * NAME_PCI
+#   pci = 0000:00:0c.4
+#   Device type = IB
+#   mlx5_0 -> ibp0s12f4
+# * NAME_GUID
+#   GUID = 5254:00c0:fe12:3455
+#   Device type = RoCE
+#   mlx5_0 -> rocex525400c0fe123455
+#
+# For every infiniband device that gets added, run rdma_rename with the
+# kernel device name (%k) as first argument and the naming policy as second.
+ACTION=="add", SUBSYSTEM=="infiniband", PROGRAM="rdma_rename %k NAME_FALLBACK"
diff --git a/kernel-boot/rdma_rename.c b/kernel-boot/rdma_rename.c
new file mode 100644
index 00000000..c50779a7
--- /dev/null
+++ b/kernel-boot/rdma_rename.c
@@ -0,0 +1,473 @@
+// SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB)
+/* Copyright (c) 2019, Mellanox Technologies. All rights reserved. See COPYING file */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <rdma/rdma_netlink.h>
+#include <netlink/netlink.h>
+#include <netlink/msg.h>
+#include <netlink/attr.h>
+#include <linux/pci_regs.h>
+
+/*
+ * Rename modes:
+ * NAME_FALLBACK - Try to name devices in the following order:
+ *                 by-onboard -> by-pci -> kernel
+ * NAME_KERNEL - leave name as kernel provided
+ * NAME_PCI - based on PCI/slot/function location
+ * NAME_GUID - based on system image GUID
+ * NAME_ONBOARD - based on on-board device index
+ *
+ * The stable names are combination of device type technology and rename mode.
+ * Infiniband - ib*
+ * RoCE - roce*
+ * iWARP - iw*
+ * OPA - opa*
+ *
+ * Example:
+ * NAME_PCI
+ *  pci = 0000:00:0c.4
+ *  Device type = IB
+ *  mlx5_0 -> ibp0s12f4
+ * NAME_GUID
+ *  GUID = 5254:00c0:fe12:3455
+ *  Device type = RoCE
+ *  mlx5_0 -> rocex525400c0fe123455
+ * NAME_ONBOARD
+ *  Index = 3
+ *  Device type = OPA
+ *  hfi1_1 -> opao3
+ */
+
+/*
+ * Validation policy handed to nlmsg_parse() in get_nldata_cb().
+ *
+ * Every attribute that the callback reads must have an entry here, otherwise
+ * its payload type/length is not checked before nla_get_*() is called on it.
+ * RDMA_NLDEV_ATTR_SYS_IMAGE_GUID is read with nla_get_u64(), so it is listed
+ * as NLA_U64 alongside the node GUID.
+ */
+static struct nla_policy policy[RDMA_NLDEV_ATTR_MAX] = {
+	[RDMA_NLDEV_ATTR_DEV_INDEX] = { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
+	[RDMA_NLDEV_ATTR_NODE_GUID] = { .type = NLA_U64 },
+	[RDMA_NLDEV_ATTR_SYS_IMAGE_GUID] = { .type = NLA_U64 },
+	[RDMA_NLDEV_ATTR_LINK_TYPE] = { .type = NLA_NUL_STRING },
+};
+
+/* State shared between main(), the netlink query and the naming helpers. */
+struct data {
+	/* Current device name; main() takes it from argv[1] */
+	const char *curr;
+	/* Name prefix; filled from RDMA_NLDEV_ATTR_LINK_TYPE by get_nldata_cb() */
+	const char *prefix;
+	/* Filled from RDMA_NLDEV_ATTR_SYS_IMAGE_GUID; by_guid() treats 0 as "no GUID" */
+	uint64_t sys_image_guid;
+	/* New name, heap-allocated by one of the by_*() helpers; freed by main() */
+	char *name;
+	/* Device index from RDMA_NLDEV_ATTR_DEV_INDEX; main() starts it at -1 */
+	int idx;
+};
+
+#define ONBOARD_INDEX_MAX (16*1024-1)
+/*
+ * Build a name from the firmware-provided on-board index: "<prefix>o<index>".
+ *
+ * The index is read from the device's "acpi_index" sysfs attribute and, if
+ * that file cannot be opened, from its "index" attribute.
+ *
+ * On success d->name holds a heap-allocated string and 0 is returned.
+ * A negative errno value is returned when the sysfs path does not fit, no
+ * index file exists, the index cannot be parsed or is out of range, or the
+ * allocation fails.
+ */
+static int by_onboard(struct data *d)
+{
+	char acpi[256], index[256];
+	unsigned int o;
+	FILE *fp;
+	int ret;
+
+	/*
+	 * ACPI_DSM - device specific method for naming
+	 * PCI or PCI Express device
+	 *
+	 * d->curr comes from the command line, so bound the write and
+	 * reject names that would not fit instead of overflowing.
+	 */
+	ret = snprintf(acpi, sizeof(acpi),
+		       "/sys/class/infiniband/%s/device/acpi_index", d->curr);
+	if (ret < 0 || (size_t)ret >= sizeof(acpi))
+		return -ENAMETOOLONG;
+
+	/* SMBIOS type 41 - Onboard Devices Extended Information */
+	ret = snprintf(index, sizeof(index),
+		       "/sys/class/infiniband/%s/device/index", d->curr);
+	if (ret < 0 || (size_t)ret >= sizeof(index))
+		return -ENAMETOOLONG;
+
+	fp = fopen(acpi, "r");
+	if (!fp)
+		fp = fopen(index, "r");
+	if (!fp)
+		return -ENOENT;
+
+	ret = fscanf(fp, "%u", &o);
+	fclose(fp);
+	/*
+	 * fscanf() returns EOF (non-zero) on an empty file, so require exactly
+	 * one converted item before looking at 'o'.
+	 *
+	 * https://github.com/systemd/systemd/blob/master/src/udev/udev-builtin-net_id.c#L263
+	 */
+	if (ret != 1 || o > ONBOARD_INDEX_MAX)
+		return -ENOENT;
+
+	ret = asprintf(&d->name, "%so%u", d->prefix, o);
+	if (ret < 0) {
+		/* contents of d->name are undefined after a failed asprintf() */
+		d->name = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
+ * Look up the slot number of a PCI device.
+ *
+ * devname is the sysfs path of the device and pci its address in the form
+ * "domain:bus:slot.function" (e.g. "0000:00:0c.4"). Every numerically named
+ * entry below "<devname>/subsystem/slots" is inspected and its "address"
+ * file is compared with the part of pci that precedes the function number.
+ *
+ * Returns the (positive) slot number on a match and 0 when there is no
+ * match or on any error.
+ */
+static int find_sun(char *devname, char *pci)
+{
+	struct dirent *dent;
+	char slots[1024];
+	char bof[256];
+	size_t len;
+	char *dot;
+	DIR *dir;
+	int ret;
+
+	/*
+	 * Strip ".function" from the address. A "%s.%s" scan cannot do this
+	 * because the first %s also consumes the dot, so split by hand.
+	 */
+	dot = strrchr(pci, '.');
+	if (!dot || dot == pci || dot[1] == '\0')
+		return 0;
+	len = (size_t)(dot - pci);
+	if (len >= sizeof(bof))
+		return 0;
+	memcpy(bof, pci, len);
+	bof[len] = '\0';
+
+	ret = snprintf(slots, sizeof(slots), "%s/subsystem/slots", devname);
+	if (ret < 0 || (size_t)ret >= sizeof(slots))
+		return 0;
+
+	dir = opendir(slots);
+	if (!dir)
+		return 0;
+
+	ret = 0;
+	while ((dent = readdir(dir))) {
+		char str[2048], address[256];
+		FILE *fp;
+		int i, n;
+
+		if (dent->d_name[0] == '.')
+			continue;
+		/* only entries whose name is a positive number are slots */
+		i = atoi(dent->d_name);
+		if (i <= 0)
+			continue;
+
+		n = snprintf(str, sizeof(str), "%s/%s/address", slots,
+			     dent->d_name);
+		if (n < 0 || (size_t)n >= sizeof(str))
+			break;
+
+		fp = fopen(str, "r");
+		if (!fp)
+			break;
+
+		/* width keeps the read inside address[] */
+		n = fscanf(fp, "%255s", address);
+		fclose(fp);
+		if (n != 1)
+			break;
+
+		if (!strcmp(bof, address)) {
+			ret = i;
+			break;
+		}
+	}
+
+	closedir(dir);
+	return ret;
+}
+
+/*
+ * Tell whether the PCI device at sysfs path devname is a multi-function
+ * device.
+ *
+ * Reads the first 64 bytes of the device's PCI configuration space from
+ * "<devname>/config" and tests bit 7 of the header type byte.
+ *
+ * Returns non-zero for a multi-function device, 0 otherwise or when the
+ * configuration space cannot be read.
+ */
+static int is_pci_multifunction(char *devname)
+{
+	unsigned char c[64] = {};
+	char config[512];
+	size_t nread;
+	FILE *fp;
+	int ret;
+
+	/* the header lives in "config", not in the "ari_enabled" flag file */
+	ret = snprintf(config, sizeof(config), "%s/config", devname);
+	if (ret < 0 || (size_t)ret >= sizeof(config))
+		return 0;
+
+	fp = fopen(config, "r");
+	if (!fp)
+		return 0;
+
+	nread = fread(c, 1, sizeof(c), fp);
+	fclose(fp);
+	if (nread != sizeof(c))
+		return 0;
+
+	/* bit 0-6 header type, bit 7 multi/single function device */
+	return c[PCI_HEADER_TYPE] & 0x80;
+}
+
+/*
+ * Tell whether ARI is enabled for the PCI device at sysfs path devname.
+ *
+ * Reads an integer from "<devname>/ari_enabled".
+ *
+ * Returns 1 when the file holds the value 1, and 0 otherwise, including
+ * when the file is missing, empty or not a number.
+ */
+static int is_pci_ari_enabled(char *devname)
+{
+	char ari[512];
+	int ret, a;
+	FILE *fp;
+
+	ret = snprintf(ari, sizeof(ari), "%s/ari_enabled", devname);
+	if (ret < 0 || (size_t)ret >= sizeof(ari))
+		return 0;
+
+	fp = fopen(ari, "r");
+	if (!fp)
+		return 0;
+
+	ret = fscanf(fp, "%d", &a);
+	fclose(fp);
+	/* EOF is non-zero too; 'a' is only valid after one conversion */
+	if (ret != 1)
+		return 0;
+
+	return a == 1;
+}
+
+/*
+ * Build a name from the PCI location of the device:
+ *
+ *   <prefix>[P<domain>](s<slot number> | p<bus>s<slot>)[f<function>]
+ *
+ * The domain part is only added for a non-zero domain, the slot number form
+ * is used when find_sun() finds one, and the function part is added for a
+ * non-zero function or a multi-function device.
+ *
+ * On success d->name holds a heap-allocated string and 0 is returned.
+ * A negative errno value is returned for devices that are not on the "pci"
+ * subsystem, when sysfs cannot be read or parsed, or when out of memory;
+ * d->name is not left pointing at an allocation in that case.
+ */
+static int by_pci(struct data *d)
+{
+	unsigned long domain, bus, slot, func;
+	char dom[32] = "", loc[64], fn[32] = "";
+	char subsystem[256];
+	char devpath[256];
+	char link[256];
+	ssize_t len;
+	char *pci;
+	int sun;
+	int ret;
+
+	ret = snprintf(subsystem, sizeof(subsystem),
+		       "/sys/class/infiniband/%s/device/subsystem", d->curr);
+	if (ret < 0 || (size_t)ret >= sizeof(subsystem))
+		return -ENOMEM;
+
+	/* readlink() does not NUL-terminate, do it explicitly */
+	len = readlink(subsystem, link, sizeof(link) - 1);
+	if (len == -1)
+		return -EINVAL;
+	link[len] = '\0';
+
+	if (strcmp(basename(link), "pci"))
+		/* Bail out virtual devices */
+		return -EINVAL;
+
+	/* Real devices */
+	ret = snprintf(devpath, sizeof(devpath),
+		       "/sys/class/infiniband/%s/device", d->curr);
+	if (ret < 0 || (size_t)ret >= sizeof(devpath))
+		return -ENOMEM;
+
+	len = readlink(devpath, link, sizeof(link) - 1);
+	if (len == -1)
+		return -EINVAL;
+	link[len] = '\0';
+
+	/*
+	 * pci = 0000:00:0c.0
+	 *
+	 * pci points into link[], which is not written to again below, so
+	 * it is still intact when handed to find_sun().
+	 */
+	pci = basename(link);
+	if (sscanf(pci, "%lx:%lx:%lx.%lu", &domain, &bus, &slot, &func) != 4)
+		return -ENOENT;
+
+	if (is_pci_ari_enabled(devpath))
+		/*
+		 * ARI devices support up to 256 functions on a single device
+		 * ("slot"), and interpret the traditional 5-bit slot and 3-bit
+		 * function number as a single 8-bit function number, where the
+		 * slot makes up the upper 5 bits.
+		 *
+		 * https://github.com/systemd/systemd/blob/master/src/udev/udev-builtin-net_id.c#L344
+		 */
+		func += slot * 8;
+
+	/* Each piece is at most a letter plus 20 digits, so these cannot truncate */
+	if (domain > 0)
+		snprintf(dom, sizeof(dom), "P%lu", domain);
+
+	sun = find_sun(devpath, pci);
+	if (sun > 0)
+		snprintf(loc, sizeof(loc), "s%d", sun);
+	else
+		snprintf(loc, sizeof(loc), "p%lus%lu", bus, slot);
+
+	if (func > 0 || is_pci_multifunction(devpath))
+		snprintf(fn, sizeof(fn), "f%lu", func);
+
+	/* One allocation of exactly the needed size, whatever the prefix length */
+	ret = asprintf(&d->name, "%s%s%s%s", d->prefix, dom, loc, fn);
+	if (ret < 0) {
+		d->name = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
+ * Build a name from the system image GUID: "<prefix>x<16 hex digits>".
+ *
+ * The GUID is printed most significant 16-bit group first. The groups are
+ * extracted with shifts so the result does not depend on the byte order of
+ * the host.
+ *
+ * On success d->name holds a heap-allocated string and 0 is returned.
+ * When the GUID is zero or the allocation fails, d->name is set to NULL and
+ * -ENOMEM is returned.
+ */
+static int by_guid(struct data *d)
+{
+	uint64_t guid = d->sys_image_guid;
+	int ret;
+
+	if (!guid) {
+		/* virtual devices start without GUID */
+		d->name = NULL;
+		return -ENOMEM;
+	}
+
+	ret = asprintf(&d->name, "%sx%04x%04x%04x%04x", d->prefix,
+		       (unsigned int)((guid >> 48) & 0xffff),
+		       (unsigned int)((guid >> 32) & 0xffff),
+		       (unsigned int)((guid >> 16) & 0xffff),
+		       (unsigned int)(guid & 0xffff));
+	if (ret < 0) {
+		d->name = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
+ * Ask the kernel to rename the device.
+ *
+ * Sends an RDMA_NLDEV_CMD_SET message on nl carrying the device index
+ * (d->idx) and the new name (d->name).
+ *
+ * Returns 0 when the message was sent and a negative value otherwise. The
+ * message is released on every path, including a failed send.
+ */
+static int device_rename(struct nl_sock *nl, struct data *d)
+{
+	struct nlmsghdr *hdr;
+	struct nl_msg *msg;
+	int ret;
+
+	msg = nlmsg_alloc();
+	if (!msg)
+		return -ENOMEM;
+
+	hdr = nlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ,
+			RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_SET),
+			0, 0);
+	if (!hdr) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, d->idx);
+	if (ret)
+		goto out;
+	ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, d->name);
+	if (ret)
+		goto out;
+
+	/* fall through to the common cleanup so msg is freed on failure too */
+	ret = nl_send_auto(nl, msg);
+out:
+	nlmsg_free(msg);
+	return (ret < 0) ? ret : 0;
+}
+
+/*
+ * Netlink callback: pick the entry of the device named d->curr out of the
+ * RDMA_NLDEV_CMD_GET dump.
+ *
+ * data is the struct data passed to nl_cb_set(). For the matching device
+ * the prefix, index and system image GUID are stored in it.
+ *
+ * Returns NL_SKIP for other devices, NL_OK for the matching one, and NL_STOP
+ * when the message cannot be parsed, lacks a required attribute, or memory
+ * runs out.
+ */
+static int get_nldata_cb(struct nl_msg *msg, void *data)
+{
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX] = {};
+	struct nlmsghdr *hdr = nlmsg_hdr(msg);
+	struct data *d = data;
+	char *prefix;
+	int ret;
+
+	ret = nlmsg_parse(hdr, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, policy);
+	if (ret < 0)
+		return NL_STOP;
+
+	if (!tb[RDMA_NLDEV_ATTR_DEV_NAME] || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] ||
+	    !tb[RDMA_NLDEV_ATTR_SYS_IMAGE_GUID] ||
+	    !tb[RDMA_NLDEV_ATTR_LINK_TYPE])
+		return NL_STOP;
+
+	ret = strcmp(d->curr, nla_get_string(tb[RDMA_NLDEV_ATTR_DEV_NAME]));
+	if (ret)
+		return NL_SKIP;
+
+	/*
+	 * nla_get_string() returns a pointer into msg, which this function
+	 * does not own, while d->prefix is used long after the callback has
+	 * returned. Keep a private copy. The copy is intentionally left to
+	 * be reclaimed at process exit.
+	 */
+	prefix = strdup(nla_get_string(tb[RDMA_NLDEV_ATTR_LINK_TYPE]));
+	if (!prefix)
+		/* d->idx stays untouched, so the caller sees "not found" */
+		return NL_STOP;
+
+	d->prefix = prefix;
+	d->idx = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+	d->sys_image_guid = nla_get_u64(tb[RDMA_NLDEV_ATTR_SYS_IMAGE_GUID]);
+	return NL_OK;
+}
+
+/*
+ * Query the kernel for all RDMA devices and let get_nldata_cb() fill data
+ * for the one that matches data->curr.
+ *
+ * Returns 0 on success and a negative value when the request cannot be
+ * sent, the callback set cannot be set up, or receiving fails. Success does
+ * not imply the device was found; the caller checks data->idx for that.
+ */
+static int get_nldata(struct nl_sock *nl, struct data *data)
+{
+	struct nl_cb *cb;
+	int ret;
+
+	ret = nl_send_simple(
+		nl, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET),
+		NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP, NULL, 0);
+	if (ret < 0)
+		return ret;
+
+	cb = nl_cb_alloc(NL_CB_DEFAULT);
+	if (!cb)
+		return -ENOMEM;
+
+	ret = nl_cb_set(cb, NL_CB_VALID, NL_CB_CUSTOM, get_nldata_cb,
+			(void *)data);
+	if (ret < 0)
+		goto out;
+
+	do {
+		ret = nl_recvmsgs(nl, cb);
+	} while (ret > 0);
+
+out:
+	/* drop the reference taken by nl_cb_alloc() */
+	nl_cb_put(cb);
+	return ret;
+}
+
+
+/*
+ * Naming policies as bit flags. str2policy() may OR several of them
+ * together and main() tests each one with '&'.
+ */
+enum name_policy {
+	/* keep the kernel-assigned name; main() exits without renaming */
+	NAME_KERNEL = 1 << 0,
+	/* name from the PCI location, see by_pci() */
+	NAME_PCI = 1 << 1,
+	/* name from the system image GUID, see by_guid() */
+	NAME_GUID = 1 << 2,
+	/* name from the on-board index, see by_onboard() */
+	NAME_ONBOARD = 1 << 3,
+	/* returned by str2policy() for an unknown policy string */
+	NAME_ERROR = 1 << 8
+};
+
+/*
+ * Translate a policy name given on the command line into a set of
+ * enum name_policy flags.
+ *
+ * np must be a NUL-terminated string. "NAME_FALLBACK" maps to
+ * NAME_ONBOARD | NAME_PCI; the other names map to their single flag.
+ *
+ * Returns the flag set, or NAME_ERROR for an unknown name.
+ */
+static int str2policy(const char *np)
+{
+	static const struct {
+		const char *name;
+		int policy;
+	} modes[] = {
+		{ "NAME_KERNEL", NAME_KERNEL },
+		{ "NAME_PCI", NAME_PCI },
+		{ "NAME_GUID", NAME_GUID },
+		{ "NAME_ONBOARD", NAME_ONBOARD },
+		{ "NAME_FALLBACK", NAME_ONBOARD | NAME_PCI },
+	};
+	size_t i;
+
+	for (i = 0; i < sizeof(modes) / sizeof(modes[0]); i++) {
+		if (!strcmp(np, modes[i].name))
+			return modes[i].policy;
+	}
+
+	return NAME_ERROR;
+}
+
+/*
+ * Usage: rdma_rename <current device name> <naming policy>
+ *
+ * Looks the device up over netlink, derives a new name according to the
+ * policy (on-board index first, then PCI location, then GUID, each only if
+ * enabled by the policy and if the previous one failed), renames the device
+ * and prints the new name on stdout.
+ *
+ * Exit status is 0 when the device was renamed or the policy is
+ * NAME_KERNEL, and 1 on any failure, including a device that was not found.
+ */
+int main(int argc, const char *argv[])
+{
+	struct data d = { .idx = -1 };
+	struct nl_sock *nl;
+	int ret = -1;
+	int np;
+
+	if (argc != 3)
+		return 1;
+
+	np = str2policy(argv[2]);
+	if (np & NAME_ERROR)
+		return 1;
+
+	if (np & NAME_KERNEL)
+		/* Do nothing */
+		return 0;
+
+	nl = nl_socket_alloc();
+	if (!nl)
+		return 1;
+
+	if (nl_connect(nl, NETLINK_RDMA))
+		goto out;
+
+	d.curr = argv[1];
+	ret = get_nldata(nl, &d);
+	if (ret)
+		goto out;
+	if (d.idx == -1) {
+		/* the dump succeeded but never mentioned our device */
+		ret = -1;
+		goto out;
+	}
+
+	ret = -1;
+	if (np & NAME_ONBOARD)
+		ret = by_onboard(&d);
+	if (ret && (np & NAME_PCI))
+		ret = by_pci(&d);
+	if (ret && (np & NAME_GUID))
+		ret = by_guid(&d);
+	if (ret)
+		goto out;
+
+	ret = device_rename(nl, &d);
+	if (ret)
+		goto out;
+
+	printf("%s\n", d.name);
+
+out:
+	/* d.name is NULL unless a by_*() helper allocated it */
+	free(d.name);
+	nl_socket_free(nl);
+	return ret ? 1 : 0;
+}
-- 
2.20.1




[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Photo]     [Yosemite News]     [Yosemite Photos]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux