[PATCH rdma-core v1 2/6] kernel-boot: Perform device rename to make stable names

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



From: Leon Romanovsky <leonro@xxxxxxxxxxxx>

Generalize the naming scheme for RDMA devices, so users will always
see names based on topology/GUID information. Such a naming scheme has
the big advantage that the names are fully automatic, fully predictable
and stay fixed even if hardware is added or removed (i.e. no
re-enumeration takes place), and that broken hardware can be replaced
seamlessly.

The name is a combination of the link type (InfiniBand, RoCE, iWARP or OPA)
and the chosen naming policy: NAME_KERNEL, NAME_PCI, NAME_GUID
or NAME_FALLBACK. The naming policy is selected by a udev rule
and can be overridden by users.

 * NAME_KERNEL - don't change names and rely on kernel assignment. This
   will keep RDMA names as before. Example: "mlx5_0".
 * NAME_PCI - read PCI location and topology as a source for stable names,
   which won't change on any software event (reset, PCI probe, etc.).
   Example: "ibp0s12f4".
 * NAME_GUID - read system image GUID information in a similar manner to
   the net MAC naming policy. Example: "rocex525400c0fe123455".
 * NAME_FALLBACK - automatic fallback: NAME_PCI->NAME_GUID->NAME_KERNEL

No doubt the new names are harder to read than the "mlx5_0" everybody
is used to, but being consistent in scripts is much more important.

As part of this change we add a special function to build UDEV helper
binaries and install them in the proper place. Those files are expected
to be one level above the already declared general rules.d location; in
the default case that is either /usr/lib/udev or, for old distributions,
/lib/udev.

This location does not need to be configured by users because they
already provide -DCMAKE_INSTALL_UDEV_RULESDIR if they want to change it.

Signed-off-by: Leon Romanovsky <leonro@xxxxxxxxxxxx>
---
 kernel-boot/CMakeLists.txt               |  24 ++
 kernel-boot/rdma-persistent-naming.rules |  27 +++
 kernel-boot/rdma_rename.c                | 279 +++++++++++++++++++++++
 3 files changed, 330 insertions(+)
 create mode 100644 kernel-boot/rdma-persistent-naming.rules
 create mode 100644 kernel-boot/rdma_rename.c

diff --git a/kernel-boot/CMakeLists.txt b/kernel-boot/CMakeLists.txt
index 936c953e..e40a3169 100644
--- a/kernel-boot/CMakeLists.txt
+++ b/kernel-boot/CMakeLists.txt
@@ -16,6 +16,10 @@ install(FILES
   modules/roce.conf
   DESTINATION "${CMAKE_INSTALL_SYSCONFDIR}/rdma/modules")
 
+install(FILES "rdma-persistent-naming.rules"
+  RENAME "60-rdma-persistent-naming.rules"
+  DESTINATION "${CMAKE_INSTALL_UDEV_RULESDIR}")
+
 install(FILES "rdma-description.rules"
   RENAME "75-rdma-description.rules"
   DESTINATION "${CMAKE_INSTALL_UDEV_RULESDIR}")
@@ -37,3 +41,23 @@ install(FILES "rdma-umad.rules"
 install(FILES "persistent-ipoib.rules"
   RENAME "70-persistent-ipoib.rules"
   DESTINATION "${CMAKE_INSTALL_SYSCONFDIR}/udev/rules.d")
+
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
+
+# Create an installed executable (under /usr/lib/udev)
+#  EXEC - name of the target; the remaining arguments (ARGN) are passed to
+#         add_executable() as its sources.
+# The target is linked against COMMON_LIBS, built into BUILD_BIN and
+# installed one directory above CMAKE_INSTALL_UDEV_RULESDIR.
+function(rdma_udev_executable EXEC)
+  add_executable(${EXEC} ${ARGN})
+  target_link_libraries(${EXEC} LINK_PRIVATE ${COMMON_LIBS})
+  set_target_properties(${EXEC} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_BIN}")
+  install(TARGETS ${EXEC} DESTINATION "${CMAKE_INSTALL_UDEV_RULESDIR}/../")
+endfunction()
+
+# rdma_rename talks to the kernel through libnl, so it is only built when
+# NL_KIND is non-zero.
+# NOTE(review): presumably NL_KIND == 0 means no usable libnl was found -
+# confirm against the top-level CMakeLists.txt.
+if (NOT NL_KIND EQUAL 0)
+  rdma_udev_executable(rdma_rename
+    rdma_rename.c
+  )
+
+  target_link_libraries(rdma_rename LINK_PRIVATE
+    ${NL_LIBRARIES}
+  )
+endif()
diff --git a/kernel-boot/rdma-persistent-naming.rules b/kernel-boot/rdma-persistent-naming.rules
new file mode 100644
index 00000000..b1a92f1c
--- /dev/null
+++ b/kernel-boot/rdma-persistent-naming.rules
@@ -0,0 +1,27 @@
+# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB)
+# Copyright (c) 2019, Mellanox Technologies. All rights reserved. See COPYING file
+#
+# Rename modes:
+# NAME_FALLBACK - Try to name devices in the following order:
+#                 by-pci -> by-guid -> kernel
+# NAME_KERNEL - leave name as kernel provided
+# NAME_PCI - based on PCI/slot/function location
+# NAME_GUID - based on system image GUID
+#
+# The stable names are a combination of the device type technology and the rename mode.
+# Infiniband - ib*
+# RoCE - roce*
+# iWARP - iw*
+# OPA - opa*
+#
+# Example:
+# * NAME_PCI
+#   pci = 0000:00:0c.4
+#   Device type = IB
+#   mlx5_0 -> ibp0s12f4
+# * NAME_GUID
+#   GUID = 5254:00c0:fe12:3455
+#   Device type = RoCE
+#   mlx5_0 -> rocex525400c0fe123455
+#
+ACTION=="add", SUBSYSTEM=="infiniband", PROGRAM="rdma_rename %k NAME_FALLBACK"
diff --git a/kernel-boot/rdma_rename.c b/kernel-boot/rdma_rename.c
new file mode 100644
index 00000000..15be2d3a
--- /dev/null
+++ b/kernel-boot/rdma_rename.c
@@ -0,0 +1,279 @@
+// SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB)
+/* Copyright (c) 2019, Mellanox Technologies. All rights reserved. See COPYING file */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <rdma/rdma_netlink.h>
+#include <netlink/netlink.h>
+#include <netlink/msg.h>
+#include <netlink/attr.h>
+
+/*
+ * Rename modes:
+ * NAME_FALLBACK - Try to name devices in the following order:
+ *                 by-pci -> by-guid -> kernel
+ * NAME_KERNEL - leave name as kernel provided
+ * NAME_PCI - based on PCI/slot/function location
+ * NAME_GUID - based on system image GUID
+ *
+ * The stable names are a combination of the device type technology and the rename mode.
+ * Infiniband - ib*
+ * RoCE - roce*
+ * iWARP - iw*
+ * OPA - opa*
+ *
+ * Example:
+ * NAME_PCI
+ *  pci = 0000:00:0c.4
+ *  Device type = IB
+ *  mlx5_0 -> ibp0s12f4
+ * NAME_GUID
+ *  GUID = 5254:00c0:fe12:3455
+ *  Device type = RoCE
+ *  mlx5_0 -> rocex525400c0fe123455
+ */
+
+/*
+ * Validation policy handed to nlmsg_parse() in get_nldata_cb().
+ *
+ * Every attribute the callback reads must be listed here, otherwise its
+ * payload size is not checked before nla_get_*() dereferences it.
+ * RDMA_NLDEV_ATTR_SYS_IMAGE_GUID is the GUID the callback actually reads;
+ * RDMA_NLDEV_ATTR_NODE_GUID is kept for compatibility.
+ */
+static struct nla_policy policy[RDMA_NLDEV_ATTR_MAX] = {
+	[RDMA_NLDEV_ATTR_DEV_INDEX] = { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
+	[RDMA_NLDEV_ATTR_NODE_GUID] = { .type = NLA_U64 },
+	[RDMA_NLDEV_ATTR_SYS_IMAGE_GUID] = { .type = NLA_U64 },
+	[RDMA_NLDEV_ATTR_LINK_TYPE] = { .type = NLA_NUL_STRING },
+};
+
+/* State shared between main(), the netlink callback and the naming helpers */
+struct data {
+	/* Device name to look up; main() sets it from argv[1] */
+	const char *curr;
+	/* Name prefix, taken from the RDMA_NLDEV_ATTR_LINK_TYPE attribute */
+	const char *prefix;
+	/* Taken from the RDMA_NLDEV_ATTR_SYS_IMAGE_GUID attribute */
+	uint64_t sys_image_guid;
+	/* New device name, allocated with asprintf() by by_pci()/by_guid() */
+	char *name;
+	/* Taken from RDMA_NLDEV_ATTR_DEV_INDEX; main() initializes it to -1 */
+	int idx;
+};
+
+/*
+ * Read the symlink at @path into @buf and return the last component of
+ * its target, or NULL if the link cannot be read.
+ */
+static char *link_basename(const char *path, char *buf, size_t size)
+{
+	ssize_t len;
+
+	len = readlink(path, buf, size - 1);
+	if (len < 0)
+		return NULL;
+
+	/* readlink() does not NUL-terminate what it writes */
+	buf[len] = '\0';
+	return basename(buf);
+}
+
+/*
+ * Build a name from the PCI location of the device: <prefix>p<bus>s<slot>f<fn>.
+ *
+ * On success d->name holds a string allocated with asprintf() and 0 is
+ * returned. A negative errno value is returned when the device is not a
+ * PCI device, its sysfs links cannot be read or parsed, or memory is
+ * exhausted.
+ */
+static int by_pci(struct data *d)
+{
+	unsigned int bus, slot, func;
+	char path[256];
+	char buf[256];
+	char *base;
+	int ret;
+
+	ret = snprintf(path, sizeof(path),
+		       "/sys/class/infiniband/%s/device/subsystem", d->curr);
+	if (ret < 0 || (size_t)ret >= sizeof(path))
+		return -EINVAL;
+
+	base = link_basename(path, buf, sizeof(buf));
+	if (!base || strcmp(base, "pci"))
+		/* Bail out virtual devices */
+		return -EINVAL;
+
+	/* Real devices */
+	ret = snprintf(path, sizeof(path), "/sys/class/infiniband/%s/device",
+		       d->curr);
+	if (ret < 0 || (size_t)ret >= sizeof(path))
+		return -EINVAL;
+
+	base = link_basename(path, buf, sizeof(buf));
+	if (!base)
+		return -EINVAL;
+
+	/*
+	 * base = 0000:00:0c.0, i.e. domain:bus:slot.function in hex.
+	 * The domain is not part of the name and is skipped.
+	 */
+	if (sscanf(base, "%*x:%x:%x.%x", &bus, &slot, &func) != 3)
+		return -EINVAL;
+
+	ret = asprintf(&d->name, "%sp%us%uf%u", d->prefix, bus, slot, func);
+	if (ret < 0) {
+		/* The pointer is undefined after a failed asprintf() */
+		d->name = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
+ * Build a name from the system image GUID: <prefix>x<16 hex digits>.
+ *
+ * On success d->name holds a string allocated with asprintf() and 0 is
+ * returned. Returns -EINVAL when the device has no GUID and -ENOMEM when
+ * the allocation fails; d->name is NULL in both cases.
+ */
+static int by_guid(struct data *d)
+{
+	uint64_t guid = d->sys_image_guid;
+	int ret;
+
+	if (!guid) {
+		/* virtual devices start without GUID */
+		d->name = NULL;
+		return -EINVAL;
+	}
+
+	/*
+	 * Extract the 16-bit groups with shifts, most significant first, so
+	 * the name does not depend on the byte order of the host.
+	 */
+	ret = asprintf(&d->name, "%sx%04x%04x%04x%04x", d->prefix,
+		       (unsigned int)((guid >> 48) & 0xffff),
+		       (unsigned int)((guid >> 32) & 0xffff),
+		       (unsigned int)((guid >> 16) & 0xffff),
+		       (unsigned int)(guid & 0xffff));
+	if (ret < 0) {
+		/* The pointer is undefined after a failed asprintf() */
+		d->name = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
+ * Ask the kernel to rename the device with index d->idx to d->name.
+ *
+ * Sends an RDMA_NLDEV_CMD_SET message carrying the device index and the
+ * new name over @nl. Returns 0 on success or a negative error code.
+ */
+static int device_rename(struct nl_sock *nl, struct data *d)
+{
+	struct nlmsghdr *hdr;
+	struct nl_msg *msg;
+	int ret;
+
+	msg = nlmsg_alloc();
+	if (!msg)
+		return -ENOMEM;
+
+	hdr = nlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ,
+			RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_SET),
+			0, 0);
+	if (!hdr) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, d->idx);
+	if (ret)
+		goto out;
+	ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, d->name);
+	if (ret)
+		goto out;
+
+	/* On success nl_send_auto() returns the number of bytes sent */
+	ret = nl_send_auto(nl, msg);
+out:
+	/* The message stays ours on every path, including a failed send */
+	nlmsg_free(msg);
+	return (ret < 0) ? ret : 0;
+}
+
+/*
+ * NL_CB_VALID callback: pick the device called d->curr out of the dump.
+ *
+ * @data is the struct data passed to nl_cb_set(). For the matching device
+ * the prefix, index and system image GUID are stored in it and NL_OK is
+ * returned. Other devices yield NL_SKIP; a message that fails to parse or
+ * lacks a required attribute, or an allocation failure, yields NL_STOP.
+ */
+static int get_nldata_cb(struct nl_msg *msg, void *data)
+{
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX] = {};
+	struct nlmsghdr *hdr = nlmsg_hdr(msg);
+	struct data *d = data;
+	char *prefix;
+	int ret;
+
+	ret = nlmsg_parse(hdr, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, policy);
+	if (ret < 0)
+		return NL_STOP;
+
+	if (!tb[RDMA_NLDEV_ATTR_DEV_NAME] || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] ||
+	    !tb[RDMA_NLDEV_ATTR_SYS_IMAGE_GUID] ||
+	    !tb[RDMA_NLDEV_ATTR_LINK_TYPE])
+		return NL_STOP;
+
+	if (strcmp(d->curr, nla_get_string(tb[RDMA_NLDEV_ATTR_DEV_NAME])))
+		return NL_SKIP;
+
+	/*
+	 * nla_get_string() points into the message payload, which libnl
+	 * releases once the callback returns, so keep a private copy. The
+	 * copy is not freed explicitly; the program exits shortly after.
+	 */
+	prefix = strdup(nla_get_string(tb[RDMA_NLDEV_ATTR_LINK_TYPE]));
+	if (!prefix)
+		return NL_STOP;
+
+	d->prefix = prefix;
+	d->idx = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+	d->sys_image_guid = nla_get_u64(tb[RDMA_NLDEV_ATTR_SYS_IMAGE_GUID]);
+	return NL_OK;
+}
+
+/*
+ * Dump all RDMA devices and let get_nldata_cb() fill @data for the one
+ * named data->curr.
+ *
+ * Returns 0 when the dump was received, or a negative error code when the
+ * request could not be sent, the callback set could not be set up, or
+ * receiving failed.
+ */
+static int get_nldata(struct nl_sock *nl, struct data *data)
+{
+	struct nl_cb *cb;
+	int ret;
+
+	ret = nl_send_simple(
+		nl, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET),
+		NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP, NULL, 0);
+	if (ret < 0)
+		return ret;
+
+	cb = nl_cb_alloc(NL_CB_DEFAULT);
+	if (!cb)
+		return -ENOMEM;
+
+	ret = nl_cb_set(cb, NL_CB_VALID, NL_CB_CUSTOM, get_nldata_cb,
+			(void *)data);
+	if (ret < 0)
+		goto out;
+
+	do {
+		ret = nl_recvmsgs(nl, cb);
+	} while (ret > 0);
+
+out:
+	/* Drop our reference so the callback set is released */
+	nl_cb_put(cb);
+	return ret;
+}
+
+
+/*
+ * Naming policies. Each one is a separate bit so that several can be
+ * combined into a fallback chain; NAME_ERROR marks an unknown policy.
+ */
+enum name_policy {
+	NAME_KERNEL = 1 << 0,
+	NAME_PCI = 1 << 1,
+	NAME_GUID = 1 << 2,
+	NAME_ERROR = 1 << 8
+};
+
+/*
+ * Translate a policy name given on the command line into name_policy bits.
+ *
+ * "NAME_FALLBACK" selects NAME_PCI | NAME_GUID. Any string that is not a
+ * known policy name yields NAME_ERROR.
+ */
+static int str2policy(const char *np)
+{
+	static const struct {
+		const char *str;
+		int bits;
+	} map[] = {
+		{ "NAME_KERNEL", NAME_KERNEL },
+		{ "NAME_PCI", NAME_PCI },
+		{ "NAME_GUID", NAME_GUID },
+		{ "NAME_FALLBACK", NAME_PCI | NAME_GUID },
+	};
+	size_t i;
+
+	for (i = 0; i < sizeof(map) / sizeof(map[0]); i++)
+		if (!strcmp(np, map[i].str))
+			return map[i].bits;
+
+	return NAME_ERROR;
+}
+
+/*
+ * Usage: rdma_rename <device> <policy>
+ *
+ * Looks up <device> over RDMA netlink, derives a stable name according to
+ * <policy> and asks the kernel to rename the device. The new name is
+ * printed on success. Exit status is 0 on success (or when the policy is
+ * NAME_KERNEL and nothing has to be done) and 1 on any failure.
+ */
+int main(int argc, const char *argv[])
+{
+	struct data d = { .idx = -1 };
+	struct nl_sock *nl;
+	int ret = -1;
+	int np;
+
+	if (argc != 3)
+		goto err;
+
+	np = str2policy(argv[2]);
+	if (np & NAME_ERROR)
+		goto err;
+
+	if (np & NAME_KERNEL)
+		/* Do nothing */
+		exit(0);
+
+	nl = nl_socket_alloc();
+	if (!nl)
+		goto err;
+
+	if (nl_connect(nl, NETLINK_RDMA))
+		goto out;
+
+	d.curr = argv[1];
+	ret = get_nldata(nl, &d);
+	if (ret || d.idx == -1) {
+		/* A device that was not found is a failure, not a success */
+		ret = -1;
+		goto out;
+	}
+
+	/*
+	 * Start from "no name yet" so that a policy which skips by_pci()
+	 * still reaches by_guid() instead of renaming to a NULL name.
+	 */
+	ret = -1;
+	if (np & NAME_PCI)
+		ret = by_pci(&d);
+	if (ret && (np & NAME_GUID))
+		ret = by_guid(&d);
+	if (ret)
+		goto out;
+
+	ret = device_rename(nl, &d);
+	if (!ret)
+		printf("%s\n", d.name);
+	/* The name is ours whether or not the rename succeeded */
+	free(d.name);
+
+out:
+	nl_socket_free(nl);
+err:
+	ret = (ret) ? 1 : 0;
+	exit(ret);
+}
-- 
2.19.1




[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Photo]     [Yosemite News]     [Yosemite Photos]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux