From: Chiara Meiohas <cmeiohas@xxxxxxxxxx> Introduce a new command for RDMA event monitoring. This patch adds a new attribute "event_type" which describes the event received. Add a new NETLINK_RDMA multicast group; processes listening to this multicast group receive RDMA events. The event types supported are IB device registration/unregistration and net device attachment/detachment. Example output of rdma monitor and the commands which trigger the events: $ rdma monitor $ rmmod mlx5_ib [UNREGISTER] dev 3 rocep8s0f1 [UNREGISTER] dev 2 rocep8s0f0 $ modprobe mlx5_ib [REGISTER] dev 4 mlx5_0 [NETDEV_ATTACH] dev 4 mlx5_0 port 1 netdev 4 eth2 [REGISTER] dev 5 mlx5_1 [NETDEV_ATTACH] dev 5 mlx5_1 port 1 netdev 5 eth3 $ devlink dev eswitch set pci/0000:08:00.0 mode switchdev [UNREGISTER] dev 4 rocep8s0f0 [REGISTER] dev 6 mlx5_0 [NETDEV_ATTACH] dev 6 mlx5_0 port 30 netdev 4 eth2 $ echo 4 > /sys/class/net/eth2/device/sriov_numvfs [NETDEV_ATTACH] dev 6 rdmap8s0f0 port 2 netdev 7 eth4 [NETDEV_ATTACH] dev 6 rdmap8s0f0 port 3 netdev 8 eth5 [NETDEV_ATTACH] dev 6 rdmap8s0f0 port 4 netdev 9 eth6 [NETDEV_ATTACH] dev 6 rdmap8s0f0 port 5 netdev 10 eth7 [REGISTER] dev 7 mlx5_0 [NETDEV_ATTACH] dev 7 mlx5_0 port 1 netdev 11 eth8 [REGISTER] dev 8 mlx5_0 [NETDEV_ATTACH] dev 8 mlx5_0 port 1 netdev 12 eth9 [REGISTER] dev 9 mlx5_0 [NETDEV_ATTACH] dev 9 mlx5_0 port 1 netdev 13 eth10 [REGISTER] dev 10 mlx5_0 [NETDEV_ATTACH] dev 10 mlx5_0 port 1 netdev 14 eth11 $ echo 0 > /sys/class/net/eth2/device/sriov_numvfs [UNREGISTER] dev 7 rocep8s0f0v0 [UNREGISTER] dev 8 rocep8s0f0v1 [UNREGISTER] dev 9 rocep8s0f0v2 [UNREGISTER] dev 10 rocep8s0f0v3 [NETDEV_DETACH] dev 6 rdmap8s0f0 port 2 [NETDEV_DETACH] dev 6 rdmap8s0f0 port 3 [NETDEV_DETACH] dev 6 rdmap8s0f0 port 4 [NETDEV_DETACH] dev 6 rdmap8s0f0 port 5 Signed-off-by: Chiara Meiohas <cmeiohas@xxxxxxxxxx> Reviewed-by: Mark Bloch <mbloch@xxxxxxxxxx> --- include/mnl_utils.h | 1 + lib/mnl_utils.c | 5 + man/man8/rdma-monitor.8 | 51 ++++++++++ man/man8/rdma.8 
| 7 +- rdma/Makefile | 3 +- rdma/monitor.c | 207 ++++++++++++++++++++++++++++++++++++++++ rdma/rdma.c | 3 +- rdma/rdma.h | 1 + rdma/utils.c | 1 + 9 files changed, 276 insertions(+), 3 deletions(-) create mode 100644 man/man8/rdma-monitor.8 create mode 100644 rdma/monitor.c diff --git a/include/mnl_utils.h b/include/mnl_utils.h index 76fe1dfe..0ddf2932 100644 --- a/include/mnl_utils.h +++ b/include/mnl_utils.h @@ -24,6 +24,7 @@ int mnlu_gen_socket_sndrcv(struct mnlu_gen_socket *nlg, const struct nlmsghdr *n mnl_cb_t data_cb, void *data); struct mnl_socket *mnlu_socket_open(int bus); +int mnl_add_nl_group(struct mnl_socket *nl, unsigned int group); struct nlmsghdr *mnlu_msg_prepare(void *buf, uint32_t nlmsg_type, uint16_t flags, void *extra_header, size_t extra_header_size); int mnlu_socket_recv_run(struct mnl_socket *nl, unsigned int seq, void *buf, size_t buf_size, diff --git a/lib/mnl_utils.c b/lib/mnl_utils.c index 6c8f527e..5f6671bf 100644 --- a/lib/mnl_utils.c +++ b/lib/mnl_utils.c @@ -35,6 +35,11 @@ err_bind: return NULL; } +int mnl_add_nl_group(struct mnl_socket *nl, unsigned int group) +{ + return mnl_socket_bind(nl, group, MNL_SOCKET_AUTOPID); +} + struct nlmsghdr *mnlu_msg_prepare(void *buf, uint32_t nlmsg_type, uint16_t flags, void *extra_header, size_t extra_header_size) { diff --git a/man/man8/rdma-monitor.8 b/man/man8/rdma-monitor.8 new file mode 100644 index 00000000..d445cba0 --- /dev/null +++ b/man/man8/rdma-monitor.8 @@ -0,0 +1,51 @@ +.TH RDMA\-MONITOR 8 "22 Jul 2024" "iproute2" "Linux" +.SH NAME +rdma-monitor \- RDMA events monitoring +.SH SYNOPSIS +.sp +.ad l +.in +8 +.ti -8 +.B rdma +.RI "[ " OPTIONS " ]" +.B monitor +.RI " { " help " }" +.sp + +.ti -8 +.IR OPTIONS " := { " +\fB\-V\fR[\fIersion\fR] } + +.ti -8 +.B rdma monitor + +.ti -8 +.B rdma monitor help + +.SH "DESCRIPTION" +.SS rdma monitor - utility can monitor RDMA device events on all RDMA devices. +.PP +.B rdma +opens an RDMA Netlink socket, listens on it and dumps the event info. 
+ +The event types supported are RDMA device registration/unregistration +and net device attachment/detachment. + +.SH "EXAMPLES" +.PP +rdma monitor +.RS 4 +Listen for events of all RDMA devices +.RE +.PP + +.SH SEE ALSO +.BR rdma (8), +.BR rdma-link (8), +.BR rdma-resource (8), +.BR rdma-system (8), +.BR rdma-statistic (8), +.br + +.SH AUTHOR +Chiara Meiohas <cmeiohas@xxxxxxxxxx> diff --git a/man/man8/rdma.8 b/man/man8/rdma.8 index 5088b9ec..df86284d 100644 --- a/man/man8/rdma.8 +++ b/man/man8/rdma.8 @@ -19,7 +19,7 @@ rdma \- RDMA tool .ti -8 .IR OBJECT " := { " -.BR dev " | " link " | " resource " | " system " | " statistic " }" +.BR dev " | " link " | " resource " | " system " | " statistic " | " monitor " }" .sp .ti -8 @@ -94,6 +94,10 @@ character. .B statistic - RDMA counter statistic related. +.TP +.B monitor +- RDMA events monitor + .PP The names of all objects may be written in full or abbreviated form, for example @@ -133,6 +137,7 @@ Exit status is 0 if command was successful or a positive integer upon failure. 
.BR rdma-resource (8), .BR rdma-system (8), .BR rdma-statistic (8), +.BR rdma-monitor (8), .br .SH REPORTING BUGS diff --git a/rdma/Makefile b/rdma/Makefile index 37d904a7..ed3c1c1c 100644 --- a/rdma/Makefile +++ b/rdma/Makefile @@ -4,7 +4,8 @@ include ../config.mk CFLAGS += -I./include/uapi/ RDMA_OBJ = rdma.o utils.o dev.o link.o res.o res-pd.o res-mr.o res-cq.o \ - res-cmid.o res-qp.o sys.o stat.o stat-mr.o res-ctx.o res-srq.o + res-cmid.o res-qp.o sys.o stat.o stat-mr.o res-ctx.o res-srq.o \ + monitor.o TARGETS += rdma diff --git a/rdma/monitor.c b/rdma/monitor.c new file mode 100644 index 00000000..8c14d575 --- /dev/null +++ b/rdma/monitor.c @@ -0,0 +1,207 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * monitor.c RDMA tool + * Authors: Chiara Meiohas <cmeiohas@xxxxxxxxxx> + */ + +#include "rdma.h" +#include "utils.h" + +static int mon_is_supported_cb(const struct nlmsghdr *nlh, void *data) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX] = {}; + uint8_t *is_sup = data; + + mnl_attr_parse(nlh, 0, rd_attr_cb, tb); + if (tb[RDMA_NLDEV_SYS_ATTR_MONITOR_MODE]) + *is_sup = mnl_attr_get_u8(tb[RDMA_NLDEV_SYS_ATTR_MONITOR_MODE]); + + return MNL_CB_OK; +} + +static int mon_is_supported(struct rd *rd, uint8_t *is_sup) +{ + uint32_t seq; + int ret; + + *is_sup = 0; + rd_prepare_msg(rd, RDMA_NLDEV_CMD_SYS_GET, + &seq, (NLM_F_REQUEST | NLM_F_ACK)); + ret = rd_send_msg(rd); + if (ret) + return ret; + + return rd_recv_msg(rd, mon_is_supported_cb, is_sup, seq); +} + +static void mon_print_event_type(struct nlattr **tb) +{ + const char *const event_types_str[] = { + [RDMA_REGISTER_EVENT] = "[REGISTER]", + [RDMA_UNREGISTER_EVENT] = "[UNREGISTER]", + [RDMA_NETDEV_ATTACH_EVENT] = "[NETDEV_ATTACH]", + [RDMA_NETDEV_DETACH_EVENT] = "[NETDEV_DETACH]", + }; + enum rdma_nl_notify_event_type etype; + char unknown_type[32]; + + if (!tb[RDMA_NLDEV_ATTR_EVENT_TYPE]) + return; + + etype = mnl_attr_get_u8(tb[RDMA_NLDEV_ATTR_EVENT_TYPE]); + if (etype < ARRAY_SIZE(event_types_str) && 
event_types_str[etype]) { + print_string(PRINT_ANY, "event_type", "%s\t", + event_types_str[etype]); + } else { + snprintf(unknown_type, sizeof(unknown_type), "[UNKNOWN 0x%02x]", + etype); + print_string(PRINT_ANY, "event_type", "%s\t", unknown_type); + } +} + +static int mon_print_dev(struct nlattr **tb) +{ + const char *name; + uint32_t idx; + + if (tb[RDMA_NLDEV_ATTR_DEV_INDEX]) { + idx = mnl_attr_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + print_uint(PRINT_ANY, "rdma_index", "dev %u", idx); + } + + if(tb[RDMA_NLDEV_ATTR_DEV_NAME]) { + name = mnl_attr_get_str(tb[RDMA_NLDEV_ATTR_DEV_NAME]); + print_string(PRINT_ANY, "rdma_dev", " %s", name); + } + + return 0; +} + +static void mon_print_port_idx(struct nlattr **tb) +{ + uint32_t port; + + if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) { + port = mnl_attr_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + print_uint(PRINT_ANY, "port", " port %u", port); + } +} + +static void mon_print_netdev(struct nlattr **tb) +{ + uint32_t netdev_idx; + const char *name; + + if (tb[RDMA_NLDEV_ATTR_NDEV_INDEX]) { + netdev_idx = mnl_attr_get_u32(tb[RDMA_NLDEV_ATTR_NDEV_INDEX]); + print_uint(PRINT_ANY, "netdev_idx", " netdev %u", netdev_idx); + } + + if(tb[RDMA_NLDEV_ATTR_NDEV_NAME]) { + name = mnl_attr_get_str(tb[RDMA_NLDEV_ATTR_NDEV_NAME]); + print_string(PRINT_ANY, "netdev_name", " %s", name); + } +} + +static int mon_show_cb(const struct nlmsghdr *nlh, void *data) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX + 1] = {}; + + mnl_attr_parse(nlh, 0, rd_attr_cb, tb); + if (!tb[RDMA_NLDEV_ATTR_EVENT_TYPE]) + return MNL_CB_ERROR; + + open_json_object(NULL); + + mon_print_event_type(tb); + mon_print_dev(tb); + mon_print_port_idx(tb); + mon_print_netdev(tb); + + close_json_object(); + newline(); + fflush(stdout); + + return MNL_CB_OK; +} + +static int mon_show(struct rd* rd) +{ + unsigned int groups = 0; + uint8_t is_sup = 0; + int one = 1; + char *buf; + int err; + + err = mon_is_supported(rd, &is_sup); + if (err) { + pr_err("Failed to check if RDMA 
monitoring is supported\n"); + return err; + } + + if (!is_sup) { + pr_err("RDMA monitoring is not supported by the kernel\n"); + return -ENOENT; + } + + buf = malloc(MNL_SOCKET_BUFFER_SIZE); + if (!buf) { + pr_err("Buffer allocation failed\n"); + return -ENOMEM; + } + + rd->nl = mnl_socket_open(NETLINK_RDMA); + if (!rd->nl) { + pr_err("Failed to open NETLINK_RDMA socket. Error: %s\n", + strerror(errno)); + err = -ENODEV; + goto err_free; + } + mnl_socket_setsockopt(rd->nl, NETLINK_CAP_ACK, &one, sizeof(one)); + mnl_socket_setsockopt(rd->nl, NETLINK_EXT_ACK, &one, sizeof(one)); + + groups |= nl_mgrp(RDMA_NL_GROUP_NOTIFY); + + err = mnl_add_nl_group(rd->nl, groups); + if (err < 0) { + pr_err("Failed to add NETLINK_RDMA multicast group. Error: %s\n", + strerror(errno)); + goto err_close; + } + new_json_obj(json); + + err = mnlu_socket_recv_run(rd->nl, 0, buf, MNL_SOCKET_BUFFER_SIZE, + mon_show_cb, rd); + if (err) { + pr_err("Failed to listen to rdma socket\n"); + goto err_free_json; + } + + return 0; + +err_free_json: + delete_json_obj(); +err_close: + mnl_socket_close(rd->nl); +err_free: + free(buf); + return err; +} + +static int mon_help(struct rd *rd) +{ + pr_out("Usage: rdma monitor [ -j ]\n"); + return 0; +} + +int cmd_mon(struct rd *rd) +{ + const struct rd_cmd cmds[] = { + { NULL, mon_show }, + { "help", mon_help }, + { 0 } + }; + + return rd_exec_cmd(rd, cmds, "mon command"); +} + diff --git a/rdma/rdma.c b/rdma/rdma.c index 131c6b2a..253ac58b 100644 --- a/rdma/rdma.c +++ b/rdma/rdma.c @@ -15,7 +15,7 @@ static void help(char *name) { pr_out("Usage: %s [ OPTIONS ] OBJECT { COMMAND | help }\n" " %s [ -f[orce] ] -b[atch] filename\n" - "where OBJECT := { dev | link | resource | system | statistic | help }\n" + "where OBJECT := { dev | link | resource | monitor | system | statistic | help }\n" " OPTIONS := { -V[ersion] | -d[etails] | -j[son] | -p[retty] | -r[aw]}\n", name, name); } @@ -35,6 +35,7 @@ static int rd_cmd(struct rd *rd, int argc, char **argv) { 
"resource", cmd_res }, { "system", cmd_sys }, { "statistic", cmd_stat }, + { "monitor", cmd_mon }, { 0 } }; diff --git a/rdma/rdma.h b/rdma/rdma.h index d224ec57..fb037bcf 100644 --- a/rdma/rdma.h +++ b/rdma/rdma.h @@ -98,6 +98,7 @@ int cmd_link(struct rd *rd); int cmd_res(struct rd *rd); int cmd_sys(struct rd *rd); int cmd_stat(struct rd *rd); +int cmd_mon(struct rd* rd); int rd_exec_cmd(struct rd *rd, const struct rd_cmd *c, const char *str); int rd_exec_dev(struct rd *rd, int (*cb)(struct rd *rd)); int rd_exec_require_dev(struct rd *rd, int (*cb)(struct rd *rd)); diff --git a/rdma/utils.c b/rdma/utils.c index 4d3803b5..bc104e0f 100644 --- a/rdma/utils.c +++ b/rdma/utils.c @@ -477,6 +477,7 @@ static const enum mnl_attr_data_type nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_SYS_ATTR_PRIVILEGED_QKEY_MODE] = MNL_TYPE_U8, [RDMA_NLDEV_ATTR_DEV_TYPE] = MNL_TYPE_U8, [RDMA_NLDEV_ATTR_PARENT_NAME] = MNL_TYPE_STRING, + [RDMA_NLDEV_ATTR_EVENT_TYPE] = MNL_TYPE_U8, }; static int rd_attr_check(const struct nlattr *attr, int *typep) -- 2.44.0