From: Muneendra <muneendra.kumar@xxxxxxxxxxxx> This patch adds the following: 1. librfc provider library for rdma-core, which acts as the user-level interface for the rdma_rfc kernel module. 2. rfc_cfg utility, which helps in loading and configuring the rdma_rfc kernel module. This patch is inspired by librxe, which provides the library for the Soft RoCE kernel module. The corresponding kernel module (rdma_rfc) changes have been sent for review; the details are below. https://marc.info/?l=linux-rdma&m=152404459816049&w=2 Signed-off-by: Muneendra <muneendra.kumar@xxxxxxxxxxxx> --- CMakeLists.txt | 2 + kernel-headers/CMakeLists.txt | 2 + kernel-headers/rdma/rdma_user_rfc.h | 179 +++++++ providers/rfc/CMakeLists.txt | 8 + providers/rfc/man/CMakeLists.txt | 4 + providers/rfc/man/rfc.7 | 77 +++ providers/rfc/man/rfc_cfg.8 | 70 +++ providers/rfc/rfc-abi.h | 53 +++ providers/rfc/rfc.c | 926 ++++++++++++++++++++++++++++++++++++ providers/rfc/rfc.h | 129 +++++ providers/rfc/rfc_cfg.in | 674 ++++++++++++++++++++++++++ providers/rfc/rfc_queue.h | 128 +++++ 12 files changed, 2252 insertions(+) create mode 100644 kernel-headers/rdma/rdma_user_rfc.h create mode 100644 providers/rfc/CMakeLists.txt create mode 100644 providers/rfc/man/CMakeLists.txt create mode 100644 providers/rfc/man/rfc.7 create mode 100644 providers/rfc/man/rfc_cfg.8 create mode 100644 providers/rfc/rfc-abi.h create mode 100644 providers/rfc/rfc.c create mode 100644 providers/rfc/rfc.h create mode 100755 providers/rfc/rfc_cfg.in create mode 100644 providers/rfc/rfc_queue.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 10a687c..0256bbd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -502,6 +502,8 @@ add_subdirectory(providers/hfi1verbs) add_subdirectory(providers/ipathverbs) add_subdirectory(providers/rxe) add_subdirectory(providers/rxe/man) +add_subdirectory(providers/rfc) +add_subdirectory(providers/rfc/man) # Binaries add_subdirectory(ibacm) # NO SPARSE diff --git a/kernel-headers/CMakeLists.txt 
b/kernel-headers/CMakeLists.txt index 3a526b9..5d280e4 100644 --- a/kernel-headers/CMakeLists.txt +++ b/kernel-headers/CMakeLists.txt @@ -22,6 +22,7 @@ publish_internal_headers(rdma rdma/rdma_user_ioctl.h rdma/rdma_user_ioctl_cmds.h rdma/rdma_user_rxe.h + rdma/rdma_user_rfc.h rdma/vmw_pvrdma-abi.h ) @@ -69,6 +70,7 @@ rdma_kernel_provider_abi( rdma/ocrdma-abi.h rdma/qedr-abi.h rdma/rdma_user_rxe.h + rdma/rdma_user_rfc.h rdma/vmw_pvrdma-abi.h ) diff --git a/kernel-headers/rdma/rdma_user_rfc.h b/kernel-headers/rdma/rdma_user_rfc.h new file mode 100644 index 0000000..8c6b10d --- /dev/null +++ b/kernel-headers/rdma/rdma_user_rfc.h @@ -0,0 +1,179 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RDMA_USER_RXE_H +#define RDMA_USER_RXE_H + +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/in6.h> + +union rfc_gid { + __u8 raw[16]; + struct { + __be64 subnet_prefix; + __be64 interface_id; + } global; +}; + +struct rfc_global_route { + union rfc_gid dgid; + __u32 flow_label; + __u8 sgid_index; + __u8 hop_limit; + __u8 traffic_class; +}; + +struct rfc_av { + __u8 port_num; + __u8 network_type; + __u16 reserved1; + __u32 reserved2; + struct rfc_global_route grh; + union { + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; + } sgid_addr, dgid_addr; +}; + +struct rfc_send_wr { + __aligned_u64 wr_id; + __u32 num_sge; + __u32 opcode; + __u32 send_flags; + union { + __be32 imm_data; + __u32 invalidate_rkey; + } ex; + union { + struct { + __aligned_u64 remote_addr; + __u32 rkey; + __u32 reserved; + } rdma; + struct { + __aligned_u64 remote_addr; + __aligned_u64 compare_add; + __aligned_u64 swap; + __u32 rkey; + __u32 reserved; + } atomic; + struct { + __u32 remote_qpn; + __u32 remote_qkey; + __u16 pkey_index; + } ud; + /* reg is only used by the kernel and is not part of the uapi */ + struct { + union { + struct ib_mr *mr; + __aligned_u64 reserved; + }; + __u32 key; + __u32 access; + } reg; + } wr; +}; + +struct rfc_sge { + __aligned_u64 addr; + __u32 length; + __u32 lkey; +}; + +struct mminfo { + __aligned_u64 offset; + __u32 size; + __u32 pad; +}; + +struct rfc_dma_info { + __u32 length; + __u32 resid; + __u32 cur_sge; + __u32 num_sge; + __u32 sge_offset; + __u32 reserved; + union { + __u8 inline_data[0]; + struct rfc_sge sge[0]; + }; +}; + +struct rfc_send_wqe { + struct rfc_send_wr wr; + struct rfc_av av; + __u32 
status; + __u32 state; + __aligned_u64 iova; + __u32 mask; + __u32 first_psn; + __u32 last_psn; + __u32 ack_length; + __u32 ssn; + __u32 has_rd_atomic; + struct rfc_dma_info dma; +}; + +struct rfc_recv_wqe { + __aligned_u64 wr_id; + __u32 num_sge; + __u32 padding; + struct rfc_dma_info dma; +}; + +struct rfc_create_cq_resp { + struct mminfo mi; +}; + +struct rfc_resize_cq_resp { + struct mminfo mi; +}; + +struct rfc_create_qp_resp { + struct mminfo rq_mi; + struct mminfo sq_mi; +}; + +struct rfc_create_srq_resp { + struct mminfo mi; + __u32 srq_num; + __u32 reserved; +}; + +struct rfc_modify_srq_cmd { + __aligned_u64 mmap_info_addr; +}; + +#endif /* RDMA_USER_RXE_H */ diff --git a/providers/rfc/CMakeLists.txt b/providers/rfc/CMakeLists.txt new file mode 100644 index 0000000..3123311 --- /dev/null +++ b/providers/rfc/CMakeLists.txt @@ -0,0 +1,8 @@ +rdma_provider(rfc + rfc.c + ) +rdma_subst_install(FILES "rfc_cfg.in" + RENAME "rfc_cfg" + DESTINATION "${CMAKE_INSTALL_BINDIR}" + PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ OWNER_EXECUTE GROUP_EXECUTE WORLD_EXECUTE + ) diff --git a/providers/rfc/man/CMakeLists.txt b/providers/rfc/man/CMakeLists.txt new file mode 100644 index 0000000..145855c --- /dev/null +++ b/providers/rfc/man/CMakeLists.txt @@ -0,0 +1,4 @@ +rdma_man_pages( + rfc.7 + rfc_cfg.8 +) diff --git a/providers/rfc/man/rfc.7 b/providers/rfc/man/rfc.7 new file mode 100644 index 0000000..594d6cd --- /dev/null +++ b/providers/rfc/man/rfc.7 @@ -0,0 +1,77 @@ +.\" -*- nroff -*- +.\" +.TH RFC 7 2011-06-29 1.0.0 +.SH "NAME" +rfc \- Software RDMA over FC +.SH "SYNOPSIS" +\fBmodprobe rdma_rfc\fR +.br +This is usually performed by a configuration utility (see \fBrfc_cfg\fR(8).) + +.SH "DESCRIPTION" +The rdma_rfc kernel module provides a software implementation of RDMA over +Fibre channel. It encapsulates RDMA payloads in FC-NVMe READ/WRITE requests +and sends them over Fibre channel fabrics. 
+The InfiniBand (IB) Base Transport Header (BTH) is encapsulated in the FC-NVMe +header. + +Once a RFC instance has been created, communicating via RFC is the same as +communicating via any OFED compatible InfiniBand HCA, albeit in some cases with +addressing implications. + +Verbs applications written over IB verbs should work seamlessly except for the +following constraints in the current release: +1. Partitioning is not supported. The RFC module ignores any partition key in the BTH. +2. Inline and Immediate data size >= 64KB is not supported. +3. Only Reliable Connection (RC) and Unreliable Datagram (UD) type queue pairs + are supported. + +.SH "FILES" +.TP +\fB/sys/class/infiniband/rfc[0,1,...]\fR +Directory that holds RDMA device information. The format is the same as other RDMA devices. + +.TP +\fB/sys/module/rdma_rfc_net/parameters/add\fR +Write only file used by \fBrfc_cfg(8)\fR to add new RFC devices to existing Ethernet devices. + +.TP +\fB/sys/module/rdma_rfc_net/parameters/remove\fR +Write only file used by \fBrfc_cfg(8)\fR to remove RFC devices. + +.TP +\fB/sys/module/rdma_rfc/parameters/max_qp\fR +Read/Write file that sets a limit on the number of QPs allowed per RFC device. + +.TP +\fB/sys/module/rdma_rfc/parameters/max_qp_wr\fR +Read/Write file that sets a limit on the number of WRs per QP allowed per RFC device. + +.TP +\fB/sys/module/rdma_rfc/parameters/max_mr\fR +Read/Write file that sets a limit on the number of MRs allowed per RFC device. + +.TP +\fB/sys/module/rdma_rfc/parameters/max_fmr\fR +Read/Write file that sets a limit on the number of FMRs allowed per RFC device. + +.TP +\fB/sys/module/rdma_rfc/parameters/max_cq\fR +Read/Write file that sets a limit on the number of CQs allowed per RFC device. + +.TP +\fB/sys/module/rdma_rfc/parameters/max_log_cqe\fR +Read/Write file that sets a limit on the log base 2 of the number of CQEs per CQ allowed per RFC device. 
+ +.TP +\fB/sys/module/rdma_rfc/parameters/max_inline_data\fR +Read/Write file that sets a limit on the maximum amount of inline data per WR allowed per RFC device. + +The above configuration parameters only affect a new RFC instance when it is created, not afterwards. + +.SH "SEE ALSO" +.BR rfc_cfg (8), +.BR verbs (7) + +.SH "AUTHORS" +Written by Muneendra Kumar, Anand Sundaram, Amit Tyagi at Broadcom Inc. diff --git a/providers/rfc/man/rfc_cfg.8 b/providers/rfc/man/rfc_cfg.8 new file mode 100644 index 0000000..8c12bbf --- /dev/null +++ b/providers/rfc/man/rfc_cfg.8 @@ -0,0 +1,70 @@ +.\" -*- nroff -*- +.\" +.TH RFC_CFG 8 2011-06-29 1.0.0 +.SH "NAME" +rfc_cfg \- rfc configuration tool for RFC (Soft RFC) +.SH "SYNOPSIS" +\fBrfc_cfg [status]\fR +.br +\fBrfc_cfg start\fR [\fB\-p\fR \fIproto\fR] +.br +\fBrfc_cfg stop\fR +.br +\fBrfc_cfg persistent\fR +.br +\fBrfc_cfg add\fR [\fB\-n\fR] \fIethN\fR +.br +\fBrfc_cfg remove\fR [\fB\-n\fR] \fIethN\fR|\fIrfcN\fR +.br +.SH "DESCRIPTION" +rfc_cfg is the configuration tool for the RFC software implementation of the RFC protocol. + +The RFC kernel modules are loaded, configured, reconfigured and unloaded via the various rfc_cfg command options, documented below. + +.SH "PARAMETERS" +.TP +\fIethN\fR +Network device name as listed in /sys/class/net. Only RFC Ethernet devices are supported; e.g. rfcnet0. + +.TP +\fIrfcN\fR +RFC device name as listed in /sys/class/infiniband/. Examples are rfc0 or rfc1. + +.SH "COMMANDS" +.TP +[\fBstatus\fR] +The \fBstatus\fR command prints a table of information on available Ethernet devices and configured RFC instances. The status display is the default if no options are provided. + +.TP +\fBstart\fR [\fB\-p\fR \fIproto\fR] +The \fBstart\fR command loads the RFC modules and configures any persistent instances. + +.TP +\fBstop\fR +The \fBstop\fR command unconfigures all RFC instances and attempts to unload the kernel modules. 
+ +.TP +\fBpersistent\fR +The \fBpersistent\fR command prints the list of Ethernet devices for which a RFC instance is persistently configured. + +.TP +\fBadd\fR [\fB\-n\fR] \fIethN\fR +The \fBadd\fR command will only configure a RFC instance on RFC Ethernet device \fIrfcnetN\fR (e.g. rfcnet0). The RFC modules must have already been loaded via \fBrfc_cfg start\fR. + +The default behavior is to add \fIrfcnetN\fR to a file of persistent configurations and the same RFC device will be configured the next time that \fBrfc_cfg start\fR is run. If the \fB-n\fR option is included the device is not added to the persistence file. + +.TP +\fBremove\fR [\fB\-n\fR] \fIethN\fR|\fIrfcN\fR +The \fBremove\fR command will remove the specified RFC instance. The parameter must match a currently active rfcnetN or rfcN name. + +If the \fB-n\fR option is included, the RFC device will be removed but will remain in the persistent state, so it will be recreated the next time that \fBrfc_cfg start\fR is run. + +.SH "FILES" +.TP +\fB[PREFIX]/etc/rfc.conf\fR +RFC configuration file. Contains the list of persistent RFC instances. All persistent RFC instances can be removed by deleting this file (note this will take effect on the next "rfc_cfg start" -- to remove actively configured instances, you must "rfc_cfg stop"). + +.SH "SEE ALSO" +.BR rfc (7) +.SH "AUTHORS" +Written by Muneendra Kumar, Anand Sundaram, Amit Tyagi at Broadcom Inc. diff --git a/providers/rfc/rfc-abi.h b/providers/rfc/rfc-abi.h new file mode 100644 index 0000000..a36a9ef --- /dev/null +++ b/providers/rfc/rfc-abi.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2009 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2009 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#ifndef RXE_ABI_H +#define RXE_ABI_H + +#include <infiniband/kern-abi.h> +#include <rdma/rdma_user_rfc.h> +#include <kernel-abi/rdma_user_rfc.h> + +DECLARE_DRV_CMD(urfc_create_cq, IB_USER_VERBS_CMD_CREATE_CQ, + empty, rfc_create_cq_resp); +DECLARE_DRV_CMD(urfc_create_qp, IB_USER_VERBS_CMD_CREATE_QP, + empty, rfc_create_qp_resp); +DECLARE_DRV_CMD(urfc_create_srq, IB_USER_VERBS_CMD_CREATE_SRQ, + empty, rfc_create_srq_resp); +DECLARE_DRV_CMD(urfc_modify_srq, IB_USER_VERBS_CMD_MODIFY_SRQ, + rfc_modify_srq_cmd, empty); +DECLARE_DRV_CMD(urfc_resize_cq, IB_USER_VERBS_CMD_RESIZE_CQ, + empty, rfc_resize_cq_resp); + +#endif /* RXE_ABI_H */ diff --git a/providers/rfc/rfc.c b/providers/rfc/rfc.c new file mode 100644 index 0000000..0611bc1 --- /dev/null +++ b/providers/rfc/rfc.c @@ -0,0 +1,926 @@ +/* + * Copyright (c) 2009 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2009 System Fabric Works, Inc. All rights reserved. + * Copyright (C) 2006-2007 QLogic Corporation, All rights reserved. + * Copyright (c) 2005. PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <config.h> + +#include <endian.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <pthread.h> +#include <netinet/in.h> +#include <sys/mman.h> +#include <errno.h> + +#include <endian.h> +#include <pthread.h> +#include <stddef.h> + +#include <infiniband/driver.h> +#include <infiniband/verbs.h> + +#include "rfc_queue.h" +#include "rfc-abi.h" +#include "rfc.h" + +static const struct verbs_match_ent hca_table[] = { + /* FIXME: rfc needs a more reliable way to detect the rfc device */ + VERBS_NAME_MATCH("rfc", NULL), + {}, +}; + +static int rfc_query_device(struct ibv_context *context, + struct ibv_device_attr *attr) +{ + struct ibv_query_device cmd; + uint64_t raw_fw_ver; + unsigned major, minor, sub_minor; + int ret; + + ret = ibv_cmd_query_device(context, attr, &raw_fw_ver, + &cmd, sizeof cmd); + if (ret) + return ret; + + major = (raw_fw_ver >> 32) & 0xffff; + minor = (raw_fw_ver >> 16) & 0xffff; + sub_minor = raw_fw_ver & 0xffff; + + snprintf(attr->fw_ver, sizeof attr->fw_ver, + "%d.%d.%d", major, minor, sub_minor); + + return 0; +} + +static int rfc_query_port(struct ibv_context *context, uint8_t port, + struct ibv_port_attr *attr) +{ + struct ibv_query_port cmd; + + return ibv_cmd_query_port(context, port, attr, &cmd, sizeof cmd); +} + +static struct ibv_pd *rfc_alloc_pd(struct ibv_context *context) +{ + struct ibv_alloc_pd cmd; + struct ib_uverbs_alloc_pd_resp resp; + struct ibv_pd *pd; + + pd = malloc(sizeof *pd); 
+ if (!pd) + return NULL; + + if (ibv_cmd_alloc_pd(context, pd, &cmd, sizeof cmd, &resp, sizeof resp)) { + free(pd); + return NULL; + } + + return pd; +} + +static int rfc_dealloc_pd(struct ibv_pd *pd) +{ + int ret; + + ret = ibv_cmd_dealloc_pd(pd); + if (!ret) + free(pd); + + return ret; +} + +static struct ibv_mr *rfc_reg_mr(struct ibv_pd *pd, void *addr, size_t length, + int access) +{ + struct ibv_mr *mr; + struct ibv_reg_mr cmd; + struct ib_uverbs_reg_mr_resp resp; + int ret; + + mr = malloc(sizeof *mr); + if (!mr) { + return NULL; + } + + ret = ibv_cmd_reg_mr(pd, addr, length, (uintptr_t)addr, access, mr, + &cmd, sizeof cmd, &resp, sizeof resp); + if (ret) { + free(mr); + return NULL; + } + + return mr; +} + +static int rfc_dereg_mr(struct ibv_mr *mr) +{ + int ret; + + ret = ibv_cmd_dereg_mr(mr); + if (ret) + return ret; + + free(mr); + return 0; +} + +static struct ibv_cq *rfc_create_cq(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector) +{ + struct rfc_cq *cq; + struct urfc_create_cq_resp resp; + int ret; + + cq = malloc(sizeof *cq); + if (!cq) { + return NULL; + } + + ret = ibv_cmd_create_cq(context, cqe, channel, comp_vector, + &cq->ibv_cq, NULL, 0, + &resp.ibv_resp, sizeof resp); + if (ret) { + free(cq); + return NULL; + } + + cq->queue = mmap(NULL, resp.mi.size, PROT_READ | PROT_WRITE, MAP_SHARED, + context->cmd_fd, resp.mi.offset); + if ((void *)cq->queue == MAP_FAILED) { + ibv_cmd_destroy_cq(&cq->ibv_cq); + free(cq); + return NULL; + } + + cq->mmap_info = resp.mi; + pthread_spin_init(&cq->lock, PTHREAD_PROCESS_PRIVATE); + + return &cq->ibv_cq; +} + +static int rfc_resize_cq(struct ibv_cq *ibcq, int cqe) +{ + struct rfc_cq *cq = to_rcq(ibcq); + struct ibv_resize_cq cmd; + struct urfc_resize_cq_resp resp; + int ret; + + pthread_spin_lock(&cq->lock); + + ret = ibv_cmd_resize_cq(ibcq, cqe, &cmd, sizeof cmd, + &resp.ibv_resp, sizeof resp); + if (ret) { + pthread_spin_unlock(&cq->lock); + return ret; + } + + 
munmap(cq->queue, cq->mmap_info.size); + + cq->queue = mmap(NULL, resp.mi.size, + PROT_READ | PROT_WRITE, MAP_SHARED, + ibcq->context->cmd_fd, resp.mi.offset); + + ret = errno; + pthread_spin_unlock(&cq->lock); + + if ((void *)cq->queue == MAP_FAILED) { + cq->queue = NULL; + cq->mmap_info.size = 0; + return ret; + } + + cq->mmap_info = resp.mi; + + return 0; +} + +static int rfc_destroy_cq(struct ibv_cq *ibcq) +{ + struct rfc_cq *cq = to_rcq(ibcq); + int ret; + + ret = ibv_cmd_destroy_cq(ibcq); + if (ret) + return ret; + + if (cq->mmap_info.size) + munmap(cq->queue, cq->mmap_info.size); + free(cq); + + return 0; +} + +static int rfc_poll_cq(struct ibv_cq *ibcq, int ne, struct ibv_wc *wc) +{ + struct rfc_cq *cq = to_rcq(ibcq); + struct rfc_queue *q; + int npolled; + uint8_t *src; + + pthread_spin_lock(&cq->lock); + q = cq->queue; + + for (npolled = 0; npolled < ne; ++npolled, ++wc) { + if (queue_empty(q)) + break; + + atomic_thread_fence(memory_order_acquire); + src = consumer_addr(q); + memcpy(wc, src, sizeof(*wc)); + advance_consumer(q); + } + + pthread_spin_unlock(&cq->lock); + return npolled; +} + +static struct ibv_srq *rfc_create_srq(struct ibv_pd *pd, + struct ibv_srq_init_attr *attr) +{ + struct rfc_srq *srq; + struct ibv_create_srq cmd; + struct urfc_create_srq_resp resp; + int ret; + + srq = malloc(sizeof *srq); + if (srq == NULL) { + return NULL; + } + + ret = ibv_cmd_create_srq(pd, &srq->ibv_srq, attr, &cmd, sizeof cmd, + &resp.ibv_resp, sizeof resp); + if (ret) { + free(srq); + return NULL; + } + + srq->rq.queue = mmap(NULL, resp.mi.size, + PROT_READ | PROT_WRITE, MAP_SHARED, + pd->context->cmd_fd, resp.mi.offset); + if ((void *)srq->rq.queue == MAP_FAILED) { + ibv_cmd_destroy_srq(&srq->ibv_srq); + free(srq); + return NULL; + } + + srq->mmap_info = resp.mi; + srq->rq.max_sge = attr->attr.max_sge; + pthread_spin_init(&srq->rq.lock, PTHREAD_PROCESS_PRIVATE); + + return &srq->ibv_srq; +} + +static int rfc_modify_srq(struct ibv_srq *ibsrq, + struct 
ibv_srq_attr *attr, int attr_mask) +{ + struct rfc_srq *srq = to_rsrq(ibsrq); + struct urfc_modify_srq cmd; + int rc = 0; + struct mminfo mi; + + mi.offset = 0; + mi.size = 0; + + if (attr_mask & IBV_SRQ_MAX_WR) + pthread_spin_lock(&srq->rq.lock); + + cmd.mmap_info_addr = (__u64)(uintptr_t) & mi; + rc = ibv_cmd_modify_srq(ibsrq, attr, attr_mask, + &cmd.ibv_cmd, sizeof cmd); + if (rc) + goto out; + + if (attr_mask & IBV_SRQ_MAX_WR) { + (void)munmap(srq->rq.queue, srq->mmap_info.size); + srq->rq.queue = mmap(NULL, mi.size, + PROT_READ | PROT_WRITE, MAP_SHARED, + ibsrq->context->cmd_fd, mi.offset); + + if ((void *)srq->rq.queue == MAP_FAILED) { + rc = errno; + srq->rq.queue = NULL; + srq->mmap_info.size = 0; + goto out; + } + + srq->mmap_info = mi; + } + +out: + if (attr_mask & IBV_SRQ_MAX_WR) + pthread_spin_unlock(&srq->rq.lock); + return rc; +} + +static int rfc_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *attr) +{ + struct ibv_query_srq cmd; + + return ibv_cmd_query_srq(srq, attr, &cmd, sizeof cmd); +} + +static int rfc_destroy_srq(struct ibv_srq *ibvsrq) +{ + int ret; + struct rfc_srq *srq = to_rsrq(ibvsrq); + struct rfc_queue *q = srq->rq.queue; + + ret = ibv_cmd_destroy_srq(ibvsrq); + if (!ret) { + if (srq->mmap_info.size) + munmap(q, srq->mmap_info.size); + free(srq); + } + + return ret; +} + +static int rfc_post_one_recv(struct rfc_wq *rq, struct ibv_recv_wr *recv_wr) +{ + int i; + struct rfc_recv_wqe *wqe; + struct rfc_queue *q = rq->queue; + int length = 0; + int rc = 0; + + if (queue_full(q)) { + rc = -ENOMEM; + goto out; + } + + if (recv_wr->num_sge > rq->max_sge) { + rc = -EINVAL; + goto out; + } + + wqe = (struct rfc_recv_wqe *)producer_addr(q); + + wqe->wr_id = recv_wr->wr_id; + wqe->num_sge = recv_wr->num_sge; + + memcpy(wqe->dma.sge, recv_wr->sg_list, + wqe->num_sge*sizeof(*wqe->dma.sge)); + + for (i = 0; i < wqe->num_sge; i++) { + length += wqe->dma.sge[i].length; + } + + wqe->dma.length = length; + wqe->dma.resid = length; + wqe->dma.cur_sge 
= 0; + wqe->dma.num_sge = wqe->num_sge; + wqe->dma.sge_offset = 0; + + advance_producer(q); + +out: + return rc; +} + +static int rfc_post_srq_recv(struct ibv_srq *ibvsrq, + struct ibv_recv_wr *recv_wr, + struct ibv_recv_wr **bad_recv_wr) +{ + struct rfc_srq *srq = to_rsrq(ibvsrq); + int rc = 0; + + pthread_spin_lock(&srq->rq.lock); + + while (recv_wr) { + rc = rfc_post_one_recv(&srq->rq, recv_wr); + if (rc) { + *bad_recv_wr = recv_wr; + break; + } + + recv_wr = recv_wr->next; + } + + pthread_spin_unlock(&srq->rq.lock); + + return rc; +} + +static struct ibv_qp *rfc_create_qp(struct ibv_pd *pd, + struct ibv_qp_init_attr *attr) +{ + struct ibv_create_qp cmd; + struct urfc_create_qp_resp resp; + struct rfc_qp *qp; + int ret; + + qp = malloc(sizeof *qp); + if (!qp) { + return NULL; + } + + ret = ibv_cmd_create_qp(pd, &qp->ibv_qp, attr, &cmd, sizeof cmd, + &resp.ibv_resp, sizeof resp); + if (ret) { + free(qp); + return NULL; + } + + if (attr->srq) { + qp->rq.max_sge = 0; + qp->rq.queue = NULL; + qp->rq_mmap_info.size = 0; + } else { + qp->rq.max_sge = attr->cap.max_recv_sge; + qp->rq.queue = mmap(NULL, resp.rq_mi.size, PROT_READ | PROT_WRITE, + MAP_SHARED, + pd->context->cmd_fd, resp.rq_mi.offset); + if ((void *)qp->rq.queue == MAP_FAILED) { + ibv_cmd_destroy_qp(&qp->ibv_qp); + free(qp); + return NULL; + } + + qp->rq_mmap_info = resp.rq_mi; + pthread_spin_init(&qp->rq.lock, PTHREAD_PROCESS_PRIVATE); + } + + qp->sq.max_sge = attr->cap.max_send_sge; + qp->sq.max_inline = attr->cap.max_inline_data; + qp->sq.queue = mmap(NULL, resp.sq_mi.size, PROT_READ | PROT_WRITE, + MAP_SHARED, + pd->context->cmd_fd, resp.sq_mi.offset); + if ((void *)qp->sq.queue == MAP_FAILED) { + if (qp->rq_mmap_info.size) + munmap(qp->rq.queue, qp->rq_mmap_info.size); + ibv_cmd_destroy_qp(&qp->ibv_qp); + free(qp); + return NULL; + } + + qp->sq_mmap_info = resp.sq_mi; + pthread_spin_init(&qp->sq.lock, PTHREAD_PROCESS_PRIVATE); + + return &qp->ibv_qp; +} + +static int rfc_query_qp(struct ibv_qp *qp, 
struct ibv_qp_attr *attr, + int attr_mask, + struct ibv_qp_init_attr *init_attr) +{ + struct ibv_query_qp cmd; + + return ibv_cmd_query_qp(qp, attr, attr_mask, init_attr, + &cmd, sizeof cmd); +} + +static int rfc_modify_qp(struct ibv_qp *ibvqp, + struct ibv_qp_attr *attr, + int attr_mask) +{ + struct ibv_modify_qp cmd = {}; + + return ibv_cmd_modify_qp(ibvqp, attr, attr_mask, &cmd, sizeof cmd); +} + +static int rfc_destroy_qp(struct ibv_qp *ibv_qp) +{ + int ret; + struct rfc_qp *qp = to_rqp(ibv_qp); + + ret = ibv_cmd_destroy_qp(ibv_qp); + if (!ret) { + if (qp->rq_mmap_info.size) + munmap(qp->rq.queue, qp->rq_mmap_info.size); + if (qp->sq_mmap_info.size) + munmap(qp->sq.queue, qp->sq_mmap_info.size); + + free(qp); + } + + return ret; +} + +/* basic sanity checks for send work request */ +static int validate_send_wr(struct rfc_wq *sq, struct ibv_send_wr *ibwr, + unsigned int length) +{ + enum ibv_wr_opcode opcode = ibwr->opcode; + + if (ibwr->num_sge > sq->max_sge) + return -EINVAL; + + if ((opcode == IBV_WR_ATOMIC_CMP_AND_SWP) + || (opcode == IBV_WR_ATOMIC_FETCH_AND_ADD)) + if (length < 8 || ibwr->wr.atomic.remote_addr & 0x7) + return -EINVAL; + + if ((ibwr->send_flags & IBV_SEND_INLINE) && (length > sq->max_inline)) + return -EINVAL; + + return 0; +} + +static void convert_send_wr(struct rfc_send_wr *kwr, struct ibv_send_wr *uwr) +{ + memset(kwr, 0, sizeof(*kwr)); + + kwr->wr_id = uwr->wr_id; + kwr->num_sge = uwr->num_sge; + kwr->opcode = uwr->opcode; + kwr->send_flags = uwr->send_flags; + kwr->ex.imm_data = uwr->imm_data; + + switch(uwr->opcode) { + case IBV_WR_RDMA_WRITE: + case IBV_WR_RDMA_WRITE_WITH_IMM: + case IBV_WR_RDMA_READ: + kwr->wr.rdma.remote_addr = uwr->wr.rdma.remote_addr; + kwr->wr.rdma.rkey = uwr->wr.rdma.rkey; + break; + + case IBV_WR_SEND: + case IBV_WR_SEND_WITH_IMM: + kwr->wr.ud.remote_qpn = uwr->wr.ud.remote_qpn; + kwr->wr.ud.remote_qkey = uwr->wr.ud.remote_qkey; + break; + + case IBV_WR_ATOMIC_CMP_AND_SWP: + case IBV_WR_ATOMIC_FETCH_AND_ADD: + 
kwr->wr.atomic.remote_addr = uwr->wr.atomic.remote_addr; + kwr->wr.atomic.compare_add = uwr->wr.atomic.compare_add; + kwr->wr.atomic.swap = uwr->wr.atomic.swap; + kwr->wr.atomic.rkey = uwr->wr.atomic.rkey; + break; + + case IBV_WR_LOCAL_INV: + case IBV_WR_BIND_MW: + case IBV_WR_SEND_WITH_INV: + case IBV_WR_TSO: + break; + } +} + +static int init_send_wqe(struct rfc_qp *qp, struct rfc_wq *sq, + struct ibv_send_wr *ibwr, unsigned int length, + struct rfc_send_wqe *wqe) +{ + int num_sge = ibwr->num_sge; + int i; + unsigned int opcode = ibwr->opcode; + + convert_send_wr(&wqe->wr, ibwr); + + if (qp_type(qp) == IBV_QPT_UD) + memcpy(&wqe->av, &to_rah(ibwr->wr.ud.ah)->av, + sizeof(struct rfc_av)); + + if (ibwr->send_flags & IBV_SEND_INLINE) { + uint8_t *inline_data = wqe->dma.inline_data; + + for (i = 0; i < num_sge; i++) { + memcpy(inline_data, + (uint8_t *)(long)ibwr->sg_list[i].addr, + ibwr->sg_list[i].length); + inline_data += ibwr->sg_list[i].length; + } + } else + memcpy(wqe->dma.sge, ibwr->sg_list, + num_sge*sizeof(struct ibv_sge)); + + if ((opcode == IBV_WR_ATOMIC_CMP_AND_SWP) + || (opcode == IBV_WR_ATOMIC_FETCH_AND_ADD)) + wqe->iova = ibwr->wr.atomic.remote_addr; + else + wqe->iova = ibwr->wr.rdma.remote_addr; + wqe->dma.length = length; + wqe->dma.resid = length; + wqe->dma.num_sge = num_sge; + wqe->dma.cur_sge = 0; + wqe->dma.sge_offset = 0; + wqe->state = 0; + wqe->ssn = qp->ssn++; + + return 0; +} + +static int post_one_send(struct rfc_qp *qp, struct rfc_wq *sq, + struct ibv_send_wr *ibwr) +{ + int err; + struct rfc_send_wqe *wqe; + unsigned int length = 0; + int i; + + for (i = 0; i < ibwr->num_sge; i++) + length += ibwr->sg_list[i].length; + + err = validate_send_wr(sq, ibwr, length); + if (err) { + printf("validate send failed\n"); + return err; + } + + wqe = (struct rfc_send_wqe *)producer_addr(sq->queue); + + err = init_send_wqe(qp, sq, ibwr, length, wqe); + if (err) + return err; + + if (queue_full(sq->queue)) + return -ENOMEM; + + 
advance_producer(sq->queue); + + return 0; +} + +/* send a null post send as a doorbell: builds an IB_USER_VERBS_CMD_POST_SEND command for ibqp->handle with wr_count and sge_count set to 0 and write()s it to ibqp->context->cmd_fd. Returns 0 when write() reports the full command size, otherwise errno. NOTE(review): if write() came back short rather than -1, errno would not have been set by that call - confirm a short write cannot occur here. */ +static int post_send_db(struct ibv_qp *ibqp) +{ + struct ibv_post_send cmd; + struct ib_uverbs_post_send_resp resp; + + cmd.hdr.command = IB_USER_VERBS_CMD_POST_SEND; + cmd.hdr.in_words = sizeof(cmd) / 4; + cmd.hdr.out_words = sizeof(resp) / 4; + cmd.response = (uintptr_t)&resp; + cmd.qp_handle = ibqp->handle; + cmd.wr_count = 0; + cmd.sge_count = 0; + cmd.wqe_size = sizeof(struct ibv_send_wr); + + if (write(ibqp->context->cmd_fd, &cmd, sizeof(cmd)) != sizeof(cmd)) + return errno; + + return 0; +} + +/* Post a chain of send work requests. Returns EINVAL if bad_wr, wr_list or sq->queue is NULL. Otherwise, with sq->lock held, each WR is passed to post_one_send() and the loop stops at the first non-zero result, storing that WR in *bad_wr; post_send_db() is then called whether or not any WR was accepted. The return value is the post_send_db() error if there is one, else the post_one_send() result. NOTE(review): sq is &qp->sq, so the !sq test looks like it can never fire. this API does not make a distinction between + restartable and non-restartable errors */ +static int rfc_post_send(struct ibv_qp *ibqp, + struct ibv_send_wr *wr_list, + struct ibv_send_wr **bad_wr) +{ + int rc = 0; + int err; + struct rfc_qp *qp = to_rqp(ibqp); + struct rfc_wq *sq = &qp->sq; + + if (!bad_wr) + return EINVAL; + + *bad_wr = NULL; + + if (!sq || !wr_list || !sq->queue) + return EINVAL; + + pthread_spin_lock(&sq->lock); + + while (wr_list) { + rc = post_one_send(qp, sq, wr_list); + if (rc) { + *bad_wr = wr_list; + break; + } + + wr_list = wr_list->next; + } + + pthread_spin_unlock(&sq->lock); + + err = post_send_db(ibqp); + return err ?
err : rc; +} + +/* Post a chain of receive work requests. Returns EINVAL if bad_wr, recv_wr or rq->queue is NULL. Otherwise, with rq->lock held, each WR is passed to rfc_post_one_recv() and the loop stops at the first non-zero result, storing that WR in *bad_wr and returning that result; returns 0 if every WR was accepted. No doorbell command is issued here. NOTE(review): rq is &qp->rq, so the !rq test looks like it can never fire. */ static int rfc_post_recv(struct ibv_qp *ibqp, + struct ibv_recv_wr *recv_wr, + struct ibv_recv_wr **bad_wr) +{ + int rc = 0; + struct rfc_qp *qp = to_rqp(ibqp); + struct rfc_wq *rq = &qp->rq; + + if (!bad_wr) + return EINVAL; + + *bad_wr = NULL; + + if (!rq || !recv_wr || !rq->queue) + return EINVAL; + + pthread_spin_lock(&rq->lock); + + while (recv_wr) { + rc = rfc_post_one_recv(rq, recv_wr); + if (rc) { + *bad_wr = recv_wr; + break; + } + + recv_wr = recv_wr->next; + } + + pthread_spin_unlock(&rq->lock); + + return rc; +} + +/* Wrapper around IN6_IS_ADDR_V4MAPPED(). */ static inline int ipv6_addr_v4mapped(const struct in6_addr *a) +{ + return IN6_IS_ADDR_V4MAPPED(a); +} + +/* Type of the sgid_addr member of struct rfc_av, named so it can be used as a parameter type below. */ typedef typeof(((struct rfc_av *)0)->sgid_addr) sockaddr_union_t; + +/* Fill *out from a GID. For a v4-mapped GID the _sockaddr_in member is zeroed and the last 4 bytes of gid->raw are copied into sin_addr; otherwise the _sockaddr_in6 member is zeroed, sin6_family is set to AF_INET6 and all 16 bytes are copied into sin6_addr. Always returns 0. NOTE(review): the v4 branch leaves sin_family as 0 rather than AF_INET - confirm the consumer does not rely on it. */ static inline int rdma_gid2ip(sockaddr_union_t *out, union ibv_gid *gid) +{ + if (ipv6_addr_v4mapped((struct in6_addr *)gid)) { + memset(&out->_sockaddr_in, 0, sizeof(out->_sockaddr_in)); + memcpy(&out->_sockaddr_in.sin_addr.s_addr, gid->raw + 12, 4); + } else { + memset(&out->_sockaddr_in6, 0, sizeof(out->_sockaddr_in6)); + out->_sockaddr_in6.sin6_family = AF_INET6; + memcpy(&out->_sockaddr_in6.sin6_addr.s6_addr, gid->raw, 16); + } + return 0; +} + +/* Create an address handle. Looks up the source GID with ibv_query_gid(), allocates a struct rfc_ah, fills its av (port_num, a copy of attr->grh, network_type chosen from whether the destination GID is v4-mapped, and the source/destination socket addresses) and then calls ibv_cmd_create_ah(). Returns the embedded ibv_ah, or NULL if the GID query, the allocation or the command fails (the allocation is freed on command failure). NOTE(review): ah comes from malloc() and is not cleared, so any field not assigned here keeps indeterminate contents. */ static struct ibv_ah *rfc_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr) +{ + int err; + struct rfc_ah *ah; + struct rfc_av *av; + union ibv_gid sgid; + struct ib_uverbs_create_ah_resp resp; + + err = ibv_query_gid(pd->context, attr->port_num, attr->grh.sgid_index, + &sgid); + if (err) { + fprintf(stderr, "rfc: Failed to query sgid.\n"); + return NULL; + } + + ah = malloc(sizeof *ah); + if (ah == NULL) + return NULL; + + av = &ah->av; + av->port_num = attr->port_num; + memcpy(&av->grh, &attr->grh, sizeof(attr->grh)); + av->network_type = + ipv6_addr_v4mapped((struct in6_addr *)attr->grh.dgid.raw) ?
+ RDMA_NETWORK_IPV4 : RDMA_NETWORK_IPV6; + + rdma_gid2ip(&av->sgid_addr, &sgid); + rdma_gid2ip(&av->dgid_addr, &attr->grh.dgid); + + memset(&resp, 0, sizeof(resp)); + if (ibv_cmd_create_ah(pd, &ah->ibv_ah, attr, &resp, sizeof(resp))) { + free(ah); + return NULL; + } + + return &ah->ibv_ah; +} + +/* Destroy an address handle: if ibv_cmd_destroy_ah() fails its error is returned and nothing is freed, otherwise the rfc_ah is freed and 0 is returned. */ static int rfc_destroy_ah(struct ibv_ah *ibah) +{ + int ret; + struct rfc_ah *ah = to_rah(ibah); + + ret = ibv_cmd_destroy_ah(&ah->ibv_ah); + if (ret) + return ret; + + free(ah); + return 0; +} + +/* Verbs operations installed on each context by rfc_alloc_context(); req_notify_cq, attach_mcast and detach_mcast point directly at the ibv_cmd_* helpers. */ static const struct verbs_context_ops rfc_ctx_ops = { + .query_device = rfc_query_device, + .query_port = rfc_query_port, + .alloc_pd = rfc_alloc_pd, + .dealloc_pd = rfc_dealloc_pd, + .reg_mr = rfc_reg_mr, + .dereg_mr = rfc_dereg_mr, + .create_cq = rfc_create_cq, + .poll_cq = rfc_poll_cq, + .req_notify_cq = ibv_cmd_req_notify_cq, + .resize_cq = rfc_resize_cq, + .destroy_cq = rfc_destroy_cq, + .create_srq = rfc_create_srq, + .modify_srq = rfc_modify_srq, + .query_srq = rfc_query_srq, + .destroy_srq = rfc_destroy_srq, + .post_srq_recv = rfc_post_srq_recv, + .create_qp = rfc_create_qp, + .query_qp = rfc_query_qp, + .modify_qp = rfc_modify_qp, + .destroy_qp = rfc_destroy_qp, + .post_send = rfc_post_send, + .post_recv = rfc_post_recv, + .create_ah = rfc_create_ah, + .destroy_ah = rfc_destroy_ah, + .attach_mcast = ibv_cmd_attach_mcast, + .detach_mcast = ibv_cmd_detach_mcast +}; + +/* Allocate and initialise a context for ibdev on cmd_fd. Returns NULL if verbs_init_and_alloc_context() fails; if ibv_cmd_get_context() fails the context is uninitialised, freed and NULL is returned. On success rfc_ctx_ops is installed and the embedded verbs_context is returned. NOTE(review): the driver id passed is RDMA_DRIVER_RXE - confirm that is intended for the rfc provider. */ static struct verbs_context *rfc_alloc_context(struct ibv_device *ibdev, + int cmd_fd) +{ + struct rfc_context *context; + struct ibv_get_context cmd; + struct ib_uverbs_get_context_resp resp; + + context = verbs_init_and_alloc_context(ibdev, cmd_fd, context, ibv_ctx, + RDMA_DRIVER_RXE); + if (!context) + return NULL; + + if (ibv_cmd_get_context(&context->ibv_ctx, &cmd, + sizeof cmd, &resp, sizeof resp)) + goto out; + + verbs_set_ops(&context->ibv_ctx, &rfc_ctx_ops); + + return &context->ibv_ctx; + +out: + verbs_uninit_context(&context->ibv_ctx); + free(context); + return NULL; +} + +/* Release a context: uninitialise the embedded verbs context, then free the rfc_context. */ static void
rfc_free_context(struct ibv_context *ibctx) +{ + struct rfc_context *context = to_rctx(ibctx); + + verbs_uninit_context(&context->ibv_ctx); + free(context); +} + +/* Free the rfc_device that contains verbs_device. */ static void rfc_uninit_device(struct verbs_device *verbs_device) +{ + struct rfc_device *dev = to_rdev(&verbs_device->device); + + free(dev); +} + +/* Allocate a zeroed rfc_device, record sysfs_dev->abi_ver in it and return the embedded verbs_device; returns NULL if calloc() fails. */ static struct verbs_device *rfc_device_alloc(struct verbs_sysfs_dev *sysfs_dev) +{ + struct rfc_device *dev; + dev = calloc(1, sizeof(*dev)); + if (!dev) + return NULL; + + dev->abi_version = sysfs_dev->abi_ver; + + return &dev->ibv_dev; +} + +/* Provider description handed to PROVIDER_DRIVER() below. */ static const struct verbs_device_ops rfc_dev_ops = { + .name = "rfc", + /* + * For 64 bit machines ABI version 1 and 2 are the same. Otherwise 32 + * bit machines require ABI version 2 which guarantees the user and + * kernel use the same ABI. + */ + .match_min_abi_version = sizeof(void *) == 8?1:2, + .match_max_abi_version = 2, + .match_table = hca_table, + .alloc_device = rfc_device_alloc, + .uninit_device = rfc_uninit_device, + .alloc_context = rfc_alloc_context, + .free_context = rfc_free_context, +}; +PROVIDER_DRIVER(rfc_dev_ops); diff --git a/providers/rfc/rfc.h b/providers/rfc/rfc.h new file mode 100644 index 0000000..8313b19 --- /dev/null +++ b/providers/rfc/rfc.h @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2009 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2009 System Fabric Works, Inc. All rights reserved. + * Copyright (c) 2006-2007 QLogic Corp. All rights reserved. + * Copyright (c) 2005. PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses.
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +/* NOTE(review): the include guard below is named RXE_H although this header is rfc.h - confirm it cannot meet another header using the same guard. */ +#ifndef RXE_H +#define RXE_H + +#include <infiniband/driver.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <rdma/rdma_user_rfc.h> /* struct rfc_av */ +#include "rfc-abi.h" + +/* Values stored in the network_type field of an address vector. */ enum rdma_network_type { + RDMA_NETWORK_IB, + RDMA_NETWORK_IPV4, + RDMA_NETWORK_IPV6 +}; + +/* Provider device: the verbs_device plus the ABI version recorded for it. */ struct rfc_device { + struct verbs_device ibv_dev; + int abi_version; +}; + +/* Provider context: wraps the verbs_context and adds no state of its own. */ struct rfc_context { + struct verbs_context ibv_ctx; +}; + +/* Provider completion queue: the ibv_cq plus an mminfo, a pointer to the rfc_queue and a spinlock. */ struct rfc_cq { + struct ibv_cq ibv_cq; + struct mminfo mmap_info; + struct rfc_queue *queue; + pthread_spinlock_t lock; +}; + +/* Provider address handle: the ibv_ah plus its address vector. */ struct rfc_ah { + struct ibv_ah ibv_ah; + struct rfc_av av; +}; + +/* Work queue: a pointer to the rfc_queue, a spinlock, and the max_sge / max_inline limits. */ struct rfc_wq { + struct rfc_queue *queue; + pthread_spinlock_t lock; + unsigned int max_sge; + unsigned int max_inline; +}; + +/* Provider queue pair: the ibv_qp plus a receive and a send work queue, each with its own mminfo, and an ssn counter. */ struct rfc_qp { + struct ibv_qp ibv_qp; + struct mminfo rq_mmap_info; + struct rfc_wq rq; + struct mminfo sq_mmap_info; + struct rfc_wq sq; + unsigned int ssn; +}; + +/* QP type read from the embedded ibv_qp. */ +#define qp_type(qp) ((qp)->ibv_qp.qp_type) + +/* Provider shared receive queue: the ibv_srq plus an mminfo, one work queue and the srq_num. */ struct rfc_srq { + struct ibv_srq ibv_srq; + struct mminfo mmap_info; + struct rfc_wq rq; + uint32_t srq_num; +}; + +/* container_of() shorthand: expects a local pointer named ib<xxx> and yields the enclosing struct rfc_<type> whose member is ibv_<xxx>. */ +#define to_rxxx(xxx, type) container_of(ib##xxx, struct rfc_##type, ibv_##xxx) + +/* Map an ibv_context back to the rfc_context that embeds it (via ibv_ctx.context). */ static inline struct rfc_context *to_rctx(struct ibv_context *ibctx) +{ + return container_of(ibctx, struct rfc_context, ibv_ctx.context); +} + +/* Map an ibv_device back to the rfc_device that embeds it (via ibv_dev.device). */ static inline struct rfc_device *to_rdev(struct ibv_device *ibdev) +{ + return container_of(ibdev, struct rfc_device, ibv_dev.device); +} + +/* Map an ibv_cq back to its rfc_cq. */ static inline struct rfc_cq *to_rcq(struct ibv_cq *ibcq) +{ + return to_rxxx(cq, cq); +} + +/* Map an ibv_qp back to its rfc_qp. */ static inline struct rfc_qp *to_rqp(struct ibv_qp *ibqp) +{ + return to_rxxx(qp, qp); +} + +/* Map an ibv_srq back to its rfc_srq. */ static inline struct rfc_srq *to_rsrq(struct ibv_srq *ibsrq) +{ + return to_rxxx(srq, srq); +} + +/* Map an ibv_ah back to its rfc_ah. */ static inline struct rfc_ah *to_rah(struct ibv_ah *ibah) +{ + return to_rxxx(ah, ah); +} + +#endif /* RXE_H */ diff --git a/providers/rfc/rfc_cfg.in b/providers/rfc/rfc_cfg.in new file mode 100755 index 0000000..0a8583d --- /dev/null +++ b/providers/rfc/rfc_cfg.in @@
-0,0 +1,674 @@ +#!/usr/bin/perl + +# * Copyright (c) 2009-2011 Mellanox Technologies Ltd. All rights reserved. +# * Copyright (c) 2009-2011 System Fabric Works, Inc. All rights reserved. +# * +# * This software is available to you under a choice of one of two +# * licenses. You may choose to be licensed under the terms of the GNU +# * General Public License (GPL) Version 2, available from the file +# * COPYING in the main directory of this source tree, or the +# * OpenIB.org BSD license below: +# * +# * Redistribution and use in source and binary forms, with or +# * without modification, are permitted provided that the following +# * conditions are met: +# * +# * - Redistributions of source code must retain the above +# * copyright notice, this list of conditions and the following +# * disclaimer. +# * +# * - Redistributions in binary form must reproduce the above +# * copyright notice, this list of conditions and the following +# * disclaimer in the documentation and/or other materials +# * provided with the distribution. +# * +# * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# * SOFTWARE. 
+# + +# NOTE(review): this script still refers to rxe throughout (rdma_rxe module, /sys/module/rdma_rxe, rxe* devices, rxe_cfg usage text) - confirm whether it should target rdma_rfc instead. + +use warnings; +use strict; + +use File::Basename; +use File::Path qw(make_path); +use Getopt::Long; + +my $help = 0; +my $no_persist = 0; +my $debug = 0; +my $force = 0; +my $linkonly = 0; +my $parms = "/sys/module/rdma_rxe/parameters"; +my $modprobe_opt = ""; +my $modprobe_checked = "0"; +my $persistence_path = "@CMAKE_INSTALL_FULL_SHAREDSTATEDIR@/rxe"; +my $persistence_file = "${persistence_path}/rxe"; +my $num_persistent = 0; +my $sys = "/sys/module/rdma_rxe/parameters"; +my %rxe_names; +my @rxe_array; +my %eth_names; +my @eth_list; +my %eth_driver; +my %link_state; +my %link_speed; +my %eth_mtu; +my %ipv4_addr; +my %rxe_mtu; +my @persistence_array; +my %persistence_hash; +my @mlx4_port; +my @mlx4_ether; +my @roce_list; + +# Return the first line of a file, or "" if the file cannot be opened or is empty. +sub read_file { + my $filename = shift; + my $result = ""; + + if (open(FILE, $filename)) { + $result = <FILE>; + close FILE; + # an empty file yields undef; callers chomp() and pattern-match the result + $result = "" if (!defined($result)); + } + return $result; +} + +#get mapping between rxe and eth devices +sub get_names { + my $i = 0; + + foreach my $rxe (glob("/sys/class/infiniband/rxe*")) { + $rxe = basename($rxe); + my $eth = read_file("/sys/class/infiniband/$rxe/parent"); + chomp($eth); + + if (($eth =~ /[\w]+[\d]/) + && ($rxe =~ /rxe[0123456789]/)) { + + # hash ethername to rxename + $rxe_names{$eth} = $rxe; + $rxe_array[$i++] = $rxe; + + # hash rxename to ethername + $eth_names{$rxe} = $eth; + } + } +} + +# get list of Mellanox RoCE ports +sub get_mlx4_list { + my $i = 0; + + foreach my $mlx4 (glob("/sys/class/infiniband/mlx4_*")) { + $mlx4 = basename($mlx4); + foreach my $port (glob("/sys/class/infiniband/$mlx4/ports/*")) { + $port = basename($port); + # $port is only the basename now, so rebuild the full sysfs path + my $link = read_file("/sys/class/infiniband/$mlx4/ports/$port/link_layer"); + chomp($link); + + if ($link =~ "Ethernet") { + $roce_list[$i++] = "$mlx4:$port"; + } + } + } +} + +#collect per device information +sub get_dev_info { + my @list; + my @fields; + my @lines; + my $line; + my $eth; + my $drv; + my $np; + my $i = 0; + my $j = 0; + + get_mlx4_list(); + + my
@my_eth_list = (); + foreach my $my_eth_dev (glob("/sys/class/net/*")) { + $my_eth_dev = basename($my_eth_dev); + my $my_dev_type = read_file("/sys/class/net/${my_eth_dev}/type"); + chomp($my_dev_type); + if ($my_dev_type == "1") { + push(@my_eth_list, "$my_eth_dev"); + } + } + + @list = @my_eth_list; + foreach $eth (@list) { + chomp($eth); + + $eth_list[$i++] = $eth; + + @lines = `ethtool -i $eth`; + foreach $line (@lines) { + chomp($line); + + @fields = split(/\s+/, $line); + chomp($fields[0]); + + if ($fields[0] =~ /driver:/) { + $drv = $fields[1]; + $eth_driver{$eth} = $drv; + + if ($drv =~ /mlx4_en/ && scalar(@roce_list) > 0 ) { + $eth_names{$roce_list[$j++]} = $eth; + } + } + } + + # get link status + $link_state{$eth} = ""; + $link_speed{$eth} = ""; + + @lines = `ethtool $eth`; + foreach $line (@lines) { + chomp($line); + + @fields = split(/:/, $line); + if (defined($fields[1])) { + $fields[1] =~ s/^\s+//g; + if ($fields[0] =~ "Link detected") { + $link_state{$eth} = $fields[1]; + } + } + elsif ($line =~ "10000baseT") { + $link_speed{$eth} = "10GigE"; + } + } + + $ipv4_addr{$eth} = " "; + $eth_mtu{$eth} = ""; + + @lines = `ifconfig $eth`; + foreach $line (@lines) { + # get IP address + if ($line =~ /inet addr/) { + $line =~ s/^\s+inet addr://g; + @fields = split(/\s+/, $line); + $ipv4_addr{$eth} = $fields[0]; + } + + # get ethernet mtu + if ($line =~ /MTU:/) { + $line =~ s/^.*MTU://g; + @fields = split(/\s+/, $line); + $eth_mtu{$eth} = $fields[0]; + } + } + } + + # get rxe mtu + foreach my $rxe (@rxe_array) { + + @lines = `ibv_devinfo -d $rxe`; + foreach $line (@lines) { + if ($line =~ "active_mtu") { + $line =~ s/^\s+active_mtu:\s+//g; + chomp($line); + + $rxe_mtu{$rxe} = $line; + } + } + $rxe_mtu{$rxe} = "(?)" if (!$rxe_mtu{$rxe}); + } +} + +# return string or the string "###" if string is all whitespace +sub set_field { + my $fld = $_[0]; + + if (defined($fld) && $fld =~ /\S/) { + return $fld; + } else { + return "###"; + } +} + +# format status output
into fixed width columns +sub status_print { + my @fields; + my $field; + my @flen = (); + my $num_fields = 0; + my $i; + my $pad; + my $line; + + # one pass to size the columns + foreach $line (@_) { + @fields = split(/\s+/, $line); + $i = 0; + foreach $field (@fields) { + if (!defined($flen[$i])) { + $flen[$i] = length($field); + } + else { + $flen[$i] = max($flen[$i], length($field)); + } + $i++; + } + + if ($i > $num_fields) { + $num_fields = $i; + } + } + + # one pass to print + foreach $line (@_) { + print " "; + @fields = split(/\s+/, $line); + for ($i = 0; $i < $num_fields; $i++) { + if (defined($fields[$i])) { + $pad = $flen[$i] - length($fields[$i]) + 2; + } + else { + $pad = $flen[$i] + 2; + } + if (defined($fields[$i]) && ($fields[$i] ne "###")) { + print "$fields[$i]"; + } + else { + print " "; + } + printf("%*s", $pad, ""); + } + print "\n"; + } +} + +# check driver load status +sub check_module_status { + if (-e $sys) { + return 0; + } else { + return 1; + } +} + +# print a notice when the rdma_rxe module is not loaded +sub show_module_status { + print "rdma_rxe module not loaded\n" if (!(-e $sys)); +} + +# print rxe status +sub do_status { + my $instance = $_[0]; + my $ln = 0; + my @outp; + my $rxe; + my $rmtu; + + get_names(); + get_dev_info(); + show_module_status(); + + $outp[$ln++] = "Name\tLink\tDriver\t\tSpeed\tNMTU\tIPv4_addr\tRDEV\tRMTU"; + + foreach my $eth (@eth_list) { + + # handle case where rxe_drivers are not loaded + if (defined($rxe_names{$eth})) { + $rxe = $rxe_names{$eth}; + $rmtu = $rxe_mtu{$rxe}; + } + else { + $rxe = ""; + $rmtu = ""; + } + + if ((!defined($instance) + && (($linkonly == 0) || ($link_state{$eth} =~ "yes"))) + || (defined($instance) && ($rxe =~ "$instance"))) { + $outp[$ln] = set_field("$eth"); + $outp[$ln] .= "\t"; + $outp[$ln] .= set_field("$link_state{$eth}"); + $outp[$ln] .= "\t"; + $outp[$ln] .= set_field(exists($eth_driver{$eth}) ?
$eth_driver{$eth} : ""); + $outp[$ln] .= "\t"; + $outp[$ln] .= set_field("$link_speed{$eth}"); + $outp[$ln] .= "\t"; + $outp[$ln] .= set_field("$eth_mtu{$eth}"); + $outp[$ln] .= "\t"; + $outp[$ln] .= set_field("$ipv4_addr{$eth}"); + $outp[$ln] .= "\t"; + $outp[$ln] .= set_field("$rxe"); + $outp[$ln] .= "\t"; + $outp[$ln] .= set_field("$rmtu"); + $ln++; + } + } + + status_print(@outp); +} + +# read file containing list of ethernet devices into a list +sub populate_persistence { + my $i = 0; + + # a missing or unreadable file simply means nothing is persistent + if (!open(FILE, $persistence_file)) { + $num_persistent = 0; + return; + } + while(<FILE>) { + my $line = $_; + chomp($line); + $line =~ s/^\s+//g; + if ($line =~ /[\w]+[\d]/) { + # in case we add fields later + my ($eth, $cruft) = split(/\s+/, $line, 2); + if ($eth =~ /^[\w]+[\d]/) { + $persistence_array[$i] = $eth; + $persistence_hash{$eth} = $i++; + } + } + } + close FILE; + + $num_persistent = $i; +} + +# print out list of ethernet devices to file +sub commit_persistent { + my $i; + my $eth; + + if (!open(PF, ">$persistence_file")) { + print "Warning: unable to write $persistence_file\n"; + return; + } + + for ($i = 0; $i < $num_persistent; $i++) { + $eth = $persistence_array[$i]; + if ($eth =~ /[\w]+[\d]/) { + print(PF "$persistence_array[$i]\n"); + } + } + + close(PF); +} + +# blank out a device's slot in the persistence list; commit_persistent skips blank slots +sub delete_persistent { + my $eth = $_[0]; + + if (defined($persistence_hash{$eth})) { + $persistence_array[$persistence_hash{$eth}] = ""; + } +} + +# append a device to the persistence list unless it is already there +sub add_persistent { + my $eth = $_[0]; + + # Is this one already in the persistence list?
+ if (!defined($persistence_hash{$eth})) { + $persistence_array[$num_persistent] = $eth; + $persistence_hash{$eth} = $num_persistent; + $num_persistent++; + } +} + +# add new rxe device to eth if not already up +sub rxe_add { + my $eth = $_[0]; + + # a missing argument is reported as a bogus name instead of tripping warnings + $eth = "" if (!defined($eth)); + + if (!($eth =~ /[\w]+[\d]/)) { + print "eth_name ($eth) looks bogus\n"; + return; + } + + if (!defined($rxe_names{$eth})) { + system("echo '$eth' > $parms/add"); + } + if (!$no_persist) { + add_persistent($eth); + commit_persistent(); + } +} + +# remove an rxe device named by rdma device (rxeN), by ethernet device, or "all" +sub rxe_remove { + my $arg2 = $_[0]; + my $rxe = ""; + my $eth = ""; + + $arg2 = "" if (!defined($arg2)); + + print "remove $arg2\n" if ($debug > 0); + + # test the rdma name before the ethernet pattern, which also matches rxeN + if ($arg2 eq "all") { + $rxe = "all"; + } + elsif ($arg2 =~ /^rxe[0123456789]/) { + $rxe = $arg2; + $eth = $eth_names{$rxe} if (defined($eth_names{$rxe})); + } + elsif ($arg2 =~ /[\w]+[\d]/) { + $eth = $arg2; + $rxe = $rxe_names{$eth} if (defined($rxe_names{$eth})); + } + + if (($rxe eq "all") || ($rxe =~ /^rxe[0123456789]/)) { + my $cmd = "echo '$rxe' > $parms/remove"; + #print "$cmd\n"; + system($cmd); + if (!$no_persist) { + if ($rxe eq "all") { + unlink($persistence_file); + } + elsif ($eth =~/[\w]+[\d]/) { + delete_persistent($eth); + commit_persistent(); + } + else { + print "Warning: Unable to resolve ethname; " + .
"instance may persist on restart\n"; + } + } + } + else { + print "rxe instance $rxe not found\n"; + } +} + +# return the output of ibv_devinfo for one rdma device +sub get_devinfo { + my $rxe = $_[0]; + + my $cmd = "ibv_devinfo -d $rxe"; + return `$cmd`; +} + +# allow unsupported modules to load in SLES11 if allowed +sub modprobe { + my $module = $_[0]; + my $opts = $_[1]; + my @lines; + my $line; + + if ($modprobe_checked == "0") { + @lines = `modprobe -c`; + foreach $line (@lines) { + if ($line =~ /^allow_unsupported_modules *0/) { + $modprobe_opt = " --allow-unsupported-modules "; + last; + } + } + $modprobe_checked = "1"; + } + + if (!defined($opts)) { + $opts = ""; + } + + system("modprobe $modprobe_opt $module $opts"); +} + +# bring up rxe +sub do_start { + my $proto_str = ""; + + system("mkdir -p $persistence_path"); + system("touch $persistence_file"); + + modprobe("ib_core"); + modprobe("ib_uverbs"); + modprobe("rdma_ucm"); + modprobe("rdma_rxe"); + + populate_persistence(); + + foreach my $eth (@persistence_array) { + rxe_add($eth); + } + + get_names(); + + foreach my $rxe (@rxe_array) { + my $stat = get_devinfo($rxe); + if ($stat =~ "PORT_DOWN") { + my $cmd = "ifconfig $eth_names{$rxe} up"; + system($cmd); + } + } + +} + +# check if argument is an integer +sub is_integer { + defined $_[0] && $_[0] =~ /^[+-]?\d+$/; +} + +# remove all rxe devices and unload drivers +sub do_stop { + my $rxe; + + foreach $rxe (@rxe_array) { + system("echo '$rxe' > $sys/remove"); + } + + if (-e $sys) { + system("rmmod rdma_rxe"); + } + + if (-e $sys) { + print "unable to unload drivers, reboot required\n"; + } +} + +# set the driver debug mask (on, off, or a number from 0 to 31) and then report the current value +sub do_debug { + my $arg2 = $_[0]; + my $debugfile = "$parms/debug"; + # no argument means: just report the current value + $arg2 = "" if (!defined($arg2)); + chomp($arg2); + + if (!(-e "$debugfile")) { + print "Error: debug is compiled out of this rxe driver\n"; + return; + } + + if ($arg2 eq "on") { system("echo '31' > $debugfile"); } + elsif ($arg2 eq "off") { system("echo '0' > $debugfile"); } + elsif ($arg2 eq "0") { system("echo '0' > $debugfile"); } + elsif ($arg2 eq "") { } + # compare numerically: as strings, "4" sorts after "31" and was rejected + elsif (is_integer($arg2)
&& $arg2 >= 0 && $arg2 <= 31) { + system("echo '$arg2' > $debugfile"); + } + else { + print "unrecognized debug cmd ($arg2)\n"; + } + + my $current = read_file($debugfile); + chomp($current); + if ($current > 0) { + print "Debug is ON ($current)\n"; + } + elsif ($current == 0) { + print "Debug is OFF\n"; + } + else { + print "Unrecognized debug value\n"; + } +} + +# return the larger of two numbers +sub max { + my $a = $_[0]; + my $b = $_[1]; + return $a if ($a > $b); + return $b; +} + +# show usage for rxe_cfg +sub usage { + print " Usage:\n"; + print " rxe_cfg [options] start|stop|status|persistent\n"; + print " rxe_cfg debug on|off|<num>\n"; + print " rxe_cfg [-n] add <ndev>\n"; + print " rxe_cfg [-n] remove <ndev>|<rdev>\n"; + print "\n"; + print " <ndev> = network device e.g. eth3\n"; + print " <rdev> = rdma device e.g. rxe1\n"; + print "\n"; + print " Options:\n"; + print " -h: print this usage information\n"; + print " -n: do not make the configuration action persistent\n"; + print " -v: print additional debug output\n"; + print " -l: show status for interfaces with link up\n"; + print " -p <num>: (start command only) - set ethertype\n"; +} + +# parse the options and dispatch on the first non-option argument; status is the default command +sub main { + # NOTE(review): usage() advertises -p, but no such option is parsed here + GetOptions( + "-h" => \$help, + "--help" => \$help, + "-n" => \$no_persist, + "-v:+" => \$debug, + "-f" => \$force, + "-l" => \$linkonly, + ); + + my $arg1 = $ARGV[0]; + my $arg2 = $ARGV[1]; + my $arg3 = $ARGV[2]; + + # honour -h before the default command, otherwise a bare -h prints status + if ($help) { + usage(); + exit; + } + + # status is the default + if (!defined($arg1) || ($arg1 =~ /status/)) { + do_status($arg2); + exit; + } + + # stuff that does not require modules to be loaded + if ($arg1 eq "help") { usage(); exit; } + elsif ($arg1 eq "start") { do_start(); do_status(); exit; } + elsif ($arg1 eq "persistent") { system("cat $persistence_file"); exit; } + + + # can't do much else, bail if modules aren't loaded + if (check_module_status()) { + exit; + } + + # create persistence file if necessary + make_path($persistence_path); + if (!(-e $persistence_file)) { + `touch $persistence_file`; + } + + # Get full context of
the configuration + populate_persistence(); + get_names(); + get_dev_info(); + + # Stuff that requires the rdma_rxe module to be loaded + if ($arg1 eq "stop") { do_stop(); exit; } + elsif ($arg1 eq "debug") { do_debug($arg2); exit; } + elsif ($arg1 eq "add") { rxe_add($arg2); exit; } + elsif ($arg1 eq "remove") { rxe_remove($arg2); exit; } + elsif ($arg1 eq "help") { usage(); exit; } +} + +main(); + +exit; diff --git a/providers/rfc/rfc_queue.h b/providers/rfc/rfc_queue.h new file mode 100644 index 0000000..a82e223 --- /dev/null +++ b/providers/rfc/rfc_queue.h @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2009 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2009 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +/* implements a simple circular buffer with sizes a power of 2 */ + +#ifndef H_RXE_PCQ +#define H_RXE_PCQ + +#include <stdint.h> +#include <stdatomic.h> + +/* MUST MATCH kernel struct rfc_pqc in rfc_queue.h. The pad arrays put producer_index 128 bytes and consumer_index 256 bytes from the start of the struct (assuming _Atomic(uint32_t) occupies 4 bytes with no extra padding); data marks the start of the element storage, each element being 1 << log2_elem_size bytes as used by the address helpers below. */ +struct rfc_queue { + uint32_t log2_elem_size; + uint32_t index_mask; + uint32_t pad_1[30]; + _Atomic(uint32_t) producer_index; + uint32_t pad_2[31]; + _Atomic(uint32_t) consumer_index; + uint32_t pad_3[31]; + uint8_t data[0]; +}; + +/* Index that follows `index`, wrapped with index_mask. */ static inline int next_index(struct rfc_queue *q, int index) +{ + return (index + 1) & q->index_mask; +} + +/* Non-zero when (producer_index - consumer_index) & index_mask is 0, i.e. the two indices coincide. */ static inline int queue_empty(struct rfc_queue *q) +{ + /* Must hold consumer_index lock */ + return ((atomic_load(&q->producer_index) - + atomic_load_explicit(&q->consumer_index, + memory_order_relaxed)) & + q->index_mask) == 0; +} + +/* Non-zero when (producer_index + 1 - consumer_index) & index_mask is 0, i.e. advancing the producer would make the two indices coincide. */ static inline int queue_full(struct rfc_queue *q) +{ + /* Must hold producer_index lock */ + return ((atomic_load_explicit(&q->producer_index, + memory_order_relaxed) + + 1 - atomic_load(&q->consumer_index)) & + q->index_mask) == 0; +} + +/* Issue a release fence, then store producer_index + 1 wrapped with index_mask. */ static inline void advance_producer(struct rfc_queue *q) +{ + /* Must hold producer_index lock */ + atomic_thread_fence(memory_order_release); + atomic_store( + &q->producer_index, + (atomic_load_explicit(&q->producer_index, memory_order_relaxed) + + 1) & + q->index_mask); +} + +/* Store consumer_index + 1 wrapped with index_mask. */ static inline void advance_consumer(struct rfc_queue *q) +{ + /* Must hold consumer_index lock */ + atomic_store( + &q->consumer_index, + (atomic_load_explicit(&q->consumer_index, memory_order_relaxed) + + 1) & + q->index_mask); +} + +/* Address of the element at producer_index: data + (masked index << log2_elem_size). */ static inline void *producer_addr(struct rfc_queue *q) +{ + /* Must hold producer_index lock */ + return q->data + ((atomic_load_explicit(&q->producer_index,
memory_order_relaxed) & + q->index_mask) + << q->log2_elem_size); +} + +/* Address of the element at consumer_index: data + (masked index << log2_elem_size). */ static inline void *consumer_addr(struct rfc_queue *q) +{ + /* Must hold consumer_index lock */ + return q->data + ((atomic_load_explicit(&q->consumer_index, + memory_order_relaxed) & + q->index_mask) + << q->log2_elem_size); +} + +/* Address of the element for an arbitrary index; the index is masked with index_mask first. */ static inline void *addr_from_index(struct rfc_queue *q, unsigned int index) +{ + return q->data + ((index & q->index_mask) + << q->log2_elem_size); +} + +/* Inverse of addr_from_index(): byte offset of addr from data, shifted right by log2_elem_size and masked with index_mask. */ static inline unsigned int index_from_addr(const struct rfc_queue *q, const void *addr) +{ + return (((uint8_t *)addr - q->data) >> q->log2_elem_size) & q->index_mask; +} + +#endif /* H_RXE_PCQ */ -- 1.8.3.1 -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html