Re: [PATCH 1/1] librfc for RDMA over Fibre Channel

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Wed, Apr 18, 2018 at 03:20:13AM -0700, muneendra.kumar@xxxxxxxxxxxx wrote:
> From: Muneendra <muneendra.kumar@xxxxxxxxxxxx>
> 
> This patch adds the following:
> 1. librfc provider library for rdma-core, which acts as user
>    level interface for rdma_rfc kernel module.
> 2. rfc_cfg utility, which helps in loading and configuring
>    the rdma_rfc Kernel module.
> 
> This patch is inspired from librxe which provides the library for Soft RoCE kernel module.
> 
> The Corresponding kernel module(rdma_rfc) changes has been sent for review and the details
> are below.
> 
> https://marc.info/?l=linux-rdma&m=152404459816049&w=2
> Signed-off-by: Muneendra <muneendra.kumar@xxxxxxxxxxxx>
> ---
>  CMakeLists.txt                      |   2 +
>  kernel-headers/CMakeLists.txt       |   2 +
>  kernel-headers/rdma/rdma_user_rfc.h | 179 +++++++
>  providers/rfc/CMakeLists.txt        |   8 +
>  providers/rfc/man/CMakeLists.txt    |   4 +
>  providers/rfc/man/rfc.7             |  77 +++
>  providers/rfc/man/rfc_cfg.8         |  70 +++
>  providers/rfc/rfc-abi.h             |  53 +++
>  providers/rfc/rfc.c                 | 926 ++++++++++++++++++++++++++++++++++++
>  providers/rfc/rfc.h                 | 129 +++++
>  providers/rfc/rfc_cfg.in            | 674 ++++++++++++++++++++++++++
>  providers/rfc/rfc_queue.h           | 128 +++++
>  12 files changed, 2252 insertions(+)
>  create mode 100644 kernel-headers/rdma/rdma_user_rfc.h
>  create mode 100644 providers/rfc/CMakeLists.txt
>  create mode 100644 providers/rfc/man/CMakeLists.txt
>  create mode 100644 providers/rfc/man/rfc.7
>  create mode 100644 providers/rfc/man/rfc_cfg.8
>  create mode 100644 providers/rfc/rfc-abi.h
>  create mode 100644 providers/rfc/rfc.c
>  create mode 100644 providers/rfc/rfc.h
>  create mode 100755 providers/rfc/rfc_cfg.in
>  create mode 100644 providers/rfc/rfc_queue.h
> 
> diff --git a/CMakeLists.txt b/CMakeLists.txt
> index 10a687c..0256bbd 100644
> --- a/CMakeLists.txt
> +++ b/CMakeLists.txt
> @@ -502,6 +502,8 @@ add_subdirectory(providers/hfi1verbs)
>  add_subdirectory(providers/ipathverbs)
>  add_subdirectory(providers/rxe)
>  add_subdirectory(providers/rxe/man)
> +add_subdirectory(providers/rfc)
> +add_subdirectory(providers/rfc/man)
>  
>  # Binaries
>  add_subdirectory(ibacm) # NO SPARSE
> diff --git a/kernel-headers/CMakeLists.txt b/kernel-headers/CMakeLists.txt
> index 3a526b9..5d280e4 100644
> --- a/kernel-headers/CMakeLists.txt
> +++ b/kernel-headers/CMakeLists.txt
> @@ -22,6 +22,7 @@ publish_internal_headers(rdma
>    rdma/rdma_user_ioctl.h
>    rdma/rdma_user_ioctl_cmds.h
>    rdma/rdma_user_rxe.h
> +  rdma/rdma_user_rfc.h
>    rdma/vmw_pvrdma-abi.h
>    )
>  
> @@ -69,6 +70,7 @@ rdma_kernel_provider_abi(
>    rdma/ocrdma-abi.h
>    rdma/qedr-abi.h
>    rdma/rdma_user_rxe.h
> +  rdma/rdma_user_rfc.h
>    rdma/vmw_pvrdma-abi.h
>    )
>  
> diff --git a/kernel-headers/rdma/rdma_user_rfc.h b/kernel-headers/rdma/rdma_user_rfc.h
> new file mode 100644
> index 0000000..8c6b10d
> --- /dev/null
> +++ b/kernel-headers/rdma/rdma_user_rfc.h
> @@ -0,0 +1,179 @@
> +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */
> +/*
> + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.

Do you really want to leave it like this?

> + *
> + * This software is available to you under a choice of one of two
> + * licenses.  You may choose to be licensed under the terms of the GNU
> + * General Public License (GPL) Version 2, available from the file
> + * COPYING in the main directory of this source tree, or the
> + * OpenIB.org BSD license below:
> + *
> + *     Redistribution and use in source and binary forms, with or
> + *     without modification, are permitted provided that the following
> + *     conditions are met:
> + *
> + *	- Redistributions of source code must retain the above
> + *	  copyright notice, this list of conditions and the following
> + *	  disclaimer.
> + *
> + *	- Redistributions in binary form must reproduce the above
> + *	  copyright notice, this list of conditions and the following
> + *	  disclaimer in the documentation and/or other materials
> + *	  provided with the distribution.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
> + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
> + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
> + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> + * SOFTWARE.
> + */
> +
> +#ifndef RDMA_USER_RXE_H
> +#define RDMA_USER_RXE_H

I suggest renaming this include guard to match the rfc header name.

> +
> +#include <linux/types.h>
> +#include <linux/socket.h>
> +#include <linux/in.h>
> +#include <linux/in6.h>
> +
> +union rfc_gid {
> +	__u8	raw[16];
> +	struct {
> +		__be64	subnet_prefix;
> +		__be64	interface_id;
> +	} global;
> +};

For my understanding, why can't we use ibv_gid?

> +
> +struct rfc_global_route {
> +	union rfc_gid	dgid;
> +	__u32		flow_label;
> +	__u8		sgid_index;
> +	__u8		hop_limit;
> +	__u8		traffic_class;
> +};
> +
> +struct rfc_av {
> +	__u8			port_num;
> +	__u8			network_type;
> +	__u16			reserved1;
> +	__u32			reserved2;
> +	struct rfc_global_route	grh;
> +	union {
> +		struct sockaddr_in	_sockaddr_in;
> +		struct sockaddr_in6	_sockaddr_in6;
> +	} sgid_addr, dgid_addr;
> +};
> +
> +struct rfc_send_wr {
> +	__aligned_u64		wr_id;
> +	__u32			num_sge;
> +	__u32			opcode;
> +	__u32			send_flags;
> +	union {
> +		__be32		imm_data;
> +		__u32		invalidate_rkey;
> +	} ex;
> +	union {
> +		struct {
> +			__aligned_u64 remote_addr;
> +			__u32	rkey;
> +			__u32	reserved;
> +		} rdma;
> +		struct {
> +			__aligned_u64 remote_addr;
> +			__aligned_u64 compare_add;
> +			__aligned_u64 swap;
> +			__u32	rkey;
> +			__u32	reserved;
> +		} atomic;
> +		struct {
> +			__u32	remote_qpn;
> +			__u32	remote_qkey;
> +			__u16	pkey_index;
> +		} ud;
> +		/* reg is only used by the kernel and is not part of the uapi */
> +		struct {
> +			union {
> +				struct ib_mr *mr;
> +				__aligned_u64 reserved;
> +			};
> +			__u32        key;
> +			__u32        access;
> +		} reg;
> +	} wr;
> +};
> +
> +struct rfc_sge {
> +	__aligned_u64 addr;
> +	__u32	length;
> +	__u32	lkey;
> +};
> +
> +struct mminfo {
> +	__aligned_u64  		offset;
> +	__u32			size;
> +	__u32			pad;
> +};
> +
> +struct rfc_dma_info {
> +	__u32			length;
> +	__u32			resid;
> +	__u32			cur_sge;
> +	__u32			num_sge;
> +	__u32			sge_offset;
> +	__u32			reserved;
> +	union {
> +		__u8		inline_data[0];
> +		struct rfc_sge	sge[0];
> +	};
> +};
> +
> +struct rfc_send_wqe {
> +	struct rfc_send_wr	wr;
> +	struct rfc_av		av;
> +	__u32			status;
> +	__u32			state;
> +	__aligned_u64		iova;
> +	__u32			mask;
> +	__u32			first_psn;
> +	__u32			last_psn;
> +	__u32			ack_length;
> +	__u32			ssn;
> +	__u32			has_rd_atomic;
> +	struct rfc_dma_info	dma;
> +};
> +
> +struct rfc_recv_wqe {
> +	__aligned_u64		wr_id;
> +	__u32			num_sge;
> +	__u32			padding;
> +	struct rfc_dma_info	dma;
> +};
> +
> +struct rfc_create_cq_resp {
> +	struct mminfo mi;
> +};
> +
> +struct rfc_resize_cq_resp {
> +	struct mminfo mi;
> +};
> +
> +struct rfc_create_qp_resp {
> +	struct mminfo rq_mi;
> +	struct mminfo sq_mi;
> +};
> +
> +struct rfc_create_srq_resp {
> +	struct mminfo mi;
> +	__u32 srq_num;
> +	__u32 reserved;
> +};
> +
> +struct rfc_modify_srq_cmd {
> +	__aligned_u64 mmap_info_addr;
> +};
> +
> +#endif /* RDMA_USER_RXE_H */
> diff --git a/providers/rfc/CMakeLists.txt b/providers/rfc/CMakeLists.txt
> new file mode 100644
> index 0000000..3123311
> --- /dev/null
> +++ b/providers/rfc/CMakeLists.txt
> @@ -0,0 +1,8 @@
> +rdma_provider(rfc
> +  rfc.c
> +  )
> +rdma_subst_install(FILES "rfc_cfg.in"
> +  RENAME "rfc_cfg"
> +  DESTINATION "${CMAKE_INSTALL_BINDIR}"
> +  PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ OWNER_EXECUTE GROUP_EXECUTE WORLD_EXECUTE
> +  )
> diff --git a/providers/rfc/man/CMakeLists.txt b/providers/rfc/man/CMakeLists.txt
> new file mode 100644
> index 0000000..145855c
> --- /dev/null
> +++ b/providers/rfc/man/CMakeLists.txt
> @@ -0,0 +1,4 @@
> +rdma_man_pages(
> +  rfc.7
> +  rfc_cfg.8
> +)
> diff --git a/providers/rfc/man/rfc.7 b/providers/rfc/man/rfc.7
> new file mode 100644
> index 0000000..594d6cd
> --- /dev/null
> +++ b/providers/rfc/man/rfc.7
> @@ -0,0 +1,77 @@
> +.\" -*- nroff -*-
> +.\"
> +.TH RFC 7 2011-06-29 1.0.0
> +.SH "NAME"
> +rfc \- Software RDMA over FC
> +.SH "SYNOPSIS"
> +\fBmodprobe rdma_rfc\fR
> +.br
> +This is usually performed by a configuration utility (see \fBrfc_cfg\fR(8).)
> +
> +.SH "DESCRIPTION"
> +The rdma_rfc kernel module provides a software implementation of RDMA over
> +Fibre channel. It encapsulates RDMA payloads in FC-NVMe READ/WRITE requests
> +and sends them over Fibre channel fabrics.
> +The InfiniBand (IB) Base Transport Header (BTH) is encapsulated in the FC-NVMe
> +header.
> +
> +Once a RFC instance has been created, communicating via RFC the same as
> +communicating via any OFED compatible Infiniband HCA, albeit in some cases with
> +addressing implications.
> +
> +Verbs applications written over IB verbs should work seamlessly except for the
> +following constraints in current release-
> +1. Partitioning is not supported. RFC module ignores any partition key in BTH.
> +2. Inline and Immediate data size >= 64KB is not supported.
> +3. only Reliable connection(RC) and Unreliable datagram(UD) type queue pairs
> +   are supported.
> +
> +.SH "FILES"
> +.TP
> +\fB/sys/class/infiniband/rfc[0,1,...]\fR
> +Directory that holds RDMA device information. The format is the same as other RDMA devices.
> +
> +.TP
> +\fB/sys/module/rdma_rfc_net/parameters/add\fR
> +Write only file used by \fBrfc_cfg(8)\fR to add new RFC devices to existing Ethernet devices.
> +
> +.TP
> +\fB/sys/module/rdma_rfc_net/parameters/remove\fR
> +Write only file used by \fBrfc_cfg(8)\fR to remove RFC devices.
> +
> +.TP
> +\fB/sys/module/rdma_rfc/parameters/max_qp\fR
> +Read/Write file that sets a limit on the number of QPs allowed per RFC device.
> +
> +.TP
> +\fB/sys/module/rdma_rfc/parameters/max_qp_wr\fR
> +Read/Write file that sets a limit on the number of WRs per QP allowed per RFC device.
> +
> +.TP
> +\fB/sys/module/rdma_rfc/parameters/max_mr\fR
> +Read/Write file that sets a limit on the number of MRs allowed per RFC device.
> +
> +.TP
> +\fB/sys/module/rdma_rfc/parameters/max_fmr\fR
> +Read/Write file that sets a limit on the number of FMRs allowed per RFC device.
> +
> +.TP
> +\fB/sys/module/rdma_rfc/parameters/max_cq\fR
> +Read/Write file that sets a limit on the number of CQs allowed per RFC device.
> +
> +.TP
> +\fB/sys/module/rdma_rfc/parameters/max_log_cqe\fR
> +Read/Write file that sets a limit on the log base 2 of the number of CQEs per CQ allowed per RFC device.
> +
> +.TP
> +\fB/sys/module/rdma_rfc/parameters/max_inline_data\fR
> +Read/Write file that sets a limit on the maximum amount of inline data per WR allowed per RFC device.
> +
> +The above configuration parameters only affect a new RFC instance when it is created not afterwards.
> +
> +.SH "SEE ALSO"
> +.BR rfc_cfg (8),
> +.BR verbs (7),
> +
> +.SH "AUTHORS"
> +Written by Muneendra Kumar, Anand Sundaram, Amit Tyagi at Broadcom INC.
> diff --git a/providers/rfc/man/rfc_cfg.8 b/providers/rfc/man/rfc_cfg.8
> new file mode 100644
> index 0000000..8c12bbf
> --- /dev/null
> +++ b/providers/rfc/man/rfc_cfg.8
> @@ -0,0 +1,70 @@
> +.\" -*- nroff -*-
> +.\"
> +.TH RFC_CFG 8 2011-06-29 1.0.0
> +.SH "NAME"
> +rfc_cfg \- rfc configuration tool for RFC (Soft RFC)
> +.SH "SYNOPSIS"
> +\fBrfc_cfg [status]\fR
> +.br
> +\fBrfc_cfg start\fR [\fB\-p\fR \fIproto\fR]
> +.br
> +\fBrfc_cfg stop\fR
> +.br
> +\fBrfc_cfg persistent\fR
> +.br
> +\fBrfc_cfg add\fR [\fB\-n\fR] \fIethN\fR
> +.br
> +\fBrfc_cfg remove\fR [\fB\-n\fR] \fIethN\fR|\fIrfcN\fR
> +.br
> +.SH "DESCRIPTION"
> +rfc_cfg is the configuration tool for the RFC software implementation of the RFC protocol.  
> +
> +The RFC kernel modules are loaded, configured, reconfigured and unloaded via the various rfc_cfg command options, documented below.
> +
> +.SH "PARAMETERS"
> +.TP
> +\fIethN\fR
> +Network device name as listed in /sys/class/net. Only RFC Ethernet devices are supported; ie. rfcnet0.
> +
> +.TP
> +\fIrfcN\fR
> +RFC device name as listed in /sys/class/infiniband/. Examples are rfc0 or rfc1.
> +
> +.SH "COMMANDS"
> +.TP
> +[\fBstatus\fR]
> +The \fBstatus\fR command prints a table of information on available Ethernet devices and configured RFC instances.  The status display is the default if no options are provided.
> +
> +.TP
> +\fBstart\fR [\fB\-p\fR \fIproto\fR]
> +The \fBstart\fR command loads the RFC modules and configures any persistent instances.
> +
> +.TP
> +\fBstop\fR
> +The \fBstop\fR command unconfigures all RFC instances and attempts to unload the kernel modules.
> +
> +.TP
> +\fBpersistent\fR
> +The \fBpersistent\fR command prints the list of Ethernet devices for which a RFC instance is persistently configured.
> +
> +.TP
> +\fBadd\fR [\fB\-n\fR] \fIethN\fR
> +The \fBadd\fR command will only configure a RFC instance on RFC Ethernet device \fIrfcnetN\fR (e.g. rfcnet0).  The RFC modules must have already been loaded via \fBrfc_cfg start\fR.
> +
> +The default behavior is to add \fIrfcnetN\fR to a file of persistent configurations and the same RFC device will be configured the next time that \fBrfc_cfg start\fR is run.  If the \fB-n\fR option is included the device is not added to the persistence file.
> +
> +.TP
> +\fBremove\fR [\fB\-n\fR] \fIethN\fR|\fIrfcN\fR
> +The \fBremove\fR command will remove the specified RFC instance.  The parameter must match a currently active rfcnetN or rfcN name.
> +
> +If the \fB-n\fR option is included the RFC device will be removed but not removed from the persistent state. So it will be recreated the next time that \fBrfc_cfg start\fR is run.
> +
> +.SH "FILES"
> +.TP
> +\fB[PREFIX]/etc/rfc.conf\fR
> +RFC configuration file. Contains the list of persistent RFC instances.  All persistent RFC instances can be removed by deleting this file (note this will take effect on the next "rfc_cfg start" -- to remove actively configured instances, you must "rfc_cfg stop").
> +
> +.SH "SEE ALSO"
> +.BR rfc (7),
> +.SH "AUTHORS"
> +Written by Muneendra Kumar, Anand Sundaram, Amit Tyagi at Broadcom INC.
> diff --git a/providers/rfc/rfc-abi.h b/providers/rfc/rfc-abi.h
> new file mode 100644
> index 0000000..a36a9ef
> --- /dev/null
> +++ b/providers/rfc/rfc-abi.h
> @@ -0,0 +1,53 @@
> +/*
> + * Copyright (c) 2009 Mellanox Technologies Ltd. All rights reserved.
> + * Copyright (c) 2009 System Fabric Works, Inc. All rights reserved.
> + *
> + * This software is available to you under a choice of one of two
> + * licenses.  You may choose to be licensed under the terms of the GNU
> + * General Public License (GPL) Version 2, available from the file
> + * COPYING in the main directory of this source tree, or the
> + * OpenIB.org BSD license below:
> + *
> + *     Redistribution and use in source and binary forms, with or
> + *     without modification, are permitted provided that the following
> + *     conditions are met:
> + *
> + *      - Redistributions of source code must retain the above
> + *        copyright notice, this list of conditions and the following
> + *        disclaimer.
> + *
> + *      - Redistributions in binary form must reproduce the above
> + *        copyright notice, this list of conditions and the following
> + *        disclaimer in the documentation and/or other materials
> + *        provided with the distribution.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
> + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
> + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
> + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> + * SOFTWARE.
> + *
> + */
> +
> +#ifndef RXE_ABI_H
> +#define RXE_ABI_H
> +
> +#include <infiniband/kern-abi.h>
> +#include <rdma/rdma_user_rfc.h>
> +#include <kernel-abi/rdma_user_rfc.h>
> +
> +DECLARE_DRV_CMD(urfc_create_cq, IB_USER_VERBS_CMD_CREATE_CQ,
> +		empty, rfc_create_cq_resp);
> +DECLARE_DRV_CMD(urfc_create_qp, IB_USER_VERBS_CMD_CREATE_QP,
> +		empty, rfc_create_qp_resp);
> +DECLARE_DRV_CMD(urfc_create_srq, IB_USER_VERBS_CMD_CREATE_SRQ,
> +		empty, rfc_create_srq_resp);
> +DECLARE_DRV_CMD(urfc_modify_srq, IB_USER_VERBS_CMD_MODIFY_SRQ,
> +		rfc_modify_srq_cmd, empty);
> +DECLARE_DRV_CMD(urfc_resize_cq, IB_USER_VERBS_CMD_RESIZE_CQ,
> +		empty, rfc_resize_cq_resp);
> +
> +#endif /* RXE_ABI_H */
> diff --git a/providers/rfc/rfc.c b/providers/rfc/rfc.c
> new file mode 100644
> index 0000000..0611bc1
> --- /dev/null
> +++ b/providers/rfc/rfc.c
> @@ -0,0 +1,926 @@
> +/*
> + * Copyright (c) 2009 Mellanox Technologies Ltd. All rights reserved.
> + * Copyright (c) 2009 System Fabric Works, Inc. All rights reserved.
> + * Copyright (C) 2006-2007 QLogic Corporation, All rights reserved.
> + * Copyright (c) 2005. PathScale, Inc. All rights reserved.
> + *
> + * This software is available to you under a choice of one of two
> + * licenses.  You may choose to be licensed under the terms of the GNU
> + * General Public License (GPL) Version 2, available from the file
> + * COPYING in the main directory of this source tree, or the
> + * OpenIB.org BSD license below:
> + *
> + *     Redistribution and use in source and binary forms, with or
> + *     without modification, are permitted provided that the following
> + *     conditions are met:
> + *
> + *	- Redistributions of source code must retain the above
> + *	  copyright notice, this list of conditions and the following
> + *	  disclaimer.
> + *
> + *	- Redistributions in binary form must reproduce the above
> + *	  copyright notice, this list of conditions and the following
> + *	  disclaimer in the documentation and/or other materials
> + *	  provided with the distribution.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
> + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
> + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
> + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> + * SOFTWARE.
> + */
> +
> +#include <config.h>
> +
> +#include <endian.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <string.h>
> +#include <pthread.h>
> +#include <netinet/in.h>
> +#include <sys/mman.h>
> +#include <errno.h>
> +
> +#include <endian.h>
> +#include <pthread.h>
> +#include <stddef.h>
> +
> +#include <infiniband/driver.h>
> +#include <infiniband/verbs.h>
> +
> +#include "rfc_queue.h"
> +#include "rfc-abi.h"
> +#include "rfc.h"
> +
> +static const struct verbs_match_ent hca_table[] = {
> +	/* FIXME: rfc needs a more reliable way to detect the rfc device */
> +	VERBS_NAME_MATCH("rfc", NULL),
> +	{},
> +};
> +
> +static int rfc_query_device(struct ibv_context *context,
> +			    struct ibv_device_attr *attr)
> +{
> +	struct ibv_query_device cmd;
> +	uint64_t raw_fw_ver;
> +	unsigned major, minor, sub_minor;
> +	int ret;
> +
> +	ret = ibv_cmd_query_device(context, attr, &raw_fw_ver,
> +				   &cmd, sizeof cmd);
> +	if (ret)
> +		return ret;
> +
> +	major = (raw_fw_ver >> 32) & 0xffff;
> +	minor = (raw_fw_ver >> 16) & 0xffff;
> +	sub_minor = raw_fw_ver & 0xffff;
> +
> +	snprintf(attr->fw_ver, sizeof attr->fw_ver,
> +		 "%d.%d.%d", major, minor, sub_minor);
> +
> +	return 0;
> +}
> +
> +static int rfc_query_port(struct ibv_context *context, uint8_t port,
> +			  struct ibv_port_attr *attr)
> +{
> +	struct ibv_query_port cmd;
> +
> +	return ibv_cmd_query_port(context, port, attr, &cmd, sizeof cmd);
> +}
> +
> +static struct ibv_pd *rfc_alloc_pd(struct ibv_context *context)
> +{
> +	struct ibv_alloc_pd cmd;
> +	struct ib_uverbs_alloc_pd_resp resp;
> +	struct ibv_pd *pd;
> +
> +	pd = malloc(sizeof *pd);
> +	if (!pd)
> +		return NULL;
> +
> +	if (ibv_cmd_alloc_pd(context, pd, &cmd, sizeof cmd, &resp, sizeof resp)) {
> +		free(pd);
> +		return NULL;
> +	}
> +
> +	return pd;
> +}
> +
> +static int rfc_dealloc_pd(struct ibv_pd *pd)
> +{
> +	int ret;
> +
> +	ret = ibv_cmd_dealloc_pd(pd);
> +	if (!ret)
> +		free(pd);
> +
> +	return ret;
> +}
> +
> +static struct ibv_mr *rfc_reg_mr(struct ibv_pd *pd, void *addr, size_t length,
> +				 int access)
> +{
> +	struct ibv_mr *mr;
> +	struct ibv_reg_mr cmd;
> +	struct ib_uverbs_reg_mr_resp resp;
> +	int ret;
> +
> +	mr = malloc(sizeof *mr);
> +	if (!mr) {
> +		return NULL;
> +	}

AFAIK curly braces are not needed here.

> +
> +	ret = ibv_cmd_reg_mr(pd, addr, length, (uintptr_t)addr, access, mr,
> +			     &cmd, sizeof cmd, &resp, sizeof resp);
> +	if (ret) {
> +		free(mr);
> +		return NULL;
> +	}
> +
> +	return mr;
> +}
> +
> +static int rfc_dereg_mr(struct ibv_mr *mr)
> +{
> +	int ret;
> +
> +	ret = ibv_cmd_dereg_mr(mr);
> +	if (ret)
> +		return ret;
> +
> +	free(mr);
> +	return 0;
> +}
> +
> +static struct ibv_cq *rfc_create_cq(struct ibv_context *context, int cqe,
> +				    struct ibv_comp_channel *channel,
> +				    int comp_vector)
> +{
> +	struct rfc_cq *cq;
> +	struct urfc_create_cq_resp resp;
> +	int ret;
> +
> +	cq = malloc(sizeof *cq);
> +	if (!cq) {
> +		return NULL;
> +	}
> +
> +	ret = ibv_cmd_create_cq(context, cqe, channel, comp_vector,
> +				&cq->ibv_cq, NULL, 0,
> +				&resp.ibv_resp, sizeof resp);
> +	if (ret) {
> +		free(cq);
> +		return NULL;
> +	}
> +
> +	cq->queue = mmap(NULL, resp.mi.size, PROT_READ | PROT_WRITE, MAP_SHARED,
> +			 context->cmd_fd, resp.mi.offset);
> +	if ((void *)cq->queue == MAP_FAILED) {
> +		ibv_cmd_destroy_cq(&cq->ibv_cq);
> +		free(cq);
> +		return NULL;
> +	}
> +
> +	cq->mmap_info = resp.mi;
> +	pthread_spin_init(&cq->lock, PTHREAD_PROCESS_PRIVATE);
> +
> +	return &cq->ibv_cq;
> +}
> +
> +static int rfc_resize_cq(struct ibv_cq *ibcq, int cqe)
> +{
> +	struct rfc_cq *cq = to_rcq(ibcq);
> +	struct ibv_resize_cq cmd;
> +	struct urfc_resize_cq_resp resp;
> +	int ret;
> +
> +	pthread_spin_lock(&cq->lock);
> +
> +	ret = ibv_cmd_resize_cq(ibcq, cqe, &cmd, sizeof cmd,
> +				&resp.ibv_resp, sizeof resp);
> +	if (ret) {
> +		pthread_spin_unlock(&cq->lock);
> +		return ret;
> +	}
> +
> +	munmap(cq->queue, cq->mmap_info.size);
> +
> +	cq->queue = mmap(NULL, resp.mi.size,
> +			 PROT_READ | PROT_WRITE, MAP_SHARED,
> +			 ibcq->context->cmd_fd, resp.mi.offset);
> +
> +	ret = errno;
> +	pthread_spin_unlock(&cq->lock);
> +
> +	if ((void *)cq->queue == MAP_FAILED) {
> +		cq->queue = NULL;
> +		cq->mmap_info.size = 0;
> +		return ret;
> +	}
> +
> +	cq->mmap_info = resp.mi;
> +
> +	return 0;
> +}
> +
> +static int rfc_destroy_cq(struct ibv_cq *ibcq)
> +{
> +	struct rfc_cq *cq = to_rcq(ibcq);
> +	int ret;
> +
> +	ret = ibv_cmd_destroy_cq(ibcq);
> +	if (ret)
> +		return ret;
> +
> +	if (cq->mmap_info.size)
> +		munmap(cq->queue, cq->mmap_info.size);
> +	free(cq);
> +
> +	return 0;
> +}
> +
> +static int rfc_poll_cq(struct ibv_cq *ibcq, int ne, struct ibv_wc *wc)
> +{
> +	struct rfc_cq *cq = to_rcq(ibcq);
> +	struct rfc_queue *q;
> +	int npolled;
> +	uint8_t *src;
> +
> +	pthread_spin_lock(&cq->lock);
> +	q = cq->queue;
> +
> +	for (npolled = 0; npolled < ne; ++npolled, ++wc) {
> +		if (queue_empty(q))
> +			break;
> +
> +		atomic_thread_fence(memory_order_acquire);
> +		src = consumer_addr(q);
> +		memcpy(wc, src, sizeof(*wc));
> +		advance_consumer(q);
> +	}
> +
> +	pthread_spin_unlock(&cq->lock);
> +	return npolled;
> +}
> +
> +static struct ibv_srq *rfc_create_srq(struct ibv_pd *pd,
> +				      struct ibv_srq_init_attr *attr)
> +{
> +	struct rfc_srq *srq;
> +	struct ibv_create_srq cmd;
> +	struct urfc_create_srq_resp resp;
> +	int ret;
> +
> +	srq = malloc(sizeof *srq);
> +	if (srq == NULL) {
> +		return NULL;
> +	}

AFAIK curly braces are not needed here.

> +
> +	ret = ibv_cmd_create_srq(pd, &srq->ibv_srq, attr, &cmd, sizeof cmd,
> +				 &resp.ibv_resp, sizeof resp);
> +	if (ret) {
> +		free(srq);
> +		return NULL;
> +	}
> +
> +	srq->rq.queue = mmap(NULL, resp.mi.size,
> +			     PROT_READ | PROT_WRITE, MAP_SHARED,
> +			     pd->context->cmd_fd, resp.mi.offset);
> +	if ((void *)srq->rq.queue == MAP_FAILED) {
> +		ibv_cmd_destroy_srq(&srq->ibv_srq);
> +		free(srq);
> +		return NULL;
> +	}

Usually, when a function has more than one exit flow and each one
repeats the cleanup of its predecessor, I see that goto with labels
is used.

> +
> +	srq->mmap_info = resp.mi;
> +	srq->rq.max_sge = attr->attr.max_sge;
> +	pthread_spin_init(&srq->rq.lock, PTHREAD_PROCESS_PRIVATE);
> +
> +	return &srq->ibv_srq;
> +}
> +
> +static int rfc_modify_srq(struct ibv_srq *ibsrq,
> +		   struct ibv_srq_attr *attr, int attr_mask)
> +{
> +	struct rfc_srq *srq = to_rsrq(ibsrq);
> +	struct urfc_modify_srq cmd;
> +	int rc = 0;
> +	struct mminfo mi;
> +
> +	mi.offset = 0;
> +	mi.size = 0;
> +
> +	if (attr_mask & IBV_SRQ_MAX_WR)
> +		pthread_spin_lock(&srq->rq.lock);
> +
> +	cmd.mmap_info_addr = (__u64)(uintptr_t) & mi;
> +	rc = ibv_cmd_modify_srq(ibsrq, attr, attr_mask,
> +				&cmd.ibv_cmd, sizeof cmd);
> +	if (rc)
> +		goto out;
> +
> +	if (attr_mask & IBV_SRQ_MAX_WR) {
> +		(void)munmap(srq->rq.queue, srq->mmap_info.size);
> +		srq->rq.queue = mmap(NULL, mi.size,
> +				     PROT_READ | PROT_WRITE, MAP_SHARED,
> +				     ibsrq->context->cmd_fd, mi.offset);
> +
> +		if ((void *)srq->rq.queue == MAP_FAILED) {
> +			rc = errno;
> +			srq->rq.queue = NULL;
> +			srq->mmap_info.size = 0;
> +			goto out;
> +		}
> +
> +		srq->mmap_info = mi;
> +	}
> +
> +out:
> +	if (attr_mask & IBV_SRQ_MAX_WR)
> +		pthread_spin_unlock(&srq->rq.lock);
> +	return rc;
> +}
> +
> +static int rfc_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *attr)
> +{
> +	struct ibv_query_srq cmd;
> +
> +	return ibv_cmd_query_srq(srq, attr, &cmd, sizeof cmd);
> +}
> +
> +static int rfc_destroy_srq(struct ibv_srq *ibvsrq)
> +{
> +	int ret;
> +	struct rfc_srq *srq = to_rsrq(ibvsrq);
> +	struct rfc_queue *q = srq->rq.queue;
> +
> +	ret = ibv_cmd_destroy_srq(ibvsrq);
> +	if (!ret) {
> +		if (srq->mmap_info.size)
> +			munmap(q, srq->mmap_info.size);
> +		free(srq);
> +	}
> +
> +	return ret;
> +}
> +
> +static int rfc_post_one_recv(struct rfc_wq *rq, struct ibv_recv_wr *recv_wr)
> +{
> +	int i;
> +	struct rfc_recv_wqe *wqe;
> +	struct rfc_queue *q = rq->queue;
> +	int length = 0;
> +	int rc = 0;
> +
> +	if (queue_full(q)) {
> +		rc  = -ENOMEM;
> +		goto out;
> +	}
> +
> +	if (recv_wr->num_sge > rq->max_sge) {
> +		rc = -EINVAL;
> +		goto out;
> +	}
> +
> +	wqe = (struct rfc_recv_wqe *)producer_addr(q);
> +
> +	wqe->wr_id = recv_wr->wr_id;
> +	wqe->num_sge = recv_wr->num_sge;
> +
> +	memcpy(wqe->dma.sge, recv_wr->sg_list,
> +	       wqe->num_sge*sizeof(*wqe->dma.sge));
> +
> +	for (i = 0; i < wqe->num_sge; i++) {
> +		length += wqe->dma.sge[i].length;
> +	}
> +
> +	wqe->dma.length = length;
> +	wqe->dma.resid = length;
> +	wqe->dma.cur_sge = 0;
> +	wqe->dma.num_sge = wqe->num_sge;
> +	wqe->dma.sge_offset = 0;
> +
> +	advance_producer(q);
> +
> +out:
> +	return rc;
> +}
> +
> +static int rfc_post_srq_recv(struct ibv_srq *ibvsrq,
> +			     struct ibv_recv_wr *recv_wr,
> +			     struct ibv_recv_wr **bad_recv_wr)
> +{
> +	struct rfc_srq *srq = to_rsrq(ibvsrq);
> +	int rc = 0;
> +
> +	pthread_spin_lock(&srq->rq.lock);
> +
> +	while (recv_wr) {
> +		rc = rfc_post_one_recv(&srq->rq, recv_wr);
> +		if (rc) {
> +			*bad_recv_wr = recv_wr;
> +			break;
> +		}
> +
> +		recv_wr = recv_wr->next;
> +	}
> +
> +	pthread_spin_unlock(&srq->rq.lock);
> +
> +	return rc;
> +}
> +
> +static struct ibv_qp *rfc_create_qp(struct ibv_pd *pd,
> +				    struct ibv_qp_init_attr *attr)
> +{
> +	struct ibv_create_qp cmd;
> +	struct urfc_create_qp_resp resp;
> +	struct rfc_qp *qp;
> +	int ret;
> +
> +	qp = malloc(sizeof *qp);
> +	if (!qp) {
> +		return NULL;
> +	}

Curly braces.

> +
> +	ret = ibv_cmd_create_qp(pd, &qp->ibv_qp, attr, &cmd, sizeof cmd,
> +				&resp.ibv_resp, sizeof resp);
> +	if (ret) {
> +		free(qp);
> +		return NULL;
> +	}
> +
> +	if (attr->srq) {
> +		qp->rq.max_sge = 0;
> +		qp->rq.queue = NULL;
> +		qp->rq_mmap_info.size = 0;
> +	} else {
> +		qp->rq.max_sge = attr->cap.max_recv_sge;
> +		qp->rq.queue = mmap(NULL, resp.rq_mi.size, PROT_READ | PROT_WRITE,
> +				    MAP_SHARED,
> +				    pd->context->cmd_fd, resp.rq_mi.offset);
> +		if ((void *)qp->rq.queue == MAP_FAILED) {
> +			ibv_cmd_destroy_qp(&qp->ibv_qp);
> +			free(qp);
> +			return NULL;
> +		}
> +
> +		qp->rq_mmap_info = resp.rq_mi;
> +		pthread_spin_init(&qp->rq.lock, PTHREAD_PROCESS_PRIVATE);
> +	}
> +
> +	qp->sq.max_sge = attr->cap.max_send_sge;
> +	qp->sq.max_inline = attr->cap.max_inline_data;
> +	qp->sq.queue = mmap(NULL, resp.sq_mi.size, PROT_READ | PROT_WRITE,
> +			    MAP_SHARED,
> +			    pd->context->cmd_fd, resp.sq_mi.offset);
> +	if ((void *)qp->sq.queue == MAP_FAILED) {
> +		if (qp->rq_mmap_info.size)
> +			munmap(qp->rq.queue, qp->rq_mmap_info.size);
> +		ibv_cmd_destroy_qp(&qp->ibv_qp);
> +		free(qp);

I suggest using the goto-label pattern here.

> +		return NULL;
> +	}
> +
> +	qp->sq_mmap_info = resp.sq_mi;
> +	pthread_spin_init(&qp->sq.lock, PTHREAD_PROCESS_PRIVATE);
> +
> +	return &qp->ibv_qp;
> +}
> +
> +static int rfc_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
> +			int attr_mask,
> +			struct ibv_qp_init_attr *init_attr)
> +{
> +	struct ibv_query_qp cmd;
> +
> +	return ibv_cmd_query_qp(qp, attr, attr_mask, init_attr,
> +				&cmd, sizeof cmd);
> +}
> +
> +static int rfc_modify_qp(struct ibv_qp *ibvqp,
> +			 struct ibv_qp_attr *attr,
> +			 int attr_mask)
> +{
> +	struct ibv_modify_qp cmd = {};
> +
> +	return ibv_cmd_modify_qp(ibvqp, attr, attr_mask, &cmd, sizeof cmd);
> +}
> +
> +static int rfc_destroy_qp(struct ibv_qp *ibv_qp)
> +{
> +	int ret;
> +	struct rfc_qp *qp = to_rqp(ibv_qp);
> +
> +	ret = ibv_cmd_destroy_qp(ibv_qp);
> +	if (!ret) {
> +		if (qp->rq_mmap_info.size)
> +			munmap(qp->rq.queue, qp->rq_mmap_info.size);
> +		if (qp->sq_mmap_info.size)
> +			munmap(qp->sq.queue, qp->sq_mmap_info.size);
> +
> +		free(qp);
> +	}
> +
> +	return ret;
> +}
> +
> +/* basic sanity checks for send work request */
> +static int validate_send_wr(struct rfc_wq *sq, struct ibv_send_wr *ibwr,
> +			    unsigned int length)
> +{
> +	enum ibv_wr_opcode opcode = ibwr->opcode;
> +
> +	if (ibwr->num_sge > sq->max_sge)
> +		return -EINVAL;
> +
> +	if ((opcode == IBV_WR_ATOMIC_CMP_AND_SWP)
> +	    || (opcode == IBV_WR_ATOMIC_FETCH_AND_ADD))
> +		if (length < 8 || ibwr->wr.atomic.remote_addr & 0x7)
> +			return -EINVAL;
> +
> +	if ((ibwr->send_flags & IBV_SEND_INLINE) && (length > sq->max_inline))
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
> +static void convert_send_wr(struct rfc_send_wr *kwr, struct ibv_send_wr *uwr)
> +{
> +	memset(kwr, 0, sizeof(*kwr));
> +
> +	kwr->wr_id		= uwr->wr_id;
> +	kwr->num_sge		= uwr->num_sge;
> +	kwr->opcode		= uwr->opcode;
> +	kwr->send_flags		= uwr->send_flags;
> +	kwr->ex.imm_data	= uwr->imm_data;
> +
> +	switch(uwr->opcode) {
> +	case IBV_WR_RDMA_WRITE:
> +	case IBV_WR_RDMA_WRITE_WITH_IMM:
> +	case IBV_WR_RDMA_READ:
> +		kwr->wr.rdma.remote_addr	= uwr->wr.rdma.remote_addr;
> +		kwr->wr.rdma.rkey		= uwr->wr.rdma.rkey;
> +		break;
> +
> +	case IBV_WR_SEND:
> +	case IBV_WR_SEND_WITH_IMM:
> +		kwr->wr.ud.remote_qpn		= uwr->wr.ud.remote_qpn;
> +		kwr->wr.ud.remote_qkey		= uwr->wr.ud.remote_qkey;
> +		break;
> +
> +	case IBV_WR_ATOMIC_CMP_AND_SWP:
> +	case IBV_WR_ATOMIC_FETCH_AND_ADD:
> +		kwr->wr.atomic.remote_addr	= uwr->wr.atomic.remote_addr;
> +		kwr->wr.atomic.compare_add	= uwr->wr.atomic.compare_add;
> +		kwr->wr.atomic.swap		= uwr->wr.atomic.swap;
> +		kwr->wr.atomic.rkey		= uwr->wr.atomic.rkey;
> +		break;
> +
> +	case IBV_WR_LOCAL_INV:
> +	case IBV_WR_BIND_MW:
> +	case IBV_WR_SEND_WITH_INV:
> +	case IBV_WR_TSO:
> +		break;
> +	}
> +}
> +
> +static int init_send_wqe(struct rfc_qp *qp, struct rfc_wq *sq,
> +		  struct ibv_send_wr *ibwr, unsigned int length,
> +		  struct rfc_send_wqe *wqe)
> +{
> +	int num_sge = ibwr->num_sge;
> +	int i;
> +	unsigned int opcode = ibwr->opcode;
> +
> +	convert_send_wr(&wqe->wr, ibwr);
> +
> +	if (qp_type(qp) == IBV_QPT_UD)
> +		memcpy(&wqe->av, &to_rah(ibwr->wr.ud.ah)->av,
> +		       sizeof(struct rfc_av));
> +
> +	if (ibwr->send_flags & IBV_SEND_INLINE) {
> +		uint8_t *inline_data = wqe->dma.inline_data;
> +
> +		for (i = 0; i < num_sge; i++) {
> +			memcpy(inline_data,
> +			       (uint8_t *)(long)ibwr->sg_list[i].addr,
> +			       ibwr->sg_list[i].length);
> +			inline_data += ibwr->sg_list[i].length;
> +		}
> +	} else
> +		memcpy(wqe->dma.sge, ibwr->sg_list,
> +		       num_sge*sizeof(struct ibv_sge));
> +
> +	if ((opcode == IBV_WR_ATOMIC_CMP_AND_SWP)
> +	    || (opcode == IBV_WR_ATOMIC_FETCH_AND_ADD))
> +		wqe->iova	= ibwr->wr.atomic.remote_addr;
> +	else
> +		wqe->iova	= ibwr->wr.rdma.remote_addr;
> +	wqe->dma.length		= length;
> +	wqe->dma.resid		= length;
> +	wqe->dma.num_sge	= num_sge;
> +	wqe->dma.cur_sge	= 0;
> +	wqe->dma.sge_offset	= 0;
> +	wqe->state		= 0;
> +	wqe->ssn		= qp->ssn++;
> +
> +	return 0;

Please make this function return void.

> +}
> +
> +static int post_one_send(struct rfc_qp *qp, struct rfc_wq *sq,
> +			 struct ibv_send_wr *ibwr)
> +{
> +	int err;
> +	struct rfc_send_wqe *wqe;
> +	unsigned int length = 0;
> +	int i;
> +
> +	for (i = 0; i < ibwr->num_sge; i++)
> +		length += ibwr->sg_list[i].length;
> +
> +	err = validate_send_wr(sq, ibwr, length);
> +	if (err) {
> +		printf("validate send failed\n");
> +		return err;
> +	}
> +
> +	wqe = (struct rfc_send_wqe *)producer_addr(sq->queue);
> +
> +	err = init_send_wqe(qp, sq, ibwr, length, wqe);
> +	if (err)
> +		return err;
> +
> +	if (queue_full(sq->queue))
> +		return -ENOMEM;
> +
> +	advance_producer(sq->queue);
> +
> +	return 0;
> +}
> +
> +/* send a null post send as a doorbell */
> +static int post_send_db(struct ibv_qp *ibqp)
> +{
> +	struct ibv_post_send cmd;
> +	struct ib_uverbs_post_send_resp resp;
> +
> +	cmd.hdr.command	= IB_USER_VERBS_CMD_POST_SEND;
> +	cmd.hdr.in_words = sizeof(cmd) / 4;
> +	cmd.hdr.out_words = sizeof(resp) / 4;
> +	cmd.response	= (uintptr_t)&resp;
> +	cmd.qp_handle	= ibqp->handle;
> +	cmd.wr_count	= 0;
> +	cmd.sge_count	= 0;
> +	cmd.wqe_size	= sizeof(struct ibv_send_wr);
> +
> +	if (write(ibqp->context->cmd_fd, &cmd, sizeof(cmd)) != sizeof(cmd))
> +		return errno;
> +
> +	return 0;
> +}
> +
> +/* this API does not make a distinction between
> +   restartable and non-restartable errors */
> +static int rfc_post_send(struct ibv_qp *ibqp,
> +			 struct ibv_send_wr *wr_list,
> +			 struct ibv_send_wr **bad_wr)
> +{
> +	int rc = 0;
> +	int err;
> +	struct rfc_qp *qp = to_rqp(ibqp);
> +	struct rfc_wq *sq = &qp->sq;
> +
> +	if (!bad_wr)
> +		return EINVAL;
> +
> +	*bad_wr = NULL;
> +
> +	if (!sq || !wr_list || !sq->queue)
> +	 	return EINVAL;
> +
> +	pthread_spin_lock(&sq->lock);
> +
> +	while (wr_list) {
> +		rc = post_one_send(qp, sq, wr_list);
> +		if (rc) {
> +			*bad_wr = wr_list;
> +			break;
> +		}
> +
> +		wr_list = wr_list->next;
> +	}
> +
> +	pthread_spin_unlock(&sq->lock);
> +
> +	err =  post_send_db(ibqp);

Extra space.

> +	return err ? err : rc;
> +}
> +
> +static int rfc_post_recv(struct ibv_qp *ibqp,
> +			 struct ibv_recv_wr *recv_wr,
> +			 struct ibv_recv_wr **bad_wr)
> +{
> +	int rc = 0;
> +	struct rfc_qp *qp = to_rqp(ibqp);
> +	struct rfc_wq *rq = &qp->rq;
> +
> +	if (!bad_wr)
> +		return EINVAL;
> +
> +	*bad_wr = NULL;
> +
> +	if (!rq || !recv_wr || !rq->queue)
> +		return EINVAL;
> +
> +	pthread_spin_lock(&rq->lock);
> +
> +	while (recv_wr) {
> +		rc = rfc_post_one_recv(rq, recv_wr);
> +		if (rc) {
> +			*bad_wr = recv_wr;
> +			break;
> +		}
> +
> +		recv_wr = recv_wr->next;
> +	}
> +
> +	pthread_spin_unlock(&rq->lock);
> +
> +	return rc;
> +}
> +
> +static inline int ipv6_addr_v4mapped(const struct in6_addr *a)
> +{
> +	return IN6_IS_ADDR_V4MAPPED(a);
> +}
> +
> +typedef typeof(((struct rfc_av *)0)->sgid_addr) sockaddr_union_t;
> +
> +static inline int rdma_gid2ip(sockaddr_union_t *out, union ibv_gid *gid)
> +{
> +	if (ipv6_addr_v4mapped((struct in6_addr *)gid)) {
> +		memset(&out->_sockaddr_in, 0, sizeof(out->_sockaddr_in));
> +		memcpy(&out->_sockaddr_in.sin_addr.s_addr, gid->raw + 12, 4);
> +	} else {
> +		memset(&out->_sockaddr_in6, 0, sizeof(out->_sockaddr_in6));
> +		out->_sockaddr_in6.sin6_family = AF_INET6;
> +		memcpy(&out->_sockaddr_in6.sin6_addr.s6_addr, gid->raw, 16);
> +	}
> +	return 0;

Please make this function return void.

> +}
> +
> +static struct ibv_ah *rfc_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr)
> +{
> +	int err;
> +	struct rfc_ah *ah;
> +	struct rfc_av *av;
> +	union ibv_gid sgid;
> +	struct ib_uverbs_create_ah_resp resp;
> +
> +	err = ibv_query_gid(pd->context, attr->port_num, attr->grh.sgid_index,
> +			    &sgid);
> +	if (err) {
> +		fprintf(stderr, "rfc: Failed to query sgid.\n");

I'm not sure we want to print from a library (this is a library, isn't it?).

Also, what is so special about this error, given that the other error paths do not print anything?

> +		return NULL;
> +	}
> +
> +	ah = malloc(sizeof *ah);
> +	if (ah == NULL)
> +		return NULL;
> +
> +	av = &ah->av;
> +	av->port_num = attr->port_num;
> +	memcpy(&av->grh, &attr->grh, sizeof(attr->grh));
> +	av->network_type =
> +		ipv6_addr_v4mapped((struct in6_addr *)attr->grh.dgid.raw) ?
> +		RDMA_NETWORK_IPV4 : RDMA_NETWORK_IPV6;
> +
> +	rdma_gid2ip(&av->sgid_addr, &sgid);
> +	rdma_gid2ip(&av->dgid_addr, &attr->grh.dgid);
> +
> +	memset(&resp, 0, sizeof(resp));
> +	if (ibv_cmd_create_ah(pd, &ah->ibv_ah, attr, &resp, sizeof(resp))) {
> +		free(ah);
> +		return NULL;
> +	}
> +
> +	return &ah->ibv_ah;
> +}
> +
> +static int rfc_destroy_ah(struct ibv_ah *ibah)
> +{
> +	int ret;
> +	struct rfc_ah *ah = to_rah(ibah);
> +
> +	ret = ibv_cmd_destroy_ah(&ah->ibv_ah);
> +	if (ret)
> +		return ret;
> +
> +	free(ah);
> +	return 0;
> +}
> +
> +static const struct verbs_context_ops rfc_ctx_ops = {
> +	.query_device = rfc_query_device,
> +	.query_port = rfc_query_port,
> +	.alloc_pd = rfc_alloc_pd,
> +	.dealloc_pd = rfc_dealloc_pd,
> +	.reg_mr = rfc_reg_mr,
> +	.dereg_mr = rfc_dereg_mr,
> +	.create_cq = rfc_create_cq,
> +	.poll_cq = rfc_poll_cq,
> +	.req_notify_cq = ibv_cmd_req_notify_cq,
> +	.resize_cq = rfc_resize_cq,
> +	.destroy_cq = rfc_destroy_cq,
> +	.create_srq = rfc_create_srq,
> +	.modify_srq = rfc_modify_srq,
> +	.query_srq = rfc_query_srq,
> +	.destroy_srq = rfc_destroy_srq,
> +	.post_srq_recv = rfc_post_srq_recv,
> +	.create_qp = rfc_create_qp,
> +	.query_qp = rfc_query_qp,
> +	.modify_qp = rfc_modify_qp,
> +	.destroy_qp = rfc_destroy_qp,
> +	.post_send = rfc_post_send,
> +	.post_recv = rfc_post_recv,
> +	.create_ah = rfc_create_ah,
> +	.destroy_ah = rfc_destroy_ah,
> +	.attach_mcast = ibv_cmd_attach_mcast,
> +	.detach_mcast = ibv_cmd_detach_mcast
> +};
> +
> +static struct verbs_context *rfc_alloc_context(struct ibv_device *ibdev,
> +					       int cmd_fd)
> +{
> +	struct rfc_context *context;
> +	struct ibv_get_context cmd;
> +	struct ib_uverbs_get_context_resp resp;
> +
> +	context = verbs_init_and_alloc_context(ibdev, cmd_fd, context, ibv_ctx,
> +					       RDMA_DRIVER_RXE);
> +	if (!context)
> +		return NULL;
> +
> +	if (ibv_cmd_get_context(&context->ibv_ctx, &cmd,
> +				sizeof cmd, &resp, sizeof resp))
> +		goto out;
> +
> +	verbs_set_ops(&context->ibv_ctx, &rfc_ctx_ops);
> +
> +	return &context->ibv_ctx;
> +
> +out:
> +	verbs_uninit_context(&context->ibv_ctx);
> +	free(context);
> +	return NULL;
> +}
> +
> +static void rfc_free_context(struct ibv_context *ibctx)
> +{
> +	struct rfc_context *context = to_rctx(ibctx);
> +
> +	verbs_uninit_context(&context->ibv_ctx);
> +	free(context);
> +}
> +
> +static void rfc_uninit_device(struct verbs_device *verbs_device)
> +{
> +	struct rfc_device *dev = to_rdev(&verbs_device->device);
> +
> +	free(dev);
> +}
> +
> +static struct verbs_device *rfc_device_alloc(struct verbs_sysfs_dev *sysfs_dev)
> +{
> +	struct rfc_device *dev;
> +	dev = calloc(1, sizeof(*dev));
> +	if (!dev)
> +		return NULL;
> +
> +	dev->abi_version = sysfs_dev->abi_ver;
> +
> +	return &dev->ibv_dev;
> +}
> +
> +static const struct verbs_device_ops rfc_dev_ops = {
> +	.name = "rfc",
> +	/*
> +	 * For 64 bit machines ABI version 1 and 2 are the same. Otherwise 32
> +	 * bit machines require ABI version 2 which guarentees the user and
> +	 * kernel use the same ABI.
> +	 */
> +	.match_min_abi_version = sizeof(void *) == 8?1:2,
> +	.match_max_abi_version = 2,
> +	.match_table = hca_table,
> +	.alloc_device = rfc_device_alloc,
> +	.uninit_device = rfc_uninit_device,
> +	.alloc_context = rfc_alloc_context,
> +	.free_context = rfc_free_context,
> +};
> +PROVIDER_DRIVER(rfc_dev_ops);
> diff --git a/providers/rfc/rfc.h b/providers/rfc/rfc.h
> new file mode 100644
> index 0000000..8313b19
> --- /dev/null
> +++ b/providers/rfc/rfc.h
> @@ -0,0 +1,129 @@
> +/*
> + * Copyright (c) 2009 Mellanox Technologies Ltd. All rights reserved.
> + * Copyright (c) 2009 System Fabric Works, Inc. All rights reserved.
> + * Copyright (c) 2006-2007 QLogic Corp. All rights reserved.
> + * Copyright (c) 2005. PathScale, Inc. All rights reserved.

Don't you want to add Broadcom here?

> + *
> + * This software is available to you under a choice of one of two
> + * licenses.  You may choose to be licensed under the terms of the GNU
> + * General Public License (GPL) Version 2, available from the file
> + * COPYING in the main directory of this source tree, or the
> + * OpenIB.org BSD license below:
> + *
> + *     Redistribution and use in source and binary forms, with or
> + *     without modification, are permitted provided that the following
> + *     conditions are met:
> + *
> + *	- Redistributions of source code must retain the above
> + *	  copyright notice, this list of conditions and the following
> + *	  disclaimer.
> + *
> + *	- Redistributions in binary form must reproduce the above
> + *	  copyright notice, this list of conditions and the following
> + *	  disclaimer in the documentation and/or other materials
> + *	  provided with the distribution.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
> + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
> + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
> + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> + * SOFTWARE.
> + */
> +
> +#ifndef RXE_H
> +#define RXE_H

RXE?

> +
> +#include <infiniband/driver.h>
> +#include <sys/socket.h>
> +#include <netinet/in.h>
> +#include <rdma/rdma_user_rfc.h> /* struct rfc_av */
> +#include "rfc-abi.h"
> +
> +enum rdma_network_type {
> +	RDMA_NETWORK_IB,
> +	RDMA_NETWORK_IPV4,
> +	RDMA_NETWORK_IPV6
> +};

If both RXE and RFC use it, can we move it to a generic place?

> +
> +struct rfc_device {
> +	struct verbs_device	ibv_dev;
> +	int	abi_version;
> +};
> +
> +struct rfc_context {
> +	struct verbs_context	ibv_ctx;
> +};
> +
> +struct rfc_cq {
> +	struct ibv_cq		ibv_cq;
> +	struct mminfo		mmap_info;
> +	struct rfc_queue		*queue;
> +	pthread_spinlock_t	lock;
> +};
> +
> +struct rfc_ah {
> +	struct ibv_ah		ibv_ah;
> +	struct rfc_av		av;
> +};
> +
> +struct rfc_wq {
> +	struct rfc_queue	*queue;
> +	pthread_spinlock_t	lock;
> +	unsigned int		max_sge;
> +	unsigned int		max_inline;
> +};
> +
> +struct rfc_qp {
> +	struct ibv_qp		ibv_qp;
> +	struct mminfo		rq_mmap_info;
> +	struct rfc_wq		rq;
> +	struct mminfo		sq_mmap_info;
> +	struct rfc_wq		sq;
> +	unsigned int		ssn;
> +};
> +
> +#define qp_type(qp)		((qp)->ibv_qp.qp_type)
> +
> +struct rfc_srq {
> +	struct ibv_srq		ibv_srq;
> +	struct mminfo		mmap_info;
> +	struct rfc_wq		rq;
> +	uint32_t		srq_num;
> +};
> +
> +#define to_rxxx(xxx, type) container_of(ib##xxx, struct rfc_##type, ibv_##xxx)
> +
> +static inline struct rfc_context *to_rctx(struct ibv_context *ibctx)
> +{
> +	return container_of(ibctx, struct rfc_context, ibv_ctx.context);
> +}
> +
> +static inline struct rfc_device *to_rdev(struct ibv_device *ibdev)
> +{
> +	return container_of(ibdev, struct rfc_device, ibv_dev.device);
> +}
> +
> +static inline struct rfc_cq *to_rcq(struct ibv_cq *ibcq)
> +{
> +	return to_rxxx(cq, cq);
> +}
> +
> +static inline struct rfc_qp *to_rqp(struct ibv_qp *ibqp)
> +{
> +	return to_rxxx(qp, qp);
> +}
> +
> +static inline struct rfc_srq *to_rsrq(struct ibv_srq *ibsrq)
> +{
> +	return to_rxxx(srq, srq);
> +}
> +
> +static inline struct rfc_ah *to_rah(struct ibv_ah *ibah)
> +{
> +	return to_rxxx(ah, ah);
> +}
> +
> +#endif /* RXE_H */

RXE?

> diff --git a/providers/rfc/rfc_cfg.in b/providers/rfc/rfc_cfg.in
> new file mode 100755
> index 0000000..0a8583d
> --- /dev/null
> +++ b/providers/rfc/rfc_cfg.in
> @@ -0,0 +1,674 @@
> +#!/usr/bin/perl
> +
> +# * Copyright (c) 2009-2011 Mellanox Technologies Ltd. All rights reserved.
> +# * Copyright (c) 2009-2011 System Fabric Works, Inc. All rights reserved.

Don't you want to add Broadcom here?

> +# *
> +# * This software is available to you under a choice of one of two
> +# * licenses.  You may choose to be licensed under the terms of the GNU
> +# * General Public License (GPL) Version 2, available from the file
> +# * COPYING in the main directory of this source tree, or the
> +# * OpenIB.org BSD license below:
> +# *
> +# *     Redistribution and use in source and binary forms, with or
> +# *     without modification, are permitted provided that the following
> +# *     conditions are met:
> +# *
> +# *	- Redistributions of source code must retain the above
> +# *	  copyright notice, this list of conditions and the following
> +# *	  disclaimer.
> +# *
> +# *	- Redistributions in binary form must reproduce the above
> +# *	  copyright notice, this list of conditions and the following
> +# *	  disclaimer in the documentation and/or other materials
> +# *	  provided with the distribution.
> +# *
> +# * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> +# * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> +# * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> +# * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
> +# * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
> +# * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
> +# * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> +# * SOFTWARE.
> +#
> +
> +use warnings;
> +use strict;
> +
> +use File::Basename;
> +use File::Path qw(make_path);
> +use Getopt::Long;
> +
> +my $help = 0;
> +my $no_persist = 0;
> +my $debug = 0;
> +my $force = 0;
> +my $linkonly = 0;
> +my $parms = "/sys/module/rdma_rxe/parameters";
> +my $modprobe_opt = "";
> +my $modprobe_checked = "0";
> +my $persistence_path = "@CMAKE_INSTALL_FULL_SHAREDSTATEDIR@/rxe";
> +my $persistence_file = "${persistence_path}/rxe";
> +my $num_persistent = 0;
> +my $sys = "/sys/module/rdma_rxe/parameters";
> +my %rxe_names;
> +my @rxe_array;
> +my %eth_names;
> +my @eth_list;
> +my %eth_driver;
> +my %link_state;
> +my %link_speed;
> +my %eth_mtu;
> +my %ipv4_addr;
> +my %rxe_mtu;
> +my @persistence_array;
> +my %persistence_hash;
> +my @mlx4_port;
> +my @mlx4_ether;
> +my @roce_list;
> +
> +# Read a file and return its contents as a string.
> +sub read_file {
> +    my $filename = shift;
> +    my $result = "";
> +
> +    if (open(FILE, $filename)) {
> +	$result = <FILE>;
> +	close FILE;
> +    }
> +    return $result;
> +}
> +
> +#get mapping between rxe and eth devices
> +sub get_names {
> +    my $i = 0;
> +    
> +    foreach my $rxe (glob("/sys/class/infiniband/rxe*")) {
> +	$rxe = basename($rxe);
> +	my $eth = read_file("/sys/class/infiniband/$rxe/parent");
> +	chomp($eth);
> +	
> +	if (($eth =~ /[\w]+[\d]/)
> +	    && ($rxe =~ /rxe[0123456789]/)) {
> +	    
> +	    # hash ethername to rxename
> +	    $rxe_names{$eth} = $rxe;
> +	    $rxe_array[$i++] = $rxe;
> +	    
> +	    # hash rxename to ethername
> +	    $eth_names{$rxe} = $eth;
> +	}
> +    }
> +}
> +
> +# get list of Mellanox RoCE ports
> +sub get_mlx4_list {
> +    my $i = 0;
> +
> +    foreach my $mlx4 (glob("/sys/class/infiniband/mlx4_*")) {
> +	$mlx4 = basename($mlx4);
> +	foreach my $port (glob("/sys/class/infiniband/$mlx4/ports/*")) {
> +	    $port = basename($port);
> +	    my $link = read_file("$port/link_layer");
> +	    chomp($link);
> +
> +	    if ($link =~ "Ethernet") {
> +		$roce_list[$i++] = "$mlx4:$port";
> +	    }
> +	}
> +    }
> +}
> +
> +#collect per device information
> +sub get_dev_info {
> +    my @list;
> +    my @fields;
> +    my @lines;
> +    my $line;
> +    my $eth;
> +    my $drv;
> +    my $np;
> +    my $i = 0;
> +    my $j = 0;
> +
> +    get_mlx4_list();
> +
> +    my @my_eth_list = ();
> +    foreach my $my_eth_dev (glob("/sys/class/net/*")) {
> +	$my_eth_dev = basename($my_eth_dev);
> +        my $my_dev_type = read_file("/sys/class/net/${my_eth_dev}/type");
> +	chomp($my_dev_type);
> +        if ($my_dev_type == "1") {
> +            push(@my_eth_list, "$my_eth_dev");
> +        }
> +    }
> +
> +    @list = @my_eth_list;
> +    foreach $eth (@list) {
> +	chomp($eth);
> +
> +	$eth_list[$i++] = $eth;
> +
> +	@lines = `ethtool -i $eth`;
> +	foreach $line (@lines) {
> +	    chomp($line);
> +
> +	    @fields = split(/\s+/, $line);
> +	    chomp($fields[0]);
> +
> +	    if ($fields[0] =~ /driver:/) {
> +		$drv = $fields[1];
> +		$eth_driver{$eth} = $drv;
> +
> +		if ($drv =~ /mlx4_en/ && scalar(@roce_list) > 0 ) {
> +		    $eth_names{$roce_list[$j++]} = $eth;
> +		}
> +	    }
> +	}
> +
> +	# get link status
> +	$link_state{$eth} = "";
> +	$link_speed{$eth} = "";
> +
> +	@lines = `ethtool $eth`;
> +	foreach $line (@lines) {
> +	    chomp($line);
> +
> +	    @fields = split(/:/, $line);
> +	    if (defined($fields[1])) {
> +		    $fields[1] =~ s/^\s+//g;
> +		    if ($fields[0] =~ "Link detected") {
> +			$link_state{$eth} = $fields[1];
> +		    }
> +	    }
> +	    elsif ($line =~ "10000baseT") {
> +		$link_speed{$eth} = "10GigE";
> +	    }
> +	}
> +
> +	$ipv4_addr{$eth} = "            ";
> +	$eth_mtu{$eth} = "";
> +
> +	@lines = `ifconfig $eth`;
> +	foreach $line (@lines) {
> +	    # get IP address
> +	    if ($line =~ /inet addr/) {
> +		$line =~ s/^\s+inet addr://g;
> +		@fields = split(/\s+/, $line);
> +		$ipv4_addr{$eth} = $fields[0];
> +	    }
> +
> +	    # get ethernet mtu
> +	    if ($line =~ /MTU:/) {
> +		$line =~ s/^.*MTU://g;
> +		@fields = split(/\s+/, $line);
> +		$eth_mtu{$eth} = $fields[0];
> +	    }
> +	}
> +    }
> +
> +    # get rxe mtu
> +    foreach my $rxe (@rxe_array) {
> +	
> +	@lines = `ibv_devinfo -d $rxe`;
> +	foreach $line (@lines) {
> +	    if ($line =~ "active_mtu") {
> +		$line =~ s/^\s+active_mtu:\s+//g;
> +		chomp($line);
> +
> +		$rxe_mtu{$rxe} = $line;
> +	    }
> +	}
> +	$rxe_mtu{$rxe} = "(?)" if (!$rxe_mtu{$rxe});
> +    }
> +}
> +
> +# return string or the string "###" if string is all whitespace
> +sub set_field {
> +    my $fld = $_[0];
> +
> +    if (defined($fld) && $fld =~ /\S/) {
> +        return $fld;
> +    } else {
> +        return "###";
> +    }
> +}
> +
> +# format status output into fixed width columns
> +sub status_print {
> +    my @fields;
> +    my $field;
> +    my @flen = ();
> +    my $num_fields = 0;
> +    my $i;
> +    my $pad;
> +    my $line;
> +
> +    # one pass to size the columns
> +    foreach $line (@_) {
> +	@fields = split(/\s+/, $line);
> +	$i = 0;
> +	foreach $field (@fields) {
> +	    if (!defined($flen[$i])) {
> +		$flen[$i] = length($field);
> +	    }
> +	    else {
> +		$flen[$i] = max($flen[$i], length($field));
> +	    }
> +	    $i++;
> +	}
> +
> +	if ($i > $num_fields) {
> +	    $num_fields = $i;
> +	}
> +    }
> +
> +    # one pass to print
> +    foreach $line (@_) {
> +	print "  ";
> +	@fields = split(/\s+/, $line);
> +	for ($i = 0; $i < $num_fields; $i++) {
> +	    if (defined($fields[$i])) {
> +	        $pad = $flen[$i] - length($fields[$i]) + 2;
> +	    }
> +	    else {
> +	        $pad = $flen[$i] + 2;
> +	    }
> +	    if (defined($fields[$i]) && ($fields[$i] ne "###")) {
> +		print "$fields[$i]";
> +	    }
> +	    else {
> +		print "   ";
> +	    }
> +	    printf("%*s", $pad, "");
> +	}
> +	print "\n";
> +    }
> +}
> +
> +# check driver load status
> +sub check_module_status {
> +    if (-e $sys) {
> +	return 0;
> +    } else {
> +	return 1;
> +    }
> +}
> +
> +# print driver load status and ethertype for rdma_rxe and rdma_rxe_net
> +sub show_module_status {
> +    print "rdma_rxe module not loaded\n" if (!(-e $sys));
> +}
> +
> +# print rxe status
> +sub do_status {
> +    my $instance = $_[0];
> +    my $ln = 0;
> +    my @outp;
> +    my $rxe;
> +    my $rmtu;
> +
> +    get_names();
> +    get_dev_info();
> +    show_module_status();
> +
> +    $outp[$ln++] = "Name\tLink\tDriver\t\tSpeed\tNMTU\tIPv4_addr\tRDEV\tRMTU";
> +
> +    foreach my $eth (@eth_list) {
> +
> +	# handle case where rxe_drivers are not loaded
> +	if (defined($rxe_names{$eth})) {
> +		$rxe = $rxe_names{$eth};
> +		$rmtu = $rxe_mtu{$rxe};
> +	}
> +	else {
> +		$rxe = "";
> +		$rmtu = "";
> +	}
> +
> +	if ((!defined($instance) 
> +	     && (($linkonly == 0) || ($link_state{$eth} =~ "yes")))
> +	    || (defined($instance) && ($rxe =~ "$instance"))) {
> +	    $outp[$ln] =  set_field("$eth");
> +	    $outp[$ln] .= "\t";
> +	    $outp[$ln] .= set_field("$link_state{$eth}");
> +	    $outp[$ln] .= "\t";
> +	    $outp[$ln] .= set_field(exists($eth_driver{$eth}) ? $eth_driver{$eth} : "");
> +	    $outp[$ln] .= "\t";
> +	    $outp[$ln] .= set_field("$link_speed{$eth}");
> +	    $outp[$ln] .= "\t";
> +	    $outp[$ln] .= set_field("$eth_mtu{$eth}");
> +	    $outp[$ln] .= "\t";
> +	    $outp[$ln] .= set_field("$ipv4_addr{$eth}");
> +	    $outp[$ln] .= "\t";
> +	    $outp[$ln] .= set_field("$rxe");
> +	    $outp[$ln] .= "\t";
> +	    $outp[$ln] .= set_field("$rmtu");
> +	    $ln++;
> +	}
> +    }
> +
> +    status_print(@outp);
> +}
> +
> +# read file containing list of ethernet devices into a list
> +sub populate_persistence {
> +    my $i = 0;
> +    
> +    open FILE, $persistence_file;
> +    while(<FILE>) {
> +	my $line = $_;
> +	chomp($line);
> +	$line =~ s/^\s+//g;
> +	if ($line =~ /[\w]+[\d]/) {
> +	    # in case we add fields later
> +	    my ($eth, $cruft) = split(/\s+/, $line, 2);
> +	    if ($eth =~ /^[\w]+[\d]/) {
> +		$persistence_array[$i] = $eth;
> +		$persistence_hash{$eth} = $i++;
> +	    }
> +	}
> +    }
> +    close FILE;
> +
> +    $num_persistent = $i;
> +}
> +
> +# print out list of ethernet devices to file
> +sub commit_persistent {
> +    my $i;
> +    my $eth;
> +
> +    open(PF, ">$persistence_file");
> +    
> +    for ($i = 0; $i < $num_persistent; $i++) {
> +	$eth = $persistence_array[$i];
> +	if ($eth =~ /[\w]+[\d]/) {
> +	    print(PF "$persistence_array[$i]\n");
> +	}
> +    }
> +
> +    close(PF);
> +}
> +
> +sub delete_persistent {
> +    my $eth = $_[0];
> +    
> +    if (defined($persistence_hash{$eth})) {
> +	$persistence_array[$persistence_hash{$eth}] = "";
> +    }
> +}
> +
> +sub add_persistent {
> +    my $eth = $_[0];
> +
> +    # Is this one already in the persistence list?
> +    if (!defined($persistence_hash{$eth})) {
> +	$persistence_array[$num_persistent] = $eth;
> +	$persistence_hash{$eth} = $num_persistent;
> +	$num_persistent++;
> +    }
> +}
> +
> +# add new rxe device to eth if not already up
> +sub rxe_add {
> +    my $eth = $_[0];
> +
> +    if (!($eth =~ /[\w]+[\d]/)) {
> +	print "eth_name ($eth) looks bogus\n";
> +	return;
> +    }
> +
> +    if (!defined($rxe_names{$eth})) {
> +	system("echo '$eth' > $parms/add");
> +    }
> +    if (!$no_persist) {
> +	add_persistent($eth);
> +	commit_persistent();
> +    }
> +}
> +
> +sub rxe_remove {
> +    my $arg2 = $_[0];
> +    my $rxe;
> +    my $eth;
> +
> +    print "remove $arg2\n"  if ($debug > 0);
> +
> +    if ($arg2 =~ /[\w]+[\d]/) {
> +	$eth = $arg2;
> +	$rxe = $rxe_names{$eth};
> +    }
> +    elsif ($arg2 =~ /rxe[0123456789]/) {
> +	$rxe = $arg2;
> +	$eth = $eth_names{$rxe};
> +    }
> +    elsif ($arg2 eq "all") {
> +	$rxe = "all";
> +    }
> +
> +    if (($rxe eq "all") || ($rxe =~ /^rxe[0123456789]/)) {
> +	my $cmd = "echo '$rxe' > $parms/remove";
> +	#print "$cmd\n";
> +	system($cmd);
> +	if (!$no_persist) {
> +	    if ($rxe eq "all") {
> +		unlink($persistence_file);
> +	    }
> +	    elsif ($eth =~/[\w]+[\d]/) {
> +		delete_persistent($eth);
> +		commit_persistent();
> +	    }
> +	    else {
> +		print "Warning: Unable to resolve ethname; "
> +		    . "instance may persist on restart\n";
> +	    }
> +	}
> +    }
> +    else {
> +	print "rxe instance $rxe not found\n";
> +    }
> +}
> +
> +sub get_devinfo {
> +    my $rxe = $_[0];
> +
> +    my $cmd = "ibv_devinfo -d $rxe";
> +    return `$cmd`;
> +}
> +
> +# allow unsupported modules to load in SLES11 if allowed
> +sub modprobe {
> +    my $module = $_[0];
> +    my $opts = $_[1];
> +    my @lines;
> +    my $line;
> +
> +    if ($modprobe_checked == "0") {
> +	@lines = `modprobe -c`;
> +	foreach $line (@lines) {
> +	    if ($line =~ /^allow_unsupported_modules  *0/) {
> +		$modprobe_opt = " --allow-unsupported-modules ";
> +		last;
> +	    }
> +	}
> +	$modprobe_checked = "1";
> +    }
> +
> +    if (!defined($opts)) {
> +	$opts = "";
> +    }
> +
> +    system("modprobe $modprobe_opt $module $opts");
> +}
> +
> +# bring up rxe
> +sub do_start {
> +    my $proto_str = "";
> +
> +    system("mkdir -p $persistence_path");
> +    system("touch $persistence_file");
> +
> +    modprobe("ib_core");
> +    modprobe("ib_uverbs");
> +    modprobe("rdma_ucm");
> +    modprobe("rdma_rxe");
> +
> +    populate_persistence();
> +
> +    foreach my $eth (@persistence_array) {
> +	rxe_add($eth);
> +    }
> +
> +    get_names();
> +
> +    foreach my $rxe (@rxe_array) {
> +	my $stat = get_devinfo($rxe);
> +	if ($stat =~ "PORT_DOWN") {
> +	    my $cmd = "ifconfig $eth_names{$rxe} up";
> +	    system($cmd);
> +	}
> +    }
> +
> +}
> +
> +# check if argument is an integer
> +sub is_integer {
> +    defined $_[0] && $_[0] =~ /^[+-]?\d+$/;
> +}
> +
> +# remove all rxe devices and unload drivers

rxe?

> +sub do_stop {
> +    my $rxe;

rxe?

> +
> +    foreach $rxe (@rxe_array) {
> +	system("echo '$rxe' > $sys/remove");

etc...

> +    }
> +
> +    if (-e $sys) {
> +	system("rmmod rdma_rxe");
> +    }
> +
> +    if (-e $sys) {
> +	print "unable to unload drivers, reboot required\n";
> +    }
> +}
> +
> +sub do_debug {
> +    my $arg2 = $_[0];
> +    my $debugfile = "$parms/debug";
> +    chomp($arg2);
> +
> +    if (!(-e "$debugfile")) {
> +	print "Error: debug is compiled out of this rxe driver\n";
> +	return;
> +    }
> +
> +    if    ($arg2 eq "on")  { system("echo '31' > $debugfile"); }
> +    elsif ($arg2 eq "off") { system("echo '0'  > $debugfile"); }
> +    elsif ($arg2 eq "0")   { system("echo '0'  > $debugfile"); }
> +    elsif ($arg2 eq "")    { }
> +	elsif ($arg2 ge "0" && $arg2 le "31") {
> +	    system("echo '$arg2' > $debugfile");
> +	}
> +	else {
> +	    print "unrecognized debug cmd ($arg2)\n";
> +	}
> +
> +    my $current = read_file($debugfile);
> +    chomp($current);
> +    if ($current > 0) {
> +	print "Debug is ON ($current)\n";
> +    }
> +    elsif ($current == 0) {
> +	print "Debug is OFF\n";
> +    }
> +    else {
> +	print "Unrecognized debug value\n";
> +    }
> +}
> +
> +sub max {
> +    my $a = $_[0];
> +    my $b = $_[1];
> +    return $a if ($a > $b);
> +    return $b;
> +}
> +
> +# show usage for rxe_cfg
> +sub usage {
> +    print "  Usage:\n";
> +    print "    rxe_cfg [options] start|stop|status|persistent\n";
> +    print "    rxe_cfg debug on|off|<num>\n";
> +    print "    rxe_cfg [-n] add <ndev>\n";
> +    print "    rxe_cfg [-n] remove <ndev>|<rdev>\n";
> +    print "\n";
> +    print "    <ndev> = network device e.g. eth3\n";
> +    print "    <rdev> = rdma device e.g. rxe1\n";
> +    print "\n";
> +    print "  Options:\n";
> +    print "    -h: print this usage information\n";
> +    print "    -n: do not make the configuration action persistent\n";
> +    print "    -v: print additional debug output\n";
> +    print "    -l: show status for interfaces with link up\n";
> +    print "    -p <num>: (start command only) - set ethertype\n";
> +}
> +
> +sub main {
> +    GetOptions(
> +	   "-h"          => \$help,
> +	   "--help"      => \$help,
> +	   "-n"          => \$no_persist,
> +	   "-v:+"        => \$debug,
> +	   "-f"          => \$force,
> +	   "-l"          => \$linkonly,
> +	   );
> +
> +    my $arg1 = $ARGV[0];
> +    my $arg2 = $ARGV[1];
> +    my $arg3 = $ARGV[2];
> +
> +    # status is the default
> +    if (!defined($arg1) || ($arg1 =~ /status/)) {
> +        do_status($arg2);
> +        exit;
> +    }
> +
> +    if ($help) {
> +        usage();
> +        exit;
> +    }
> +
> +    # stuff that does not require modules to be loaded
> +    if    ($arg1 eq "help")       { usage(); exit; }
> +    elsif ($arg1 eq "start")      { do_start(); do_status(); exit; }
> +    elsif ($arg1 eq "persistent") { system("cat $persistence_file"); exit; }
> +
> +
> +    # can't do much else, bail if modules aren't loaded
> +    if (check_module_status()) {
> +	exit;
> +    }
> +
> +    # create persistence file if necessary
> +    make_path($persistence_path);
> +    if (!(-e $persistence_file)) {
> +        `touch $persistence_file`;
> +    }
> +
> +    # Get full context of the configuration
> +    populate_persistence();
> +    get_names();
> +    get_dev_info();
> +
> +    # Stuff that requires the rdma_rxe module to be loaded
> +    if    ($arg1 eq "stop")   { do_stop(); 	   exit; }
> +    elsif ($arg1 eq "debug")  { do_debug($arg2);   exit; }
> +    elsif ($arg1 eq "add")    { rxe_add($arg2);    exit; }
> +    elsif ($arg1 eq "remove") { rxe_remove($arg2); exit; }
> +    elsif ($arg1 eq "help")   { usage();	   exit; }
> +}
> +
> +main();
> +
> +exit;
> diff --git a/providers/rfc/rfc_queue.h b/providers/rfc/rfc_queue.h
> new file mode 100644
> index 0000000..a82e223
> --- /dev/null
> +++ b/providers/rfc/rfc_queue.h
> @@ -0,0 +1,128 @@
> +/*
> + * Copyright (c) 2009 Mellanox Technologies Ltd. All rights reserved.
> + * Copyright (c) 2009 System Fabric Works, Inc. All rights reserved.

Add your own copyright line here as well.

> + *
> + * This software is available to you under a choice of one of two
> + * licenses.  You may choose to be licensed under the terms of the GNU
> + * General Public License (GPL) Version 2, available from the fileA
> + * COPYING in the main directory of this source tree, or the
> + * OpenIB.org BSD license below:
> + *
> + *     Redistribution and use in source and binary forms, with or
> + *     without modification, are permitted provided that the following
> + *     conditions are met:
> + *
> + *      - Redistributions of source code must retain the above
> + *        copyright notice, this list of conditions and the following
> + *        disclaimer.
> + *
> + *      - Redistributions in binary form must reproduce the above
> + *        copyright notice, this list of conditions and the following
> + *        disclaimer in the documentation and/or other materials
> + *        provided with the distribution.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
> + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
> + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
> + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> + * SOFTWARE.
> + *
> + */
> +
> +/* implements a simple circular buffer with sizes a power of 2 */
> +
> +#ifndef H_RXE_PCQ
> +#define H_RXE_PCQ

RXE?
(I will not comment on this any further... there are probably more occurrences.)

> +
> +#include <stdint.h>
> +#include <stdatomic.h>
> +
> +/* MUST MATCH kernel struct rfc_pqc in rfc_queue.h */
> +struct rfc_queue {
> +	uint32_t		log2_elem_size;
> +	uint32_t		index_mask;
> +	uint32_t		pad_1[30];
> +	_Atomic(uint32_t)	producer_index;
> +	uint32_t		pad_2[31];
> +	_Atomic(uint32_t)	consumer_index;
> +	uint32_t		pad_3[31];
> +	uint8_t			data[0];
> +};
> +
> +static inline int next_index(struct rfc_queue *q, int index)
> +{
> +	return (index + 1) & q->index_mask;
> +}
> +
> +static inline int queue_empty(struct rfc_queue *q)
> +{
> +	/* Must hold consumer_index lock */
> +	return ((atomic_load(&q->producer_index) -
> +		 atomic_load_explicit(&q->consumer_index,
> +				      memory_order_relaxed)) &
> +		q->index_mask) == 0;
> +}
> +
> +static inline int queue_full(struct rfc_queue *q)
> +{
> +	/* Must hold producer_index lock */
> +	return ((atomic_load_explicit(&q->producer_index,
> +				      memory_order_relaxed) +
> +		 1 - atomic_load(&q->consumer_index)) &
> +		q->index_mask) == 0;
> +}
> +
> +static inline void advance_producer(struct rfc_queue *q)
> +{
> +	/* Must hold producer_index lock */
> +	atomic_thread_fence(memory_order_release);
> +	atomic_store(
> +	    &q->producer_index,
> +	    (atomic_load_explicit(&q->producer_index, memory_order_relaxed) +
> +	     1) &
> +		q->index_mask);

Join the above two lines.

> +}
> +
> +static inline void advance_consumer(struct rfc_queue *q)
> +{
> +	/* Must hold consumer_index lock */
> +	atomic_store(
> +	    &q->consumer_index,
> +	    (atomic_load_explicit(&q->consumer_index, memory_order_relaxed) +
> +	     1) &
> +		q->index_mask);

Join the above two lines.

> +}
> +
> +static inline void *producer_addr(struct rfc_queue *q)
> +{
> +	/* Must hold producer_index lock */
> +	return q->data + ((atomic_load_explicit(&q->producer_index,
> +						memory_order_relaxed) &
> +			   q->index_mask)
> +			  << q->log2_elem_size);
> +}
> +
> +static inline void *consumer_addr(struct rfc_queue *q)
> +{
> +	/* Must hold consumer_index lock */
> +	return q->data + ((atomic_load_explicit(&q->consumer_index,
> +						memory_order_relaxed) &
> +			   q->index_mask)
> +			  << q->log2_elem_size);
> +}
> +
> +static inline void *addr_from_index(struct rfc_queue *q, unsigned int index)
> +{
> +	return q->data + ((index & q->index_mask)
> +				<< q->log2_elem_size);
> +}
> +
> +static inline unsigned int index_from_addr(const struct rfc_queue *q, const void *addr)
> +{
> +	return (((uint8_t *)addr - q->data) >> q->log2_elem_size) & q->index_mask;
> +}
> +
> +#endif /* H_RXE_PCQ */
> -- 
> 1.8.3.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html




[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Photo]     [Yosemite News]     [Yosemite Photos]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux