From: Jack Wang <jinpu.wang@xxxxxxxxxxxxxxxx> User module eg ibnbd_client will use this interface to transfer data later. Signed-off-by: Jack Wang <jinpu.wang@xxxxxxxxxxxxxxxx> Signed-off-by: Kleber Souza <kleber.souza@xxxxxxxxxxxxxxxx> Signed-off-by: Danil Kipnis <danil.kipnis@xxxxxxxxxxxxxxxx> Signed-off-by: Roman Pen <roman.penyaev@xxxxxxxxxxxxxxxx> --- include/rdma/ibtrs_clt.h | 316 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 316 insertions(+) create mode 100644 include/rdma/ibtrs_clt.h diff --git a/include/rdma/ibtrs_clt.h b/include/rdma/ibtrs_clt.h new file mode 100644 index 0000000..4fc9b12 --- /dev/null +++ b/include/rdma/ibtrs_clt.h @@ -0,0 +1,316 @@ +/* + * InfiniBand Transport Layer + * + * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved. + * Authors: Fabian Holler < mail@xxxxxxxxxx> + * Jack Wang <jinpu.wang@xxxxxxxxxxxxxxxx> + * Kleber Souza <kleber.souza@xxxxxxxxxxxxxxxx> + * Danil Kipnis <danil.kipnis@xxxxxxxxxxxxxxxx> + * Roman Pen <roman.penyaev@xxxxxxxxxxxxxxxx> + * Milind Dumbare <Milind.dumbare@xxxxxxxxx> + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + * substantially similar to the "NO WARRANTY" disclaimer below + * ("Disclaimer") and any redistribution must be conditioned upon + * including a substantially similar Disclaimer requirement for further + * binary redistribution. + * 3. Neither the names of the above-listed copyright holders nor the names + * of any contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + * + */ + +#if !defined(IBTRS_CLIENT_H) +#define IBTRS_CLIENT_H + +#include <linux/scatterlist.h> + +struct ibtrs_session; + +/** + * ibtrs_clt_open() - Open a session to a ibtrs_server + * @addr: The IPv4, IPv6 or GID address of the peer + * @pdu_sz: Size of extra payload which can be accessed after tag allocation. + * @priv: Pointer passed back on &ibtrs_clt_ops->sess_ev() invocation + * @max_inflight_msg: Max. number of parallel inflight messages for the session + * @max_segments: Max. number of segments per IO request + * @reconnect_delay_sec: time between reconnect tries + * @max_reconnect_attempts: Number of times to reconnect on error before giving + * up, 0 for * disabled, -1 for forever + * + * Starts session establishment with the ibtrs_server. The function can block + * up to ~2000ms until it returns. + * + * Return a valid pointer on success otherwise PTR_ERR. + * -EINVAL: The provided addr could not be resolved to an Infiniband + * address, the route to the host could not be resolved or + * ibtrs_clt_register() was not called before. + */ +struct ibtrs_session *ibtrs_clt_open(const struct sockaddr_storage *addr, + size_t pdu_sz, void *priv, + u8 reconnect_delay_sec, u16 max_segments, + s16 max_reconnect_attempts); + +/** + * ibtrs_clt_close() - Close a session + * @sess: Session handler, is freed on return + */ +int ibtrs_clt_close(struct ibtrs_session *sess); + +/** + * enum ibtrs_clt_rdma_ev - Events related to RDMA transfer operations + */ +enum ibtrs_clt_rdma_ev { + IBTRS_CLT_RDMA_EV_RDMA_REQUEST_WRITE_COMPL, + IBTRS_CLT_RDMA_EV_RDMA_WRITE_COMPL, +}; + +/** + * enum ibtrs_sess_ev - Events about connectivity state of a session + * @IBTRS_CLT_SESS_EV_RECONNECT The session was reconnected. + * @IBTRS_CLT_SESS_EV_DISCONNECTED The session was disconnected. + * @IBTRS_CLT_SESS_EV_MAX_RECONN_EXCEEDED Reconect attempts stopped because + * max. number of reconnect attempts + * are reached. + */ +enum ibtrs_clt_sess_ev { + IBTRS_CLT_SESS_EV_RECONNECT, + IBTRS_CLT_SESS_EV_DISCONNECTED, + IBTRS_CLT_SESS_EV_MAX_RECONN_EXCEEDED, +}; + +/** + * struct ibtrs_clt_ops - Callback functions of the user module + * @owner: module that uses ibtrs_server + * @rdma_ev: Event notifications for RDMA operations, + * Context: in interrupt(soft irq). The function should be relatively fast. + * @priv: user supplied data that was passed to + * ibtrs_clt_request_rdma_write() or + * ibtrs_clt_rdma_write() before + * @ev: Occurred event + * @errno: Result of corresponding operation, + * 0 on success or negative ERRNO code on error + * @sess_ev: Event notification for connection state changes + * @priv: user supplied data that was passed to + * ibtrs_clt_open() + * @ev: Occurred event + * @errno: Result of corresponding operation, + * 0 on success or negative ERRNO code on error + * @recv: Event notification for infiniband message receival + * @priv: user supplied data that was passed to + * ibtrs_clt_open() + * @msg Received data + * @len. Size of the @msg buffer + * + * The @recv and @rdma_ev are running on the same CPU that requested the RDMA + * operation before. + */ + +typedef void (rdma_clt_ev_fn)(void *priv, enum ibtrs_clt_rdma_ev ev, int errno); +typedef void (sess_clt_ev_fn)(void *priv, enum ibtrs_clt_sess_ev ev, int errno); +typedef void (recv_clt_fn)(void *priv, const void *msg, size_t len); + +struct ibtrs_clt_ops { + struct module *owner; + rdma_clt_ev_fn *rdma_ev; + sess_clt_ev_fn *sess_ev; + recv_clt_fn *recv; +}; + +/** + * ibtrs_clt_register() - register a user module with ibtrs_client + * @ops: callback functions to register + * + * Return: + * 0: Success + * -ENOTSUPP: Registration failed, max. number of supported user modules + reached + */ +int ibtrs_clt_register(const struct ibtrs_clt_ops *ops); + +/** + * ibtrs_clt_unregister() - unregister a module at ibtrs_client + * @ops: struct that was passed before to ibtrs_clt_register() + * + * ibtrs_clt_unregister() must only be called after all session that were + * created by the user module were closed. + */ +void ibtrs_clt_unregister(const struct ibtrs_clt_ops *ops); + +enum { + IBTRS_TAG_NOWAIT = 0, + IBTRS_TAG_WAIT = 1, +}; + +/** + * ibtrs_tag - tags the memory allocation for future RDMA operation + */ +struct ibtrs_tag { + unsigned int cpu_id; + unsigned int mem_id; + unsigned int mem_id_mask; +}; + +static inline struct ibtrs_tag *ibtrs_tag_from_pdu(void *pdu) +{ + return pdu - sizeof(struct ibtrs_tag); +} + +static inline void *ibtrs_tag_to_pdu(struct ibtrs_tag *tag) +{ + return tag + 1; +} + +/** + * ibtrs_get_tag() - allocates tag for future RDMA operation + * @sess: Current session + * @cpu_id: cpu_id to run + * @nr_bytes: Number of bytes to consume per tag + * @wait: Wait type + * + * Description: + * Allocates tag for the following RDMA operation. Tag is used + * to preallocate all resources and to propagate memory pressure + * up earlier. + * + * Context: + * Can sleep if @wait == IBTRS_TAG_WAIT + */ +struct ibtrs_tag *ibtrs_get_tag(struct ibtrs_session *sess, int cpu_id, + size_t nr_bytes, int wait); + +/** + * ibtrs_put_tag() - puts allocated tag + * @sess: Current session + * @tag: Tag to be freed + * + * Context: + * Does not matter + */ +void ibtrs_put_tag(struct ibtrs_session *sess, struct ibtrs_tag *tag); + +/** + * ibtrs_clt_rdma_write() - Transfer data to the server via RDMA. + * @sess: Session + * @tag: Preallocated tag + * @priv: User provided data, passed back on corresponding + * @ibtrs_clt_ops->rdma_ev() event + * @vec: User module message to transfer together with @sg. + * Sum of len of all @vec elements limited to <= IO_MSG_SIZE + * @nr: Number of elements in @vec. + * @data_len: Size of data in @sg + * @sg: data to transferred, 512B aligned in the receivers memory + * @sg_len: number of elements in @sg array + * + * Return: + * 0: Success + * <0: Error + * + * On completion of the operation a %IBTRS_CLT_RDMA_EV_RDMA_WRITE_COMPL is + * generated. If an error happened on IBTRS layer for this operation a + * %IBTRS_CLT_RDMA_EV_ERROR is generated. + */ +int ibtrs_clt_rdma_write(struct ibtrs_session *sess, struct ibtrs_tag *tag, + void *priv, const struct kvec *vec, size_t nr, + size_t data_len, struct scatterlist *sg, + unsigned int sg_len); + +/** + * ibtrs_clt_request_rdma_write() - Request data transfer from server via RDMA. + * + * @sess: Session + * @tag: Preallocated tag + * @priv: User provided data, passed back on corresponding + * @ibtrs_clt_ops->rdma_ev() event + * @vec: Message that is send to server together with the request. + * Sum of len of all @vec elements limited to <= IO_MSG_SIZE. + * @nr: Number of elements in @vec. + * @result_len: Max. length of data that ibtrs_server will send back + * @recv_sg: Pages in which the response of the server will be stored. + * @recv_sg_len: Number of elements in the @recv_sg + * + * Return: + * 0: Success + * <0: Error + * + * IBTRS Client will request a data transfer from Server to Client via RDMA. + * The data that the server will respond with will be stored in @recv_sg when + * the user receives an %IBTRS_CLT_RDMA_EV_RDMA_REQUEST_WRITE_COMPL event. + * If an error occurred on the IBTRS layer a %IBTRS_CLT_RDMA_EV_ERROR is + * generated instead + */ +int ibtrs_clt_request_rdma_write(struct ibtrs_session *sess, + struct ibtrs_tag *tag, void *priv, + const struct kvec *vec, size_t nr, + size_t result_len, + struct scatterlist *recv_sg, + unsigned int recv_sg_len); + +/** + * ibtrs_clt_send() - Send data to server via an infiniband message. + * @sess: Session + * @vec: Data to transfer + * @nr: Number of elements in @vec + * + * Return: + * 0: Success + * <0: Error: + * -ECOMM no connection to the server + * -EINVAL message size too big (500 bytes max) + * -EAGAIN run out of tx buffers - try again later + * -<IB ERROR> see mlx doc + * + * The operation is not confirmed. It is the responsibility of the user on the + * other side to send an acknowledgment if required. + */ +int ibtrs_clt_send(struct ibtrs_session *sess, const struct kvec *vec, + size_t nr); + +/** + * ibtrs_attrs - IBTRS session attributes + */ +struct ibtrs_attrs { + u32 queue_depth; + u64 mr_page_mask; + u32 mr_page_size; + u32 mr_max_size; + u32 max_pages_per_mr; + u32 max_sge; + u32 max_io_size; + u8 hostname[MAXHOSTNAMELEN]; +}; + +/** + * ibtrs_clt_query() - queries IBTRS session attributes + * + * Returns: + * 0 on success + * -ECOMM no connection to the server + */ +int ibtrs_clt_query(struct ibtrs_session *sess, struct ibtrs_attrs *attr); + +#endif -- 2.7.4