Hi Roman,

Some comments below.

On 02/02/2018 04:08 PM, Roman Pen wrote:
This is main functionality of ibtrs-server module, which accepts set of RDMA connections (so called IBTRS session), creates/destroys sysfs entries associated with IBTRS session and notifies upper layer (user of IBTRS API) about RDMA requests or link events. Signed-off-by: Roman Pen <roman.penyaev@xxxxxxxxxxxxxxxx> Signed-off-by: Danil Kipnis <danil.kipnis@xxxxxxxxxxxxxxxx> Cc: Jack Wang <jinpu.wang@xxxxxxxxxxxxxxxx> --- drivers/infiniband/ulp/ibtrs/ibtrs-srv.c | 1811 ++++++++++++++++++++++++++++++ 1 file changed, 1811 insertions(+) diff --git a/drivers/infiniband/ulp/ibtrs/ibtrs-srv.c b/drivers/infiniband/ulp/ibtrs/ibtrs-srv.c new file mode 100644 index 000000000000..0d1fc08bd821 --- /dev/null +++ b/drivers/infiniband/ulp/ibtrs/ibtrs-srv.c @@ -0,0 +1,1811 @@ +/* + * InfiniBand Transport Layer + * + * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved. + * Authors: Fabian Holler <mail@xxxxxxxxxx> + * Jack Wang <jinpu.wang@xxxxxxxxxxxxxxxx> + * Kleber Souza <kleber.souza@xxxxxxxxxxxxxxxx> + * Danil Kipnis <danil.kipnis@xxxxxxxxxxxxxxxx> + * Roman Penyaev <roman.penyaev@xxxxxxxxxxxxxxxx> + * Milind Dumbare <Milind.dumbare@xxxxxxxxx> + * + * Copyright (c) 2017 - 2018 ProfitBricks GmbH. All rights reserved. + * Authors: Danil Kipnis <danil.kipnis@xxxxxxxxxxxxxxxx> + * Roman Penyaev <roman.penyaev@xxxxxxxxxxxxxxxx> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include <linux/module.h> +#include <linux/mempool.h> + +#include "ibtrs-srv.h" +#include "ibtrs-log.h" + +MODULE_AUTHOR("ibnbd@xxxxxxxxxxxxxxxx"); +MODULE_DESCRIPTION("IBTRS Server"); +MODULE_VERSION(IBTRS_VER_STRING); +MODULE_LICENSE("GPL"); + +#define DEFAULT_MAX_IO_SIZE_KB 128 +#define DEFAULT_MAX_IO_SIZE (DEFAULT_MAX_IO_SIZE_KB * 1024) +#define MAX_REQ_SIZE PAGE_SIZE +#define MAX_SG_COUNT ((MAX_REQ_SIZE - sizeof(struct ibtrs_msg_rdma_read)) \ + / sizeof(struct ibtrs_sg_desc)) + +static int max_io_size = DEFAULT_MAX_IO_SIZE; +static int rcv_buf_size = DEFAULT_MAX_IO_SIZE + MAX_REQ_SIZE; + +static int max_io_size_set(const char *val, const struct kernel_param *kp) +{ + int err, ival; + + err = kstrtoint(val, 0, &ival); + if (err) + return err; + + if (ival < 4096 || ival + MAX_REQ_SIZE > (4096 * 1024) || + (ival + MAX_REQ_SIZE) % 512 != 0) { + pr_err("Invalid max io size value %d, has to be" + " > %d, < %d\n", ival, 4096, 4194304); + return -EINVAL; + } + + max_io_size = ival; + rcv_buf_size = max_io_size + MAX_REQ_SIZE; + pr_info("max io size changed to %d\n", ival); + + return 0; +} + +static const struct kernel_param_ops max_io_size_ops = { + .set = max_io_size_set, + .get = param_get_int, +}; +module_param_cb(max_io_size, &max_io_size_ops, &max_io_size, 0444); +MODULE_PARM_DESC(max_io_size, + "Max size for each IO request, when change the unit is in byte" + " (default: " __stringify(DEFAULT_MAX_IO_SIZE_KB) "KB)"); + +#define DEFAULT_SESS_QUEUE_DEPTH 512 +static int sess_queue_depth = DEFAULT_SESS_QUEUE_DEPTH; +module_param_named(sess_queue_depth, sess_queue_depth, int, 0444); +MODULE_PARM_DESC(sess_queue_depth, + "Number of buffers for pending I/O requests to allocate" + " per session. 
Maximum: " __stringify(MAX_SESS_QUEUE_DEPTH) + " (default: " __stringify(DEFAULT_SESS_QUEUE_DEPTH) ")"); + +/* We guarantee to serve 10 paths at least */ +#define CHUNK_POOL_SIZE (DEFAULT_SESS_QUEUE_DEPTH * 10) +static mempool_t *chunk_pool; + +static int retry_count = 7; + +static int retry_count_set(const char *val, const struct kernel_param *kp) +{ + int err, ival; + + err = kstrtoint(val, 0, &ival); + if (err) + return err; + + if (ival < MIN_RTR_CNT || ival > MAX_RTR_CNT) { + pr_err("Invalid retry count value %d, has to be" + " > %d, < %d\n", ival, MIN_RTR_CNT, MAX_RTR_CNT); + return -EINVAL; + } + + retry_count = ival; + pr_info("QP retry count changed to %d\n", ival); + + return 0; +} + +static const struct kernel_param_ops retry_count_ops = { + .set = retry_count_set, + .get = param_get_int, +}; +module_param_cb(retry_count, &retry_count_ops, &retry_count, 0644); + +MODULE_PARM_DESC(retry_count, "Number of times to send the message if the" + " remote side didn't respond with Ack or Nack (default: 7," + " min: " __stringify(MIN_RTR_CNT) ", max: " + __stringify(MAX_RTR_CNT) ")"); + +static char cq_affinity_list[256] = ""; +static cpumask_t cq_affinity_mask = { CPU_BITS_ALL }; + +static void init_cq_affinity(void) +{ + sprintf(cq_affinity_list, "0-%d", nr_cpu_ids - 1); +} + +static int cq_affinity_list_set(const char *val, const struct kernel_param *kp) +{ + int ret = 0, len = strlen(val); + cpumask_var_t new_value; + + if (!strlen(cq_affinity_list)) + init_cq_affinity(); + + if (len >= sizeof(cq_affinity_list)) + return -EINVAL; + if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) + return -ENOMEM; + + ret = cpulist_parse(val, new_value); + if (ret) { + pr_err("Can't set cq_affinity_list \"%s\": %d\n", val, + ret); + goto free_cpumask; + } + + strlcpy(cq_affinity_list, val, sizeof(cq_affinity_list)); + *strchrnul(cq_affinity_list, '\n') = '\0'; + cpumask_copy(&cq_affinity_mask, new_value); + + pr_info("cq_affinity_list changed to %*pbl\n", + 
cpumask_pr_args(&cq_affinity_mask)); +free_cpumask: + free_cpumask_var(new_value); + return ret; +} + +static struct kparam_string cq_affinity_list_kparam_str = { + .maxlen = sizeof(cq_affinity_list), + .string = cq_affinity_list +}; + +static const struct kernel_param_ops cq_affinity_list_ops = { + .set = cq_affinity_list_set, + .get = param_get_string, +}; + +module_param_cb(cq_affinity_list, &cq_affinity_list_ops, + &cq_affinity_list_kparam_str, 0644); +MODULE_PARM_DESC(cq_affinity_list, "Sets the list of cpus to use as cq vectors." + "(default: use all possible CPUs)"); +
Can you explain why module parameters are used here instead of configfs? This kind of per-module runtime configuration is usually a better fit for configfs.
+static void ibtrs_srv_close_work(struct work_struct *work) +{ + struct ibtrs_srv_sess *sess; + struct ibtrs_srv_ctx *ctx; + struct ibtrs_srv_con *con; + int i; + + sess = container_of(work, typeof(*sess), close_work); + ctx = sess->srv->ctx; + + ibtrs_srv_destroy_sess_files(sess); + ibtrs_srv_stop_hb(sess); + + for (i = 0; i < sess->s.con_num; i++) { + con = to_srv_con(sess->s.con[i]); + if (!con) + continue; + + rdma_disconnect(con->c.cm_id); + ib_drain_qp(con->c.qp); + } + /* Wait for all inflights */ + ibtrs_srv_wait_ops_ids(sess); + + /* Notify upper layer if we are the last path */ + ibtrs_srv_sess_down(sess); + + unmap_cont_bufs(sess); + ibtrs_srv_free_ops_ids(sess); + + for (i = 0; i < sess->s.con_num; i++) { + con = to_srv_con(sess->s.con[i]); + if (!con) + continue; + + ibtrs_cq_qp_destroy(&con->c); + rdma_destroy_id(con->c.cm_id); + kfree(con); + } + ibtrs_ib_dev_put(sess->s.ib_dev); + + del_path_from_srv(sess); + put_srv(sess->srv); + sess->srv = NULL; + ibtrs_srv_change_state(sess, IBTRS_SRV_CLOSED); + + kfree(sess->rdma_addr); + kfree(sess->s.con); + kfree(sess); +} + +static int ibtrs_rdma_do_accept(struct ibtrs_srv_sess *sess, + struct rdma_cm_id *cm_id) +{ + struct ibtrs_srv *srv = sess->srv; + struct ibtrs_msg_conn_rsp msg; + struct rdma_conn_param param; + int err; + + memset(¶m, 0, sizeof(param)); + param.retry_count = retry_count; + param.rnr_retry_count = 7; + param.private_data = &msg; + param.private_data_len = sizeof(msg); + + memset(&msg, 0, sizeof(msg)); + msg.magic = cpu_to_le16(IBTRS_MAGIC); + msg.version = cpu_to_le16(IBTRS_VERSION); + msg.errno = 0; + msg.queue_depth = cpu_to_le16(srv->queue_depth); + msg.rkey = cpu_to_le32(sess->s.ib_dev->rkey);
As said, this cannot happen anymore...
+static struct rdma_cm_id *ibtrs_srv_cm_init(struct ibtrs_srv_ctx *ctx, + struct sockaddr *addr, + enum rdma_port_space ps) +{ + struct rdma_cm_id *cm_id; + int ret; + + cm_id = rdma_create_id(&init_net, ibtrs_srv_rdma_cm_handler, + ctx, ps, IB_QPT_RC); + if (IS_ERR(cm_id)) { + ret = PTR_ERR(cm_id); + pr_err("Creating id for RDMA connection failed, err: %d\n", + ret); + goto err_out; + } + ret = rdma_bind_addr(cm_id, addr); + if (ret) { + pr_err("Binding RDMA address failed, err: %d\n", ret); + goto err_cm; + } + ret = rdma_listen(cm_id, 64); + if (ret) { + pr_err("Listening on RDMA connection failed, err: %d\n", + ret); + goto err_cm; + } + + switch (addr->sa_family) { + case AF_INET: + pr_debug("listening on port %u\n", + ntohs(((struct sockaddr_in *)addr)->sin_port)); + break; + case AF_INET6: + pr_debug("listening on port %u\n", + ntohs(((struct sockaddr_in6 *)addr)->sin6_port)); + break; + case AF_IB: + pr_debug("listening on service id 0x%016llx\n", + be64_to_cpu(rdma_get_service_id(cm_id, addr))); + break; + default: + pr_debug("listening on address family %u\n", addr->sa_family); + }
We already have printk format specifiers that accept socket addresses directly (%pIS / %pISp, see Documentation/printk-formats.txt) — please use those instead of open-coding this per-address-family switch.
+ + return cm_id; + +err_cm: + rdma_destroy_id(cm_id); +err_out: + + return ERR_PTR(ret); +} + +static int ibtrs_srv_rdma_init(struct ibtrs_srv_ctx *ctx, unsigned int port) +{ + struct sockaddr_in6 sin = { + .sin6_family = AF_INET6, + .sin6_addr = IN6ADDR_ANY_INIT, + .sin6_port = htons(port), + }; + struct sockaddr_ib sib = { + .sib_family = AF_IB, + .sib_addr.sib_subnet_prefix = 0ULL, + .sib_addr.sib_interface_id = 0ULL, + .sib_sid = cpu_to_be64(RDMA_IB_IP_PS_IB | port), + .sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL), + .sib_pkey = cpu_to_be16(0xffff), + };
What about IPv4? Or is binding to IN6ADDR_ANY expected to also accept IPv4 clients via v4-mapped addresses? Please clarify.
+ struct rdma_cm_id *cm_ip, *cm_ib; + int ret; + + /* + * We accept both IPoIB and IB connections, so we need to keep + * two cm id's, one for each socket type and port space. + * If the cm initialization of one of the id's fails, we abort + * everything. + */ + cm_ip = ibtrs_srv_cm_init(ctx, (struct sockaddr *)&sin, RDMA_PS_TCP); + if (unlikely(IS_ERR(cm_ip))) + return PTR_ERR(cm_ip); + + cm_ib = ibtrs_srv_cm_init(ctx, (struct sockaddr *)&sib, RDMA_PS_IB); + if (unlikely(IS_ERR(cm_ib))) { + ret = PTR_ERR(cm_ib); + goto free_cm_ip; + } + + ctx->cm_id_ip = cm_ip; + ctx->cm_id_ib = cm_ib; + + return 0; + +free_cm_ip: + rdma_destroy_id(cm_ip); + + return ret; +} + +static struct ibtrs_srv_ctx *alloc_srv_ctx(rdma_ev_fn *rdma_ev, + link_ev_fn *link_ev) +{ + struct ibtrs_srv_ctx *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return NULL; + + ctx->rdma_ev = rdma_ev; + ctx->link_ev = link_ev; + mutex_init(&ctx->srv_mutex); + INIT_LIST_HEAD(&ctx->srv_list); + + return ctx; +} + +static void free_srv_ctx(struct ibtrs_srv_ctx *ctx) +{ + WARN_ON(!list_empty(&ctx->srv_list)); + kfree(ctx); +} + +struct ibtrs_srv_ctx *ibtrs_srv_open(rdma_ev_fn *rdma_ev, link_ev_fn *link_ev, + unsigned int port) +{ + struct ibtrs_srv_ctx *ctx; + int err; + + ctx = alloc_srv_ctx(rdma_ev, link_ev); + if (unlikely(!ctx)) + return ERR_PTR(-ENOMEM); + + err = ibtrs_srv_rdma_init(ctx, port); + if (unlikely(err)) { + free_srv_ctx(ctx); + return ERR_PTR(err); + } + /* Do not let module be unloaded if server context is alive */ + __module_get(THIS_MODULE); + + return ctx; +} +EXPORT_SYMBOL(ibtrs_srv_open); + +void ibtrs_srv_queue_close(struct ibtrs_srv_sess *sess) +{ + close_sess(sess); +} + +static void close_sess(struct ibtrs_srv_sess *sess) +{ + enum ibtrs_srv_state old_state; + + if (ibtrs_srv_change_state_get_old(sess, IBTRS_SRV_CLOSING, + &old_state)) + queue_work(ibtrs_wq, &sess->close_work); + WARN_ON(sess->state != IBTRS_SRV_CLOSING); +} + +static void close_sessions(struct 
ibtrs_srv *srv) +{ + struct ibtrs_srv_sess *sess; + + mutex_lock(&srv->paths_mutex); + list_for_each_entry(sess, &srv->paths_list, s.entry) + close_sess(sess); + mutex_unlock(&srv->paths_mutex); +} + +static void close_ctx(struct ibtrs_srv_ctx *ctx) +{ + struct ibtrs_srv *srv; + + mutex_lock(&ctx->srv_mutex); + list_for_each_entry(srv, &ctx->srv_list, ctx_list) + close_sessions(srv); + mutex_unlock(&ctx->srv_mutex); + flush_workqueue(ibtrs_wq); +} + +void ibtrs_srv_close(struct ibtrs_srv_ctx *ctx) +{ + rdma_destroy_id(ctx->cm_id_ip); + rdma_destroy_id(ctx->cm_id_ib); + close_ctx(ctx); + free_srv_ctx(ctx); + module_put(THIS_MODULE); +} +EXPORT_SYMBOL(ibtrs_srv_close); + +static int check_module_params(void) +{ + if (sess_queue_depth < 1 || sess_queue_depth > MAX_SESS_QUEUE_DEPTH) { + pr_err("Invalid sess_queue_depth parameter value\n"); + return -EINVAL; + } + + /* check if IB immediate data size is enough to hold the mem_id and the + * offset inside the memory chunk + */ + if (ilog2(sess_queue_depth - 1) + ilog2(rcv_buf_size - 1) > + MAX_IMM_PAYL_BITS) { + pr_err("RDMA immediate size (%db) not enough to encode " + "%d buffers of size %dB. 
Reduce 'sess_queue_depth' " + "or 'max_io_size' parameters.\n", MAX_IMM_PAYL_BITS, + sess_queue_depth, rcv_buf_size); + return -EINVAL; + } + + return 0; +} + +static int __init ibtrs_server_init(void) +{ + int err; + + if (!strlen(cq_affinity_list)) + init_cq_affinity(); + + pr_info("Loading module %s, version: %s " + "(retry_count: %d, cq_affinity_list: %s, " + "max_io_size: %d, sess_queue_depth: %d)\n", + KBUILD_MODNAME, IBTRS_VER_STRING, retry_count, + cq_affinity_list, max_io_size, sess_queue_depth); + + err = check_module_params(); + if (err) { + pr_err("Failed to load module, invalid module parameters," + " err: %d\n", err); + return err; + } + chunk_pool = mempool_create_page_pool(CHUNK_POOL_SIZE, + get_order(rcv_buf_size)); + if (unlikely(!chunk_pool)) { + pr_err("Failed preallocate pool of chunks\n"); + return -ENOMEM; + } + ibtrs_wq = alloc_workqueue("ibtrs_server_wq", WQ_MEM_RECLAIM, 0); + if (!ibtrs_wq) { + pr_err("Failed to load module, alloc ibtrs_server_wq failed\n"); + goto out_chunk_pool; + } + err = ibtrs_srv_create_sysfs_module_files(); + if (err) { + pr_err("Failed to load module, can't create sysfs files," + " err: %d\n", err); + goto out_ibtrs_wq; + } + + return 0; + +out_ibtrs_wq: + destroy_workqueue(ibtrs_wq); +out_chunk_pool: + mempool_destroy(chunk_pool); + + return err; +} + +static void __exit ibtrs_server_exit(void) +{ + ibtrs_srv_destroy_sysfs_module_files(); + destroy_workqueue(ibtrs_wq); + mempool_destroy(chunk_pool); +} + +module_init(ibtrs_server_init); +module_exit(ibtrs_server_exit);