On 12/30/19 2:29 AM, Jack Wang wrote:
+MODULE_DESCRIPTION("InfiniBand Network Block Device Client");
InfiniBand or RDMA?
+static int rnbd_clt_set_dev_attr(struct rnbd_clt_dev *dev, + const struct rnbd_msg_open_rsp *rsp) +{ + struct rnbd_clt_session *sess = dev->sess; + + if (unlikely(!rsp->logical_block_size)) + return -EINVAL; + + dev->device_id = le32_to_cpu(rsp->device_id); + dev->nsectors = le64_to_cpu(rsp->nsectors); + dev->logical_block_size = le16_to_cpu(rsp->logical_block_size); + dev->physical_block_size = le16_to_cpu(rsp->physical_block_size); + dev->max_write_same_sectors = le32_to_cpu(rsp->max_write_same_sectors); + dev->max_discard_sectors = le32_to_cpu(rsp->max_discard_sectors); + dev->discard_granularity = le32_to_cpu(rsp->discard_granularity); + dev->discard_alignment = le32_to_cpu(rsp->discard_alignment); + dev->secure_discard = le16_to_cpu(rsp->secure_discard); + dev->rotational = rsp->rotational; + + dev->max_hw_sectors = sess->max_io_size / dev->logical_block_size;
The above statement looks suspicious to me. The unit of the second argument of blk_queue_max_hw_sectors() is 512 bytes. Since dev->max_hw_sectors is passed as the second argument to blk_queue_max_hw_sectors() I think it should also have 512 bytes as unit instead of the logical block size.
+static int rnbd_clt_change_capacity(struct rnbd_clt_dev *dev, + size_t new_nsectors) +{ + int err = 0; + + rnbd_clt_info(dev, "Device size changed from %zu to %zu sectors\n", + dev->nsectors, new_nsectors); + dev->nsectors = new_nsectors; + set_capacity(dev->gd, + dev->nsectors * (dev->logical_block_size / + SECTOR_SIZE)); + err = revalidate_disk(dev->gd); + if (err) + rnbd_clt_err(dev, + "Failed to change device size from %zu to %zu, err: %d\n", + dev->nsectors, new_nsectors, err); + return err; +}
Please document the unit of nsectors in struct rnbd_clt_dev. Please also document the unit of the 'new_nsectors' argument.
The set_capacity() call can only be correct if the unit of dev->nsectors is one logical block. Is that really the case?
+static void msg_io_conf(void *priv, int errno) +{ + struct rnbd_iu *iu = priv; + struct rnbd_clt_dev *dev = iu->dev; + struct request *rq = iu->rq; + + iu->status = errno ? BLK_STS_IOERR : BLK_STS_OK; + + blk_mq_complete_request(rq); + + if (errno) + rnbd_clt_info_rl(dev, "%s I/O failed with err: %d\n", + rq_data_dir(rq) == READ ? "read" : "write", + errno); +}
Accessing 'rq' after having called blk_mq_complete_request() may trigger a use-after-free. Please don't do that.
+static void wait_for_rtrs_disconnection(struct rnbd_clt_session *sess) +__releases(&sess_lock) +__acquires(&sess_lock)
Please indent __releases() and __acquires() annotations.
+{ + DEFINE_WAIT_FUNC(wait, autoremove_wake_function); + + prepare_to_wait(&sess->rtrs_waitq, &wait, TASK_UNINTERRUPTIBLE); + if (IS_ERR_OR_NULL(sess->rtrs)) { + finish_wait(&sess->rtrs_waitq, &wait); + return; + } + mutex_unlock(&sess_lock); + /* After unlock session can be freed, so careful */ + schedule(); + mutex_lock(&sess_lock); +}
How can a function that calls schedule() and that is not surrounded by a loop be correct? What if e.g. schedule() finishes due to a spurious wakeup?
+static struct rnbd_clt_session *__find_and_get_sess(const char *sessname) +__releases(&sess_lock) +__acquires(&sess_lock) +{ + struct rnbd_clt_session *sess; + int err; + +again: + list_for_each_entry(sess, &sess_list, list) { + if (strcmp(sessname, sess->sessname)) + continue; + + if (unlikely(sess->rtrs_ready && IS_ERR_OR_NULL(sess->rtrs))) + /* + * No RTRS connection, session is dying. + */ + continue; + + if (likely(rnbd_clt_get_sess(sess))) { + /* + * Alive session is found, wait for RTRS connection. + */ + mutex_unlock(&sess_lock); + err = wait_for_rtrs_connection(sess); + if (unlikely(err)) + rnbd_clt_put_sess(sess); + mutex_lock(&sess_lock); + + if (unlikely(err)) + /* Session is dying, repeat the loop */ + goto again; + + return sess; + } + /* + * Ref is 0, session is dying, wait for RTRS disconnect + * in order to avoid session names clashes. + */ + wait_for_rtrs_disconnection(sess); + /* + * RTRS is disconnected and soon session will be freed, + * so repeat a loop. + */ + goto again; + } + + return NULL; +}
Since wait_for_rtrs_disconnection() unlocks sess_lock, can the list_for_each_entry() above trigger a use-after-free of sess->next?
+static size_t rnbd_clt_get_sg_size(struct scatterlist *sglist, u32 len) +{ + struct scatterlist *sg; + size_t tsize = 0; + int i; + + for_each_sg(sglist, sg, len, i) + tsize += sg->length; + return tsize; +}
Please follow the example of other block drivers and use blk_rq_bytes() instead of iterating over the sg-list.
+static int setup_mq_tags(struct rnbd_clt_session *sess) +{ + struct blk_mq_tag_set *tags = &sess->tag_set; + + memset(tags, 0, sizeof(*tags)); + tags->ops = &rnbd_mq_ops; + tags->queue_depth = sess->queue_depth; + tags->numa_node = NUMA_NO_NODE; + tags->flags = BLK_MQ_F_SHOULD_MERGE | + BLK_MQ_F_TAG_SHARED; + tags->cmd_size = sizeof(struct rnbd_iu); + tags->nr_hw_queues = num_online_cpus(); + + return blk_mq_alloc_tag_set(tags); +}
Please change the name of the "tags" pointer into "tag_set".
+static int index_to_minor(int index) +{ + return index << RNBD_PART_BITS; +} + +static int minor_to_index(int minor) +{ + return minor >> RNBD_PART_BITS; +}
Is it useful to introduce functions that encapsulate a single shift operation?
+ blk_queue_virt_boundary(dev->queue, 4095);
The virt_boundary parameter must match the RDMA memory registration page size. Please introduce a symbolic constant for the RDMA memory registration page size such that these two parameters stay in sync in case anyone would want to change the memory registration page size.
+static void rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, int idx) +{ + dev->gd->major = rnbd_client_major; + dev->gd->first_minor = index_to_minor(idx); + dev->gd->fops = &rnbd_client_ops; + dev->gd->queue = dev->queue; + dev->gd->private_data = dev; + snprintf(dev->gd->disk_name, sizeof(dev->gd->disk_name), "rnbd%d", + idx); + pr_debug("disk_name=%s, capacity=%zu\n", + dev->gd->disk_name, + dev->nsectors * (dev->logical_block_size / SECTOR_SIZE) + ); + + set_capacity(dev->gd, dev->nsectors * (dev->logical_block_size / + SECTOR_SIZE));
Again, what is the unit of dev->nsectors?
+static void rnbd_clt_add_gen_disk(struct rnbd_clt_dev *dev) +{ + add_disk(dev->gd); +}
Is it useful to introduce this wrapper around add_disk()? Thanks, Bart.