The Linux block layer requires bios/requests to have lengths that are aligned to 512 bytes. Some drivers/layers like dm-crypt and the direct IO code will test for it and just fail. Other drivers like SCSI just assume the requirement is met and will end up in infinite retry loops.

The problem for drivers like SCSI is that they use functions like blk_rq_cur_sectors and blk_rq_sectors, which divide the request's length by 512. If there are leftover bytes they just get dropped. But other code in the block/SCSI layer may use blk_rq_bytes/blk_rq_cur_bytes and end up thinking there is still data left and try to retry the cmd. We can then get stuck in retry loops where part of the block/SCSI layer thinks there is data left, but other parts think we want to do IOs of zero length.

Linux will always check for alignment, but Windows will not. When vhost-scsi then translates the iovec it gets from a Windows guest to a scatterlist, we can end up with sg items where sg->length is not divisible by 512 due to the misaligned offset:

sg[0].offset = 255;
sg[0].length = 3841;
sg...
sg[N].offset = 0;
sg[N].length = 255;

When the LIO backends then convert the SG to bios or other iovecs, we end up sending them with the same misaligned values and can hit the issues above.

This patch just has us drop down to allocating a temp page and copying the data when this happens. Because this adds a check that needs to loop over the iovec in the main IO path, the patch also adds an attribute that can be turned on for just Windows.
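To make the mismatch concrete, below is a small standalone userspace sketch (illustration only, not part of this patch). The sg_ent struct, the EX_PAGE_SIZE constant, and the page-walk loop are made-up stand-ins for the real iovec-to-scatterlist translation; they show how a buffer starting at page offset 255 gets split into sg lengths like 3841/4096/.../255, and how sector-based accounting (length >> 9) drops bytes that byte-based accounting still sees:

/*
 * Illustration only -- simplified userspace model of the iovec to
 * scatterlist split described above. Not kernel code.
 */
#include <stdio.h>
#include <stddef.h>

#define EX_PAGE_SIZE	4096
#define SECTOR_SHIFT	9

struct sg_ent {
	size_t offset;
	size_t length;
};

int main(void)
{
	size_t buf_off = 255;			/* misaligned start within the first page */
	size_t total = 3 * EX_PAGE_SIZE;	/* total length is a 512-byte multiple */
	struct sg_ent sg[8];
	size_t left = total, off = buf_off;
	int i, nents = 0;

	/* Split the buffer page by page, like the iovec-to-sgl translation. */
	while (left) {
		size_t chunk = EX_PAGE_SIZE - off;

		if (chunk > left)
			chunk = left;
		sg[nents].offset = off;
		sg[nents].length = chunk;
		nents++;
		left -= chunk;
		off = 0;
	}

	for (i = 0; i < nents; i++) {
		size_t len = sg[i].length;
		size_t sectors = len >> SECTOR_SHIFT;

		/* Sector-based math truncates; byte-based math does not. */
		printf("sg[%d].offset=%zu length=%zu sectors=%zu leftover_bytes=%zu\n",
		       i, sg[i].offset, len, sectors,
		       len - (sectors << SECTOR_SHIFT));
	}
	return 0;
}

Running it prints sg[0].length=3841 with 257 leftover bytes and sg[3].length=255 with 0 sectors, which is exactly the state where sector-based and byte-based accounting disagree.
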
Signed-off-by: Mike Christie <michael.christie@xxxxxxxxxx>
---
 drivers/vhost/scsi.c | 174 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 151 insertions(+), 23 deletions(-)

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index bb10fa4bb4f6..dbad8fb579eb 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -25,6 +25,8 @@
 #include <linux/fs.h>
 #include <linux/vmalloc.h>
 #include <linux/miscdevice.h>
+#include <linux/blk_types.h>
+#include <linux/bio.h>
 #include <asm/unaligned.h>
 #include <scsi/scsi_common.h>
 #include <scsi/scsi_proto.h>
@@ -75,6 +77,9 @@ struct vhost_scsi_cmd {
 	u32 tvc_prot_sgl_count;
 	/* Saved unpacked SCSI LUN for vhost_scsi_target_queue_cmd() */
 	u32 tvc_lun;
+	u32 copied_iov:1;
+	const void *saved_iter_addr;
+	struct iov_iter saved_iter;
 	/* Pointer to the SGL formatted memory from virtio-scsi */
 	struct scatterlist *tvc_sgl;
 	struct scatterlist *tvc_prot_sgl;
@@ -107,6 +112,7 @@ struct vhost_scsi_nexus {
 struct vhost_scsi_tpg {
 	/* Vhost port target portal group tag for TCM */
 	u16 tport_tpgt;
+	bool check_io_alignment;
 	/* Used to track number of TPG Port/Lun Links wrt to explict I_T Nexus shutdown */
 	int tv_tpg_port_count;
 	/* Used for vhost_scsi device reference to tpg_nexus, protected by tv_tpg_mutex */
@@ -328,8 +334,13 @@ static void vhost_scsi_release_cmd_res(struct se_cmd *se_cmd)
 	int i;
 
 	if (tv_cmd->tvc_sgl_count) {
-		for (i = 0; i < tv_cmd->tvc_sgl_count; i++)
-			put_page(sg_page(&tv_cmd->tvc_sgl[i]));
+		for (i = 0; i < tv_cmd->tvc_sgl_count; i++) {
+			if (tv_cmd->copied_iov)
+				__free_page(sg_page(&tv_cmd->tvc_sgl[i]));
+			else
+				put_page(sg_page(&tv_cmd->tvc_sgl[i]));
+		}
+		kfree(tv_cmd->saved_iter_addr);
 	}
 	if (tv_cmd->tvc_prot_sgl_count) {
 		for (i = 0; i < tv_cmd->tvc_prot_sgl_count; i++)
@@ -502,6 +513,27 @@ static void vhost_scsi_evt_work(struct vhost_work *work)
 	mutex_unlock(&vq->mutex);
 }
 
+static int vhost_scsi_copy_sgl_to_iov(struct vhost_scsi_cmd *cmd)
+{
+	struct iov_iter *iter = &cmd->saved_iter;
+	struct scatterlist *sg = cmd->tvc_sgl;
+	struct page *page;
+	size_t len;
+	int i;
+
+	for (i = 0; i < cmd->tvc_sgl_count; i++) {
+		page = sg_page(&sg[i]);
+		len = sg[i].length;
+
+		if (copy_page_to_iter(page, 0, len, iter) != len) {
+			pr_err("Could not copy data. Error %lu\n", len);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
 /* Fill in status and signal that we are done processing this command
  *
  * This is scheduled in the vhost work queue so we are called with the owner
@@ -525,15 +557,20 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work)
 
 	pr_debug("%s tv_cmd %p resid %u status %#02x\n", __func__, cmd,
 		 se_cmd->residual_count, se_cmd->scsi_status);
-	memset(&v_rsp, 0, sizeof(v_rsp));
-	v_rsp.resid = cpu_to_vhost32(cmd->tvc_vq, se_cmd->residual_count);
-	/* TODO is status_qualifier field needed? */
-	v_rsp.status = se_cmd->scsi_status;
-	v_rsp.sense_len = cpu_to_vhost32(cmd->tvc_vq,
-					 se_cmd->scsi_sense_length);
-	memcpy(v_rsp.sense, cmd->tvc_sense_buf,
-	       se_cmd->scsi_sense_length);
+
+	if (cmd->saved_iter_addr && vhost_scsi_copy_sgl_to_iov(cmd)) {
+		v_rsp.response = VIRTIO_SCSI_S_BAD_TARGET;
+	} else {
+		v_rsp.resid = cpu_to_vhost32(cmd->tvc_vq,
+					     se_cmd->residual_count);
+		/* TODO is status_qualifier field needed? */
+		v_rsp.status = se_cmd->scsi_status;
+		v_rsp.sense_len = cpu_to_vhost32(cmd->tvc_vq,
+						 se_cmd->scsi_sense_length);
+		memcpy(v_rsp.sense, cmd->tvc_sense_buf,
+		       se_cmd->scsi_sense_length);
+	}
 	iov_iter_init(&iov_iter, ITER_DEST, cmd->tvc_resp_iov,
 		      cmd->tvc_in_iovs, sizeof(v_rsp));
@@ -682,7 +719,52 @@ vhost_scsi_iov_to_sgl(struct vhost_scsi_cmd *cmd, bool write,
 }
 
 static int
-vhost_scsi_mapal(struct vhost_scsi_cmd *cmd,
+vhost_scsi_copy_iov_to_sgl(struct vhost_scsi_cmd *cmd, struct iov_iter *iter,
+			   struct scatterlist *sg, int sg_count)
+{
+	size_t len = iov_iter_count(iter);
+	unsigned int nbytes = 0;
+	struct page *page;
+	int i;
+
+	if (cmd->tvc_data_direction == DMA_FROM_DEVICE) {
+		cmd->saved_iter_addr = dup_iter(&cmd->saved_iter, iter,
+						GFP_KERNEL);
+		if (!cmd->saved_iter_addr)
+			return -ENOMEM;
+	}
+
+	for (i = 0; i < sg_count; i++) {
+		page = alloc_page(GFP_KERNEL);
+		if (!page) {
+			i--;
+			goto err;
+		}
+
+		nbytes = min_t(unsigned int, PAGE_SIZE, len);
+		sg_set_page(&sg[i], page, nbytes, 0);
+
+		if (cmd->tvc_data_direction == DMA_TO_DEVICE &&
+		    copy_page_from_iter(page, 0, nbytes, iter) != nbytes)
+			goto err;
+
+		len -= nbytes;
+	}
+
+	cmd->copied_iov = 1;
+	return 0;
+
+err:
+	pr_err("Could not read %u bytes\n", nbytes);
+
+	for (; i >= 0; i--)
+		__free_page(sg_page(&sg[i]));
+	kfree(cmd->saved_iter_addr);
+	return -ENOMEM;
+}
+
+static int
+vhost_scsi_mapal(struct vhost_scsi_tpg *tpg, struct vhost_scsi_cmd *cmd,
 		 size_t prot_bytes, struct iov_iter *prot_iter,
 		 size_t data_bytes, struct iov_iter *data_iter)
 {
@@ -703,10 +785,8 @@ vhost_scsi_mapal(struct vhost_scsi_cmd *cmd,
 		ret = vhost_scsi_iov_to_sgl(cmd, write, prot_iter,
 					    cmd->tvc_prot_sgl,
 					    cmd->tvc_prot_sgl_count);
-		if (ret < 0) {
-			cmd->tvc_prot_sgl_count = 0;
-			return ret;
-		}
+		if (ret < 0)
+			goto map_fail;
 	}
 	sgl_count = vhost_scsi_calc_sgls(data_iter, data_bytes,
 					 VHOST_SCSI_PREALLOC_SGLS);
@@ -717,14 +797,32 @@ vhost_scsi_mapal(struct vhost_scsi_cmd *cmd,
 	cmd->tvc_sgl_count = sgl_count;
 	pr_debug("%s data_sg %p data_sgl_count %u\n", __func__,
 		 cmd->tvc_sgl, cmd->tvc_sgl_count);
+	/*
+	 * The block layer requires bios/requests to be a multiple of 512 bytes,
+	 * but Windows can send us vecs that are misaligned. This can result
+	 * in bios and later requests with misaligned sizes if we have to break
+	 * up a cmd into multiple bios.
+	 *
+	 * We currently only break up a command into multiple bios if we hit
+	 * the vec/seg limit, so check if our sgl_count is greater than the max
+	 * and if a vec in the cmd has a misaligned size.
+	 */
+	if (tpg->check_io_alignment &&
+	    (!iov_iter_is_aligned(data_iter, 0, SECTOR_SIZE - 1) &&
+	     bio_max_segs(sgl_count) != sgl_count))
+		ret = vhost_scsi_copy_iov_to_sgl(cmd, data_iter, cmd->tvc_sgl,
+						 cmd->tvc_sgl_count);
+	else
+		ret = vhost_scsi_iov_to_sgl(cmd, write, data_iter,
+					    cmd->tvc_sgl, cmd->tvc_sgl_count);
+	if (ret)
+		goto map_fail;
 
-	ret = vhost_scsi_iov_to_sgl(cmd, write, data_iter,
-				    cmd->tvc_sgl, cmd->tvc_sgl_count);
-	if (ret < 0) {
-		cmd->tvc_sgl_count = 0;
-		return ret;
-	}
 	return 0;
+
+map_fail:
+	cmd->tvc_sgl_count = 0;
+	return ret;
 }
 
 static int vhost_scsi_to_tcm_attr(int attr)
@@ -1077,7 +1175,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 			 " %d\n", cmd, exp_data_len, prot_bytes, data_direction);
 
 		if (data_direction != DMA_NONE) {
-			if (unlikely(vhost_scsi_mapal(cmd, prot_bytes,
+			if (unlikely(vhost_scsi_mapal(tpg, cmd, prot_bytes,
 						      &prot_iter, exp_data_len,
 						      &data_iter))) {
 				vq_err(vq, "Failed to map iov to sgl\n");
@@ -2068,11 +2166,41 @@ static ssize_t vhost_scsi_tpg_attrib_fabric_prot_type_show(
 
 	return sysfs_emit(page, "%d\n", tpg->tv_fabric_prot_type);
 }
-
 CONFIGFS_ATTR(vhost_scsi_tpg_attrib_, fabric_prot_type);
 
+static ssize_t
+vhost_scsi_tpg_attrib_check_io_alignment_store(struct config_item *item,
+					       const char *page, size_t count)
+{
+	struct se_portal_group *se_tpg = attrib_to_tpg(item);
+	struct vhost_scsi_tpg *tpg = container_of(se_tpg, struct vhost_scsi_tpg,
+						  se_tpg);
+	bool val;
+	int ret;
+
+	ret = kstrtobool(page, &val);
+	if (ret)
+		return ret;
+
+	tpg->check_io_alignment = val;
+	return count;
+}
+
+static ssize_t
+vhost_scsi_tpg_attrib_check_io_alignment_show(struct config_item *item,
+					      char *page)
+{
+	struct se_portal_group *se_tpg = attrib_to_tpg(item);
+	struct vhost_scsi_tpg *tpg = container_of(se_tpg, struct vhost_scsi_tpg,
+						  se_tpg);
+
+	return sysfs_emit(page, "%d\n", tpg->check_io_alignment);
+}
+CONFIGFS_ATTR(vhost_scsi_tpg_attrib_, check_io_alignment);
+
 static struct configfs_attribute *vhost_scsi_tpg_attrib_attrs[] = {
 	&vhost_scsi_tpg_attrib_attr_fabric_prot_type,
+	&vhost_scsi_tpg_attrib_attr_check_io_alignment,
 	NULL,
 };
-- 
2.25.1

_______________________________________________
Virtualization mailing list
Virtualization@xxxxxxxxxxxxxxxxxxxxxxxxxx
https://lists.linuxfoundation.org/mailman/listinfo/virtualization