Re: [PATCH 1/3] vhost-scsi: Fix alignment handling with windows

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Wed, May 24, 2023 at 06:34:05PM -0500, Mike Christie wrote:
> The linux block layer requires bios/requests to have lengths with a 512
> byte alignment. Some drivers/layers like dm-crypt and the directi IO code
> will test for it and just fail. Other drivers like SCSI just assume the
> requirement is met and will end up in infinte retry loops. The problem
> for drivers like SCSI is that it uses functions like blk_rq_cur_sectors
> and blk_rq_sectors which divide the request's length by 512. If there's
> lefovers then it just gets dropped. But other code in the block/scsi
> layer may use blk_rq_bytes/blk_rq_cur_bytes and end up thinking there is
> still data left and try to retry the cmd. We can then end up getting
> stuck in retry loops where part of the block/scsi thinks there is data
> left, but other parts think we want to do IOs of zero length.
> 
> Linux will always check for alignment, but windows will not. When
> vhost-scsi then translates the iovec it gets from a windows guest to a
> scatterlist, we can end up with sg items where the sg->length is not
> divisible by 512 due to the misaligned offset:
> 
> sg[0].offset = 255;
> sg[0].length = 3841;
> sg...
> sg[N].offset = 0;
> sg[N].length = 255;
> 
> When the lio backends then convert the SG to bios or other iovecs, we
> end up sending them with the same misaligned values and can hit the
> issues above.
> 
> This just has us drop down to allocating a temp page and copying the data
> when this happens. Because this adds a check that needs to loop over the
> iovec in the main IO path, this patch adds an attribute that can be turned
> on for just windows.
> 
> Signed-off-by: Mike Christie <michael.christie@xxxxxxxxxx>

I am assuming this triggers rarely, yes?

If so I would like to find a way to avoid setting an attribute.

I am guessing the main cost is an extra scan of iov.
vhost_scsi_iov_to_sgl already does a scan, how about changing
it to fail if misaligned?
We can then detect the relevant error code and try with a copy.

WDYT?

> ---
>  drivers/vhost/scsi.c | 174 +++++++++++++++++++++++++++++++++++++------
>  1 file changed, 151 insertions(+), 23 deletions(-)
> 
> diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
> index bb10fa4bb4f6..dbad8fb579eb 100644
> --- a/drivers/vhost/scsi.c
> +++ b/drivers/vhost/scsi.c
> @@ -25,6 +25,8 @@
>  #include <linux/fs.h>
>  #include <linux/vmalloc.h>
>  #include <linux/miscdevice.h>
> +#include <linux/blk_types.h>
> +#include <linux/bio.h>
>  #include <asm/unaligned.h>
>  #include <scsi/scsi_common.h>
>  #include <scsi/scsi_proto.h>
> @@ -75,6 +77,9 @@ struct vhost_scsi_cmd {
>  	u32 tvc_prot_sgl_count;
>  	/* Saved unpacked SCSI LUN for vhost_scsi_target_queue_cmd() */
>  	u32 tvc_lun;
> +	u32 copied_iov:1;
> +	const void *saved_iter_addr;
> +	struct iov_iter saved_iter;
>  	/* Pointer to the SGL formatted memory from virtio-scsi */
>  	struct scatterlist *tvc_sgl;
>  	struct scatterlist *tvc_prot_sgl;
> @@ -107,6 +112,7 @@ struct vhost_scsi_nexus {
>  struct vhost_scsi_tpg {
>  	/* Vhost port target portal group tag for TCM */
>  	u16 tport_tpgt;
> +	bool check_io_alignment;
>  	/* Used to track number of TPG Port/Lun Links wrt to explict I_T Nexus shutdown */
>  	int tv_tpg_port_count;
>  	/* Used for vhost_scsi device reference to tpg_nexus, protected by tv_tpg_mutex */
> @@ -328,8 +334,13 @@ static void vhost_scsi_release_cmd_res(struct se_cmd *se_cmd)
>  	int i;
>  
>  	if (tv_cmd->tvc_sgl_count) {
> -		for (i = 0; i < tv_cmd->tvc_sgl_count; i++)
> -			put_page(sg_page(&tv_cmd->tvc_sgl[i]));
> +		for (i = 0; i < tv_cmd->tvc_sgl_count; i++) {
> +			if (tv_cmd->copied_iov)
> +				__free_page(sg_page(&tv_cmd->tvc_sgl[i]));
> +			else
> +				put_page(sg_page(&tv_cmd->tvc_sgl[i]));
> +		}
> +		kfree(tv_cmd->saved_iter_addr);
>  	}
>  	if (tv_cmd->tvc_prot_sgl_count) {
>  		for (i = 0; i < tv_cmd->tvc_prot_sgl_count; i++)
> @@ -502,6 +513,27 @@ static void vhost_scsi_evt_work(struct vhost_work *work)
>  	mutex_unlock(&vq->mutex);
>  }
>  
> +static int vhost_scsi_copy_sgl_to_iov(struct vhost_scsi_cmd *cmd)
> +{
> +	struct iov_iter *iter = &cmd->saved_iter;
> +	struct scatterlist *sg = cmd->tvc_sgl;
> +	struct page *page;
> +	size_t len;
> +	int i;
> +
> +	for (i = 0; i < cmd->tvc_sgl_count; i++) {
> +		page = sg_page(&sg[i]);
> +		len = sg[i].length;
> +
> +		if (copy_page_to_iter(page, 0, len, iter) != len) {
> +			pr_err("Could not copy data. Error %lu\n", len);
> +			return -1;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
>  /* Fill in status and signal that we are done processing this command
>   *
>   * This is scheduled in the vhost work queue so we are called with the owner
> @@ -525,15 +557,20 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work)
>  
>  		pr_debug("%s tv_cmd %p resid %u status %#02x\n", __func__,
>  			cmd, se_cmd->residual_count, se_cmd->scsi_status);
> -
>  		memset(&v_rsp, 0, sizeof(v_rsp));
> -		v_rsp.resid = cpu_to_vhost32(cmd->tvc_vq, se_cmd->residual_count);
> -		/* TODO is status_qualifier field needed? */
> -		v_rsp.status = se_cmd->scsi_status;
> -		v_rsp.sense_len = cpu_to_vhost32(cmd->tvc_vq,
> -						 se_cmd->scsi_sense_length);
> -		memcpy(v_rsp.sense, cmd->tvc_sense_buf,
> -		       se_cmd->scsi_sense_length);
> +
> +		if (cmd->saved_iter_addr && vhost_scsi_copy_sgl_to_iov(cmd)) {
> +			v_rsp.response = VIRTIO_SCSI_S_BAD_TARGET;
> +		} else {
> +			v_rsp.resid = cpu_to_vhost32(cmd->tvc_vq,
> +						     se_cmd->residual_count);
> +			/* TODO is status_qualifier field needed? */
> +			v_rsp.status = se_cmd->scsi_status;
> +			v_rsp.sense_len = cpu_to_vhost32(cmd->tvc_vq,
> +							 se_cmd->scsi_sense_length);
> +			memcpy(v_rsp.sense, cmd->tvc_sense_buf,
> +			       se_cmd->scsi_sense_length);
> +		}
>  
>  		iov_iter_init(&iov_iter, ITER_DEST, cmd->tvc_resp_iov,
>  			      cmd->tvc_in_iovs, sizeof(v_rsp));
> @@ -682,7 +719,52 @@ vhost_scsi_iov_to_sgl(struct vhost_scsi_cmd *cmd, bool write,
>  }
>  
>  static int
> -vhost_scsi_mapal(struct vhost_scsi_cmd *cmd,
> +vhost_scsi_copy_iov_to_sgl(struct vhost_scsi_cmd *cmd, struct iov_iter *iter,
> +			   struct scatterlist *sg, int sg_count)
> +{
> +	size_t len = iov_iter_count(iter);
> +	unsigned int nbytes = 0;
> +	struct page *page;
> +	int i;
> +
> +	if (cmd->tvc_data_direction == DMA_FROM_DEVICE) {
> +		cmd->saved_iter_addr = dup_iter(&cmd->saved_iter, iter,
> +						GFP_KERNEL);
> +		if (!cmd->saved_iter_addr)
> +			return -ENOMEM;
> +	}
> +
> +	for (i = 0; i < sg_count; i++) {
> +		page = alloc_page(GFP_KERNEL);
> +		if (!page) {
> +			i--;
> +			goto err;
> +		}
> +
> +		nbytes = min_t(unsigned int, PAGE_SIZE, len);
> +		sg_set_page(&sg[i], page, nbytes, 0);
> +
> +		if (cmd->tvc_data_direction == DMA_TO_DEVICE &&
> +		    copy_page_from_iter(page, 0, nbytes, iter) != nbytes)
> +			goto err;
> +
> +		len -= nbytes;
> +	}
> +
> +	cmd->copied_iov = 1;
> +	return 0;
> +
> +err:
> +	pr_err("Could not read %u bytes\n", nbytes);
> +
> +	for (; i >= 0; i--)
> +		__free_page(sg_page(&sg[i]));
> +	kfree(cmd->saved_iter_addr);
> +	return -ENOMEM;
> +}
> +
> +static int
> +vhost_scsi_mapal(struct vhost_scsi_tpg *tpg, struct vhost_scsi_cmd *cmd,
>  		 size_t prot_bytes, struct iov_iter *prot_iter,
>  		 size_t data_bytes, struct iov_iter *data_iter)
>  {
> @@ -703,10 +785,8 @@ vhost_scsi_mapal(struct vhost_scsi_cmd *cmd,
>  		ret = vhost_scsi_iov_to_sgl(cmd, write, prot_iter,
>  					    cmd->tvc_prot_sgl,
>  					    cmd->tvc_prot_sgl_count);
> -		if (ret < 0) {
> -			cmd->tvc_prot_sgl_count = 0;
> -			return ret;
> -		}
> +		if (ret < 0)
> +			goto map_fail;
>  	}
>  	sgl_count = vhost_scsi_calc_sgls(data_iter, data_bytes,
>  					 VHOST_SCSI_PREALLOC_SGLS);
> @@ -717,14 +797,32 @@ vhost_scsi_mapal(struct vhost_scsi_cmd *cmd,
>  	cmd->tvc_sgl_count = sgl_count;
>  	pr_debug("%s data_sg %p data_sgl_count %u\n", __func__,
>  		  cmd->tvc_sgl, cmd->tvc_sgl_count);
> +	/*
> +	 * The block layer requires bios/requests to be a multiple of 512 bytes,
> +	 * but Windows can send us vecs that are misaligned. This can result
> +	 * in bios and later requests with misaligned sizes if we have to break
> +	 * up a cmd into multiple bios.
> +	 *
> +	 * We currently only break up a command into multiple bios if we hit
> +	 * the vec/seg limit, so check if our sgl_count is greater than the max
> +	 * and if a vec in the cmd has a misaligned size.
> +	 */
> +	if (tpg->check_io_alignment &&
> +	    (!iov_iter_is_aligned(data_iter, 0, SECTOR_SIZE - 1) &&
> +	     bio_max_segs(sgl_count) != sgl_count))
> +		ret = vhost_scsi_copy_iov_to_sgl(cmd, data_iter, cmd->tvc_sgl,
> +						 cmd->tvc_sgl_count);
> +	else
> +		ret = vhost_scsi_iov_to_sgl(cmd, write, data_iter,
> +					    cmd->tvc_sgl, cmd->tvc_sgl_count);
> +	if (ret)
> +		goto map_fail;
>  
> -	ret = vhost_scsi_iov_to_sgl(cmd, write, data_iter,
> -				    cmd->tvc_sgl, cmd->tvc_sgl_count);
> -	if (ret < 0) {
> -		cmd->tvc_sgl_count = 0;
> -		return ret;
> -	}
>  	return 0;
> +
> +map_fail:
> +	cmd->tvc_sgl_count = 0;
> +	return ret;
>  }
>  
>  static int vhost_scsi_to_tcm_attr(int attr)
> @@ -1077,7 +1175,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
>  			 " %d\n", cmd, exp_data_len, prot_bytes, data_direction);
>  
>  		if (data_direction != DMA_NONE) {
> -			if (unlikely(vhost_scsi_mapal(cmd, prot_bytes,
> +			if (unlikely(vhost_scsi_mapal(tpg, cmd, prot_bytes,
>  						      &prot_iter, exp_data_len,
>  						      &data_iter))) {
>  				vq_err(vq, "Failed to map iov to sgl\n");
> @@ -2068,11 +2166,41 @@ static ssize_t vhost_scsi_tpg_attrib_fabric_prot_type_show(
>  
>  	return sysfs_emit(page, "%d\n", tpg->tv_fabric_prot_type);
>  }
> -
>  CONFIGFS_ATTR(vhost_scsi_tpg_attrib_, fabric_prot_type);
>  
> +static ssize_t
> +vhost_scsi_tpg_attrib_check_io_alignment_store(struct config_item *item,
> +					       const char *page, size_t count)
> +{
> +	struct se_portal_group *se_tpg = attrib_to_tpg(item);
> +	struct vhost_scsi_tpg *tpg = container_of(se_tpg, struct vhost_scsi_tpg,
> +						  se_tpg);
> +	bool val;
> +	int ret;
> +
> +	ret = kstrtobool(page, &val);
> +	if (ret)
> +		return ret;
> +
> +	tpg->check_io_alignment = val;
> +	return count;
> +}
> +
> +static ssize_t
> +vhost_scsi_tpg_attrib_check_io_alignment_show(struct config_item *item,
> +					      char *page)
> +{
> +	struct se_portal_group *se_tpg = attrib_to_tpg(item);
> +	struct vhost_scsi_tpg *tpg = container_of(se_tpg, struct vhost_scsi_tpg,
> +						  se_tpg);
> +
> +	return sysfs_emit(page, "%d\n", tpg->check_io_alignment);
> +}
> +CONFIGFS_ATTR(vhost_scsi_tpg_attrib_, check_io_alignment);
> +
>  static struct configfs_attribute *vhost_scsi_tpg_attrib_attrs[] = {
>  	&vhost_scsi_tpg_attrib_attr_fabric_prot_type,
> +	&vhost_scsi_tpg_attrib_attr_check_io_alignment,
>  	NULL,
>  };
>  
> -- 
> 2.25.1

_______________________________________________
Virtualization mailing list
Virtualization@xxxxxxxxxxxxxxxxxxxxxxxxxx
https://lists.linuxfoundation.org/mailman/listinfo/virtualization



[Index of Archives]     [KVM Development]     [Libvirt Development]     [Libvirt Users]     [CentOS Virtualization]     [Netdev]     [Ethernet Bridging]     [Linux Wireless]     [Kernel Newbies]     [Security]     [Linux for Hams]     [Netfilter]     [Bugtraq]     [Yosemite Forum]     [MIPS Linux]     [ARM Linux]     [Linux RAID]     [Linux Admin]     [Samba]

  Powered by Linux