Re: [RFC PATCH 07/13] scsi: scsi_dh: ufshpb: Add ufshpb state machine

Bart Van Assche <bvanassche@xxxxxxx> · Fri, 15 May 2020 19:44:20 -0700

On 2020-05-15 03:30, Avri Altman wrote:
> @@ -17,6 +18,13 @@
>  
>  #define UFSHPB_NAME	"ufshpb"
>  
> +#define UFSHPB_WRITE_BUFFER (0xfa)
> +#define WRITE_BUFFER_TIMEOUT (3 * HZ)
> +#define WRITE_BUFFER_RETRIES (3)
> +#define UFSHPB_READ_BUFFER (0xf9)
> +#define READ_BUFFER_TIMEOUT (3 * HZ)
> +#define READ_BUFFER_RETRIES (3)

Parentheses around expressions are normal but parentheses around
constants are unusual. I think the parentheses around constants can be
left out.

> +#define to_subregion() (container_of(work, struct ufshpb_subregion, hpb_work))

Could this have been defined as an inline function?

> @@ -76,6 +118,7 @@ struct ufshpb_subregion {
>   * @writes - sum over subregions @writes
>   * @region - region index
>   * @active_subregions - actual active subregions
> + * @evicted - to indicated if this region is currently being evicted
>   */
>  struct ufshpb_region {
>  	struct ufshpb_subregion *subregion_tbl;
> @@ -85,6 +128,7 @@ struct ufshpb_region {
>  	unsigned int region;
>  
>  	atomic_t active_subregions;
> +	atomic_t evicted;
>  };

Declaring a state variable as atomic_t is unusual. How are changes of
the @evicted member variable serialized?

>  /**
> @@ -93,6 +137,7 @@ struct ufshpb_region {
>   * @lh_map_ctx - list head of mapping context
>   * @map_list_lock - to protect mapping context list operations
>   * @region_tbl - regions/subregions table
> + * @pinned_map - to mark pinned regions
>   * @sdev - scsi device for that lun
>   * @regions_per_lun
>   * @subregions_per_lun - lun size is not guaranteed to be region aligned
> @@ -105,6 +150,7 @@ struct ufshpb_dh_lun {
>  	struct list_head lh_map_ctx;
>  	spinlock_t map_list_lock;
>  	struct ufshpb_region *region_tbl;
> +	unsigned long *pinned_map;
>  	struct scsi_device *sdev;
>  
>  	unsigned int regions_per_lun;
> @@ -113,6 +159,10 @@ struct ufshpb_dh_lun {
>  	unsigned int max_active_regions;
>  
>  	atomic_t active_regions;
> +
> +	struct mutex eviction_lock;
> +
> +	struct workqueue_struct *wq;
>  };

Please document what the eviction_lock protects.

> +static inline void ufshpb_set_write_buf_cmd(unsigned char *cmd,
> +					    unsigned int region)
> +{
> +	cmd[0] = UFSHPB_WRITE_BUFFER;
> +	cmd[1] = 0x01;
> +	put_unaligned_be16(region, &cmd[2]);
> +}

Please follow the example of the sd driver and use the verb "setup"
instead of "set" for functions that initialize a SCSI CDB.

> +static int ufshpb_submit_write_buf_cmd(struct scsi_device *sdev,
> +				       unsigned int region)
> +{
> +	unsigned char cmd[10] = {};
> +	struct scsi_sense_hdr sshdr = {};
> +	u64 flags = REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT |
> +		    REQ_FAILFAST_DRIVER;
> +	int timeout = WRITE_BUFFER_TIMEOUT;
> +	int cmd_retries = WRITE_BUFFER_RETRIES;
> +	int ret = 0;
> +
> +	ufshpb_set_write_buf_cmd(cmd, region);
> +
> +	ret = scsi_execute(sdev, cmd, DMA_NONE, NULL, 0, NULL, &sshdr,
> +			   timeout, cmd_retries, flags, 0, NULL);
> +
> +	/* HPB spec does not define any error handling */
> +	sdev_printk(KERN_INFO, sdev, "%s: WRITE_BUFFER %s result %d\n",
> +		    UFSHPB_NAME, ret ? "failed" : "succeeded", ret);
> +
> +	return ret;
> +}

I don't think that unconditionally printing the result of the WRITE
BUFFER command is acceptable. How about only reporting failures?

> +static void ufshpb_set_read_buf_cmd(unsigned char *cmd, unsigned int region,
> +				    unsigned int subregion,
> +				    unsigned int alloc_len)
> +{
> +	cmd[0] = UFSHPB_READ_BUFFER;
> +	cmd[1] = 0x01;
> +	put_unaligned_be16(region, &cmd[2]);
> +	put_unaligned_be16(subregion, &cmd[4]);
> +
> +	cmd[6] = alloc_len >> 16;
> +	cmd[7] = (alloc_len >> 8) & 0xff;
> +	cmd[8] = alloc_len & 0xff;
> +	cmd[9] = 0x00;
> +}

Please use put_unaligned_be24() instead of open-coding it.

> +static int ufshpb_subregion_alloc_pages(struct ufshpb_dh_lun *hpb,
> +					struct ufshpb_subregion *s)
> +{
> +	struct ufshpb_map_ctx *mctx;
> +
> +	spin_lock(&hpb->map_list_lock);
> +	mctx = list_first_entry_or_null(&hpb->lh_map_ctx,
> +					struct ufshpb_map_ctx, list);
> +	if (!mctx) {
> +		spin_unlock(&hpb->map_list_lock);
> +		return -EINVAL;
> +	}
> +
> +	list_del_init(&mctx->list);
> +	spin_unlock(&hpb->map_list_lock);
> +
> +	s->mctx = mctx;
> +	mctx->pages = (char *)__get_free_pages(GFP_KERNEL, order);
> +	if (!mctx->pages)
> +		return -ENOMEM;
> +
> +	return 0;
> +}

Relying on higher order pages is not acceptable because memory gets
fragmented easily. See also
https://elinux.org/images/a/a8/Controlling_Linux_Memory_Fragmentation_and_Higher_Order_Allocation_Failure-_Analysis%2C_Observations_and_Results.pdf.

> +	hpb->pinned_map = kcalloc(BITS_TO_LONGS(hpb->regions_per_lun),
> +				  sizeof(unsigned long), GFP_KERNEL);

Is this perhaps an open-coded version of bitmap_alloc()? If so, please
use bitmap_alloc() instead.

> +	snprintf(wq_name, ARRAY_SIZE(wq_name), "ufshpb_wq_%d", sdev->id);
> +	wq = alloc_workqueue(wq_name, WQ_HIGHPRI, WQ_MAX_ACTIVE);
> +	if (!wq) {
> +		ret = -ENOMEM;
> +		goto out_free;
> +	}

What is the purpose of the ufshpb_wq_%d workqueues? Why allocate
dedicated workqueues instead of using one of the existing system
workqueues? If the scsi_execute() calls were changed into
asynchronous SCSI command submission, would these workqueues still be
necessary?

Thanks,

Bart.