On 2020-05-15 03:30, Avri Altman wrote: > @@ -17,6 +18,13 @@ > > #define UFSHPB_NAME "ufshpb" > > +#define UFSHPB_WRITE_BUFFER (0xfa) > +#define WRITE_BUFFER_TIMEOUT (3 * HZ) > +#define WRITE_BUFFER_RETRIES (3) > +#define UFSHPB_READ_BUFFER (0xf9) > +#define READ_BUFFER_TIMEOUT (3 * HZ) > +#define READ_BUFFER_RETRIES (3) Parentheses around expressions are normal but parentheses around constants are unusual. I think the parentheses around constants can be left out. > +#define to_subregion() (container_of(work, struct ufshpb_subregion, hpb_work)) Could this have been defined as an inline function? > @@ -76,6 +118,7 @@ struct ufshpb_subregion { > * @writes - sum over subregions @writes > * @region - region index > * @active_subregions - actual active subregions > + * @evicted - to indicated if this region is currently being evicted > */ > struct ufshpb_region { > struct ufshpb_subregion *subregion_tbl; > @@ -85,6 +128,7 @@ struct ufshpb_region { > unsigned int region; > > atomic_t active_subregions; > + atomic_t evicted; > }; Declaring a state variable as atomic_t is unusual. How are changes of the @evicted member variable serialized? > /** > @@ -93,6 +137,7 @@ struct ufshpb_region { > * @lh_map_ctx - list head of mapping context > * @map_list_lock - to protect mapping context list operations > * @region_tbl - regions/subregions table > + * @pinned_map - to mark pinned regions > * @sdev - scsi device for that lun > * @regions_per_lun > * @subregions_per_lun - lun size is not guaranteed to be region aligned > @@ -105,6 +150,7 @@ struct ufshpb_dh_lun { > struct list_head lh_map_ctx; > spinlock_t map_list_lock; > struct ufshpb_region *region_tbl; > + unsigned long *pinned_map; > struct scsi_device *sdev; > > unsigned int regions_per_lun; > @@ -113,6 +159,10 @@ struct ufshpb_dh_lun { > unsigned int max_active_regions; > > atomic_t active_regions; > + > + struct mutex eviction_lock; > + > + struct workqueue_struct *wq; > }; Please document what the eviction_lock protects. 
> +static inline void ufshpb_set_write_buf_cmd(unsigned char *cmd, > + unsigned int region) > +{ > + cmd[0] = UFSHPB_WRITE_BUFFER; > + cmd[1] = 0x01; > + put_unaligned_be16(region, &cmd[2]); > +} Please follow the example of the sd driver and use the verb "setup" instead of "set" for functions that initialize a SCSI CDB. > +static int ufshpb_submit_write_buf_cmd(struct scsi_device *sdev, > + unsigned int region) > +{ > + unsigned char cmd[10] = {}; > + struct scsi_sense_hdr sshdr = {}; > + u64 flags = REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | > + REQ_FAILFAST_DRIVER; > + int timeout = WRITE_BUFFER_TIMEOUT; > + int cmd_retries = WRITE_BUFFER_RETRIES; > + int ret = 0; > + > + ufshpb_set_write_buf_cmd(cmd, region); > + > + ret = scsi_execute(sdev, cmd, DMA_NONE, NULL, 0, NULL, &sshdr, > + timeout, cmd_retries, flags, 0, NULL); > + > + /* HPB spec does not define any error handling */ > + sdev_printk(KERN_INFO, sdev, "%s: WRITE_BUFFER %s result %d\n", > + UFSHPB_NAME, ret ? "failed" : "succeeded", ret); > + > + return ret; > +} I don't think that unconditionally printing the result of the WRITE BUFFER command is acceptable. How about only reporting failures? > +static void ufshpb_set_read_buf_cmd(unsigned char *cmd, unsigned int region, > + unsigned int subregion, > + unsigned int alloc_len) > +{ > + cmd[0] = UFSHPB_READ_BUFFER; > + cmd[1] = 0x01; > + put_unaligned_be16(region, &cmd[2]); > + put_unaligned_be16(subregion, &cmd[4]); > + > + cmd[6] = alloc_len >> 16; > + cmd[7] = (alloc_len >> 8) & 0xff; > + cmd[8] = alloc_len & 0xff; > + cmd[9] = 0x00; > +} Please use put_unaligned_be24() instead of open-coding it. 
> +static int ufshpb_subregion_alloc_pages(struct ufshpb_dh_lun *hpb, > + struct ufshpb_subregion *s) > +{ > + struct ufshpb_map_ctx *mctx; > + > + spin_lock(&hpb->map_list_lock); > + mctx = list_first_entry_or_null(&hpb->lh_map_ctx, > + struct ufshpb_map_ctx, list); > + if (!mctx) { > + spin_unlock(&hpb->map_list_lock); > + return -EINVAL; > + } > + > + list_del_init(&mctx->list); > + spin_unlock(&hpb->map_list_lock); > + > + s->mctx = mctx; > + mctx->pages = (char *)__get_free_pages(GFP_KERNEL, order); > + if (!mctx->pages) > + return -ENOMEM; > + > + return 0; > +} Relying on higher-order pages is not acceptable because memory gets fragmented easily. See also https://elinux.org/images/a/a8/Controlling_Linux_Memory_Fragmentation_and_Higher_Order_Allocation_Failure-_Analysis%2C_Observations_and_Results.pdf. > + hpb->pinned_map = kcalloc(BITS_TO_LONGS(hpb->regions_per_lun), > + sizeof(unsigned long), GFP_KERNEL); Is this perhaps an open-coded version of bitmap_alloc()? If so, please use bitmap_alloc() instead. > + snprintf(wq_name, ARRAY_SIZE(wq_name), "ufshpb_wq_%d", sdev->id); > + wq = alloc_workqueue(wq_name, WQ_HIGHPRI, WQ_MAX_ACTIVE); > + if (!wq) { > + ret = -ENOMEM; > + goto out_free; > + } What is the purpose of the ufshpb_wq_%d workqueues? Why allocate dedicated workqueues instead of using one of the existing system workqueues? If the scsi_execute() calls were changed to asynchronous SCSI command submission, would these workqueues still be necessary? Thanks, Bart.