On 2018/04/16 18:04, Bart Van Assche wrote: > Since SCSI scanning occurs asynchronously, since sd_revalidate_disk() > is called from sd_probe_async() and since sd_revalidate_disk() calls > sd_zbc_read_zones() it can happen that sd_zbc_read_zones() is called > concurrently with blkdev_report_zones() and/or blkdev_reset_zones(). > That can cause these functions to fail with -EIO because > sd_zbc_read_zones() e.g. sets q->nr_zones to zero before restoring it > to the actual value, even if no drive characteristics have changed. > Avoid that this can happen by making the following changes: > - Protect the code that updates zone information with blk_queue_enter() > and blk_queue_exit(). > - Modify sd_zbc_setup_seq_zones_bitmap() and sd_zbc_setup() such that > these functions do not modify struct scsi_disk before all zone > information has been obtained. > > Note: since commit 055f6e18e08f ("block: Make q_usage_counter also > track legacy requests"; kernel v4.15) the request queue freezing > mechanism also affects legacy request queues. > > Fixes: 89d947561077 ("sd: Implement support for ZBC devices") > Signed-off-by: Bart Van Assche <bart.vanassche@xxxxxxx> > Cc: Jens Axboe <axboe@xxxxxxxxx> > Cc: Damien Le Moal <damien.lemoal@xxxxxxx> > Cc: Christoph Hellwig <hch@xxxxxx> > Cc: Hannes Reinecke <hare@xxxxxxxx> > Cc: stable@xxxxxxxxxxxxxxx # v4.10 > --- > drivers/scsi/sd_zbc.c | 140 +++++++++++++++++++++++++++++-------------------- > include/linux/blkdev.h | 5 ++ > 2 files changed, 87 insertions(+), 58 deletions(-) > > diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c > index 2d0c06f7db3e..323e3dc4bc59 100644 > --- a/drivers/scsi/sd_zbc.c > +++ b/drivers/scsi/sd_zbc.c > @@ -390,8 +390,10 @@ static int sd_zbc_check_capacity(struct scsi_disk *sdkp, unsigned char *buf) > * > * Check that all zones of the device are equal. The last zone can however > * be smaller. The zone size must also be a power of two number of LBAs. 
> + * > + * Returns the zone size in bytes upon success or an error code upon failure. > */ > -static int sd_zbc_check_zone_size(struct scsi_disk *sdkp) > +static s64 sd_zbc_check_zone_size(struct scsi_disk *sdkp) > { > u64 zone_blocks = 0; > sector_t block = 0; > @@ -402,8 +404,6 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp) > int ret; > u8 same; > > - sdkp->zone_blocks = 0; > - > /* Get a buffer */ > buf = kmalloc(SD_ZBC_BUF_SIZE, GFP_KERNEL); > if (!buf) > @@ -435,16 +435,17 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp) > > /* Parse zone descriptors */ > while (rec < buf + buf_len) { > - zone_blocks = get_unaligned_be64(&rec[8]); > - if (sdkp->zone_blocks == 0) { > - sdkp->zone_blocks = zone_blocks; > - } else if (zone_blocks != sdkp->zone_blocks && > - (block + zone_blocks < sdkp->capacity > - || zone_blocks > sdkp->zone_blocks)) { > - zone_blocks = 0; > + u64 this_zone_blocks = get_unaligned_be64(&rec[8]); > + > + if (zone_blocks == 0) { > + zone_blocks = this_zone_blocks; > + } else if (this_zone_blocks != zone_blocks && > + (block + this_zone_blocks < sdkp->capacity > + || this_zone_blocks > zone_blocks)) { > + this_zone_blocks = 0; > goto out; > } > - block += zone_blocks; > + block += this_zone_blocks; > rec += 64; > } > > @@ -457,8 +458,6 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp) > > } while (block < sdkp->capacity); > > - zone_blocks = sdkp->zone_blocks; > - > out: > if (!zone_blocks) { > if (sdkp->first_scan) > @@ -478,8 +477,7 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp) > "Zone size too large\n"); > ret = -ENODEV; > } else { > - sdkp->zone_blocks = zone_blocks; > - sdkp->zone_shift = ilog2(zone_blocks); > + ret = zone_blocks; > } > > out_free: > @@ -490,15 +488,14 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp) > > /** > * sd_zbc_alloc_zone_bitmap - Allocate a zone bitmap (one bit per zone). 
> - * @sdkp: The disk of the bitmap > + * @nr_zones: Number of zones to allocate space for. > + * @numa_node: NUMA node to allocate the memory from. > */ > -static inline unsigned long *sd_zbc_alloc_zone_bitmap(struct scsi_disk *sdkp) > +static inline unsigned long * > +sd_zbc_alloc_zone_bitmap(u32 nr_zones, int numa_node) > { > - struct request_queue *q = sdkp->disk->queue; > - > - return kzalloc_node(BITS_TO_LONGS(sdkp->nr_zones) > - * sizeof(unsigned long), > - GFP_KERNEL, q->node); > + return kzalloc_node(BITS_TO_LONGS(nr_zones) * sizeof(unsigned long), > + GFP_KERNEL, numa_node); > } > > /** > @@ -506,6 +503,7 @@ static inline unsigned long *sd_zbc_alloc_zone_bitmap(struct scsi_disk *sdkp) > * @sdkp: disk used > * @buf: report reply buffer > * @buflen: length of @buf > + * @zone_shift: logarithm base 2 of the number of blocks in a zone > * @seq_zones_bitmap: bitmap of sequential zones to set > * > * Parse reported zone descriptors in @buf to identify sequential zones and > @@ -515,7 +513,7 @@ static inline unsigned long *sd_zbc_alloc_zone_bitmap(struct scsi_disk *sdkp) > * Return the LBA after the last zone reported. > */ > static sector_t sd_zbc_get_seq_zones(struct scsi_disk *sdkp, unsigned char *buf, > - unsigned int buflen, > + unsigned int buflen, u32 zone_shift, > unsigned long *seq_zones_bitmap) > { > sector_t lba, next_lba = sdkp->capacity; > @@ -534,7 +532,7 @@ static sector_t sd_zbc_get_seq_zones(struct scsi_disk *sdkp, unsigned char *buf, > if (type != ZBC_ZONE_TYPE_CONV && > cond != ZBC_ZONE_COND_READONLY && > cond != ZBC_ZONE_COND_OFFLINE) > - set_bit(lba >> sdkp->zone_shift, seq_zones_bitmap); > + set_bit(lba >> zone_shift, seq_zones_bitmap); > next_lba = lba + get_unaligned_be64(&rec[8]); > rec += 64; > } > @@ -543,12 +541,16 @@ static sector_t sd_zbc_get_seq_zones(struct scsi_disk *sdkp, unsigned char *buf, > } > > /** > - * sd_zbc_setup_seq_zones_bitmap - Initialize the disk seq zone bitmap. 
> + * sd_zbc_setup_seq_zones_bitmap - Initialize a seq zone bitmap. > * @sdkp: target disk > + * @zone_shift: logarithm base 2 of the number of blocks in a zone > + * @nr_zones: number of zones to set up a seq zone bitmap for > * > * Allocate a zone bitmap and initialize it by identifying sequential zones. > */ > -static int sd_zbc_setup_seq_zones_bitmap(struct scsi_disk *sdkp) > +static unsigned long * > +sd_zbc_setup_seq_zones_bitmap(struct scsi_disk *sdkp, u32 zone_shift, > + u32 nr_zones) > { > struct request_queue *q = sdkp->disk->queue; > unsigned long *seq_zones_bitmap; > @@ -556,9 +558,9 @@ static int sd_zbc_setup_seq_zones_bitmap(struct scsi_disk *sdkp) > unsigned char *buf; > int ret = -ENOMEM; > > - seq_zones_bitmap = sd_zbc_alloc_zone_bitmap(sdkp); > + seq_zones_bitmap = sd_zbc_alloc_zone_bitmap(nr_zones, q->node); > if (!seq_zones_bitmap) > - return -ENOMEM; > + return ERR_PTR(-ENOMEM); > > buf = kmalloc(SD_ZBC_BUF_SIZE, GFP_KERNEL); > if (!buf) > @@ -569,7 +571,7 @@ static int sd_zbc_setup_seq_zones_bitmap(struct scsi_disk *sdkp) > if (ret) > goto out; > lba = sd_zbc_get_seq_zones(sdkp, buf, SD_ZBC_BUF_SIZE, > - seq_zones_bitmap); > + zone_shift, seq_zones_bitmap); > } > > if (lba != sdkp->capacity) { > @@ -581,12 +583,9 @@ static int sd_zbc_setup_seq_zones_bitmap(struct scsi_disk *sdkp) > kfree(buf); > if (ret) { > kfree(seq_zones_bitmap); > - return ret; > + return ERR_PTR(ret); > } > - > - q->seq_zones_bitmap = seq_zones_bitmap; > - > - return 0; > + return seq_zones_bitmap; > } > > static void sd_zbc_cleanup(struct scsi_disk *sdkp) > @@ -602,44 +601,64 @@ static void sd_zbc_cleanup(struct scsi_disk *sdkp) > q->nr_zones = 0; > } > > -static int sd_zbc_setup(struct scsi_disk *sdkp) > +static int sd_zbc_setup(struct scsi_disk *sdkp, u32 zone_blocks) > { > struct request_queue *q = sdkp->disk->queue; > + u32 zone_shift = ilog2(zone_blocks); > + u32 nr_zones; > int ret; > > - /* READ16/WRITE16 is mandatory for ZBC disks */ > - 
sdkp->device->use_16_for_rw = 1; > - sdkp->device->use_10_for_rw = 0; > - > /* chunk_sectors indicates the zone size */ > - blk_queue_chunk_sectors(sdkp->disk->queue, > - logical_to_sectors(sdkp->device, sdkp->zone_blocks)); > - sdkp->nr_zones = > - round_up(sdkp->capacity, sdkp->zone_blocks) >> sdkp->zone_shift; > + blk_queue_chunk_sectors(q, > + logical_to_sectors(sdkp->device, zone_blocks)); > + nr_zones = round_up(sdkp->capacity, zone_blocks) >> zone_shift; > > /* > * Initialize the device request queue information if the number > * of zones changed. > */ > - if (sdkp->nr_zones != q->nr_zones) { > - > - sd_zbc_cleanup(sdkp); > - > - q->nr_zones = sdkp->nr_zones; > - if (sdkp->nr_zones) { > - q->seq_zones_wlock = sd_zbc_alloc_zone_bitmap(sdkp); > - if (!q->seq_zones_wlock) { > + if (nr_zones != sdkp->nr_zones || nr_zones != q->nr_zones) { > + unsigned long *seq_zones_wlock = NULL, *seq_zones_bitmap = NULL; > + size_t zone_bitmap_size; > + > + if (nr_zones) { > + seq_zones_wlock = sd_zbc_alloc_zone_bitmap(nr_zones, > + q->node); > + if (!seq_zones_wlock) { > ret = -ENOMEM; > goto err; > } > > - ret = sd_zbc_setup_seq_zones_bitmap(sdkp); > - if (ret) { > - sd_zbc_cleanup(sdkp); > + seq_zones_bitmap = sd_zbc_setup_seq_zones_bitmap(sdkp, > + zone_shift, nr_zones); > + if (IS_ERR(seq_zones_bitmap)) { > + ret = PTR_ERR(seq_zones_bitmap); > + kfree(seq_zones_wlock); > goto err; > } > } > - > + zone_bitmap_size = BITS_TO_LONGS(nr_zones) * > + sizeof(unsigned long); > + blk_mq_freeze_queue(q); > + if (q->nr_zones != nr_zones) { > + /* READ16/WRITE16 is mandatory for ZBC disks */ > + sdkp->device->use_16_for_rw = 1; > + sdkp->device->use_10_for_rw = 0; > + > + sdkp->zone_blocks = zone_blocks; > + sdkp->zone_shift = zone_shift; > + sdkp->nr_zones = nr_zones; > + q->nr_zones = nr_zones; > + swap(q->seq_zones_wlock, seq_zones_wlock); > + swap(q->seq_zones_bitmap, seq_zones_bitmap); > + } else if (memcmp(q->seq_zones_bitmap, seq_zones_bitmap, > + zone_bitmap_size) != 0) { > + 
memcpy(q->seq_zones_bitmap, seq_zones_bitmap, > + zone_bitmap_size); > + } > + blk_mq_unfreeze_queue(q); > + kfree(seq_zones_wlock); > + kfree(seq_zones_bitmap); > } > > return 0; > @@ -651,6 +670,7 @@ static int sd_zbc_setup(struct scsi_disk *sdkp) > > int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf) > { > + int64_t zone_blocks; > int ret; > > if (!sd_is_zoned(sdkp)) > @@ -687,12 +707,16 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf) > * Check zone size: only devices with a constant zone size (except > * an eventual last runt zone) that is a power of 2 are supported. > */ > - ret = sd_zbc_check_zone_size(sdkp); > - if (ret) > + zone_blocks = sd_zbc_check_zone_size(sdkp); > + ret = -EFBIG; > + if (zone_blocks != (u32)zone_blocks) > + goto err; > + ret = zone_blocks; > + if (ret < 0) > goto err; > > /* The drive satisfies the kernel restrictions: set it up */ > - ret = sd_zbc_setup(sdkp); > + ret = sd_zbc_setup(sdkp, zone_blocks); > if (ret) > goto err; > > diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h > index 9af3e0f430bc..21e21f273a21 100644 > --- a/include/linux/blkdev.h > +++ b/include/linux/blkdev.h > @@ -605,6 +605,11 @@ struct request_queue { > * initialized by the low level device driver (e.g. scsi/sd.c). > * Stacking drivers (device mappers) may or may not initialize > * these fields. > + * > + * Reads of this information must be protected with blk_queue_enter() / > + * blk_queue_exit(). Modifying this information is only allowed while > + * no requests are being processed. See also blk_mq_freeze_queue() and > + * blk_mq_unfreeze_queue(). > */ > unsigned int nr_zones; > unsigned long *seq_zones_bitmap; > Reviewed-by: Damien Le Moal <damien.lemoal@xxxxxxx> Note: This patch will not apply as-is to stable kernels 4.10 through 4.15, since those kernels do not have zone information attached to the request queue. -- Damien Le Moal Western Digital Research