If the queue is filled with unaligned writes then the following deadlock occurs: Call Trace: <TASK> __schedule+0x8cc/0x2190 schedule+0xdd/0x2b0 blk_queue_enter+0x2ce/0x4f0 blk_mq_alloc_request+0x303/0x810 scsi_execute_cmd+0x3f4/0x7b0 sd_zbc_do_report_zones+0x19e/0x4c0 sd_zbc_report_zones+0x304/0x920 disk_zone_wplug_handle_error+0x237/0x920 disk_zone_wplugs_work+0x17e/0x430 process_one_work+0xdd0/0x1490 worker_thread+0x5eb/0x1010 kthread+0x2e5/0x3b0 ret_from_fork+0x3a/0x80 </TASK> Fix this deadlock by removing the disk->fops->report_zones() call and by deriving the write pointer information from successfully completed zoned writes. Signed-off-by: Bart Van Assche <bvanassche@xxxxxxx> --- block/blk-zoned.c | 69 +++++++++++++++++++---------------------------- block/blk.h | 4 ++- 2 files changed, 31 insertions(+), 42 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index b570b773e65f..284820b29285 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -56,6 +56,8 @@ static const char *const zone_cond_name[] = { * @zone_no: The number of the zone the plug is managing. * @wp_offset: The zone write pointer location relative to the start of the zone * as a number of 512B sectors. + * @wp_offset_compl: End offset for completed zoned writes as a number of 512 + * byte sectors. * @bio_list: The list of BIOs that are currently plugged. * @bio_work: Work struct to handle issuing of plugged BIOs * @rcu_head: RCU head to free zone write plugs with an RCU grace period. @@ -69,6 +71,7 @@ struct blk_zone_wplug { unsigned int flags; unsigned int zone_no; unsigned int wp_offset; + unsigned int wp_offset_compl; struct bio_list bio_list; struct work_struct bio_work; struct rcu_head rcu_head; @@ -531,6 +534,7 @@ static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk, zwplug->flags = 0; zwplug->zone_no = zno; zwplug->wp_offset = sector & (disk->queue->limits.chunk_sectors - 1); + zwplug->wp_offset_compl = 0; bio_list_init(&zwplug->bio_list); INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work); zwplug->disk = disk; @@ -1226,6 +1230,7 @@ void blk_zone_write_plug_bio_endio(struct bio *bio) struct gendisk *disk = bio->bi_bdev->bd_disk; struct blk_zone_wplug *zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector); + unsigned int end_sector; unsigned long flags; if (WARN_ON_ONCE(!zwplug)) @@ -1243,11 +1248,24 @@ void blk_zone_write_plug_bio_endio(struct bio *bio) bio->bi_opf |= REQ_OP_ZONE_APPEND; } - /* - * If the BIO failed, mark the plug as having an error to trigger - * recovery. - */ - if (bio->bi_status != BLK_STS_OK) { + if (bio->bi_status == BLK_STS_OK) { + switch (bio_op(bio)) { + case REQ_OP_WRITE: + case REQ_OP_ZONE_APPEND: + case REQ_OP_WRITE_ZEROES: + end_sector = bdev_offset_from_zone_start(disk->part0, + bio->bi_iter.bi_sector + bio_sectors(bio)); + if (end_sector > zwplug->wp_offset_compl) + zwplug->wp_offset_compl = end_sector; + break; + default: + break; + } + } else { + /* + * If the BIO failed, mark the plug as having an error to + * trigger recovery. + */ spin_lock_irqsave(&zwplug->lock, flags); disk_zone_wplug_set_error(disk, zwplug); spin_unlock_irqrestore(&zwplug->lock, flags); @@ -1388,30 +1406,10 @@ static unsigned int blk_zone_wp_offset(struct blk_zone *zone) } } -static int blk_zone_wplug_report_zone_cb(struct blk_zone *zone, - unsigned int idx, void *data) -{ - struct blk_zone *zonep = data; - - *zonep = *zone; - return 0; -} - static void disk_zone_wplug_handle_error(struct gendisk *disk, struct blk_zone_wplug *zwplug) { - sector_t zone_start_sector = - bdev_zone_sectors(disk->part0) * zwplug->zone_no; - unsigned int noio_flag; - struct blk_zone zone; unsigned long flags; - int ret; - - /* Get the current zone information from the device. */ - noio_flag = memalloc_noio_save(); - ret = disk->fops->report_zones(disk, zone_start_sector, 1, - blk_zone_wplug_report_zone_cb, &zone); - memalloc_noio_restore(noio_flag); spin_lock_irqsave(&zwplug->lock, flags); @@ -1425,19 +1423,8 @@ static void disk_zone_wplug_handle_error(struct gendisk *disk, zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR; - if (ret != 1) { - /* - * We failed to get the zone information, meaning that something - * is likely really wrong with the device. Abort all remaining - * plugged BIOs as otherwise we could endup waiting forever on - * plugged BIOs to complete if there is a queue freeze on-going. - */ - disk_zone_wplug_abort(zwplug); - goto unplug; - } - /* Update the zone write pointer offset. */ - zwplug->wp_offset = blk_zone_wp_offset(&zone); + zwplug->wp_offset = zwplug->wp_offset_compl; disk_zone_wplug_abort_unaligned(disk, zwplug); /* Restart BIO submission if we still have any BIO left. */ @@ -1446,7 +1433,6 @@ static void disk_zone_wplug_handle_error(struct gendisk *disk, goto unlock; } -unplug: zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; if (disk_should_remove_zone_wplug(disk, zwplug)) disk_remove_zone_wplug(disk, zwplug); @@ -1978,7 +1964,7 @@ EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); static void queue_zone_wplug_show(struct blk_zone_wplug *zwplug, struct seq_file *m) { - unsigned int zwp_wp_offset, zwp_flags; + unsigned int zwp_wp_offset, zwp_wp_offset_compl, zwp_flags; unsigned int zwp_zone_no, zwp_ref; unsigned int zwp_bio_list_size; unsigned long flags; @@ -1988,14 +1974,15 @@ static void queue_zone_wplug_show(struct blk_zone_wplug *zwplug, zwp_flags = zwplug->flags; zwp_ref = refcount_read(&zwplug->ref); zwp_wp_offset = zwplug->wp_offset; + zwp_wp_offset_compl = zwplug->wp_offset_compl; zwp_bio_list_size = bio_list_size(&zwplug->bio_list); spin_unlock_irqrestore(&zwplug->lock, flags); bool all_zwr_inserted = blk_zone_all_zwr_inserted(zwplug); - seq_printf(m, "zone_no %u flags 0x%x ref %u wp_offset %u bio_list_size %u all_zwr_inserted %d\n", + seq_printf(m, "zone_no %u flags 0x%x ref %u wp_offset %u wp_offset_compl %u bio_list_size %u all_zwr_inserted %d\n", zwp_zone_no, zwp_flags, zwp_ref, zwp_wp_offset, - zwp_bio_list_size, all_zwr_inserted); + zwp_wp_offset_compl, zwp_bio_list_size, all_zwr_inserted); } int queue_zone_wplugs_show(void *data, struct seq_file *m) diff --git a/block/blk.h b/block/blk.h index be945db6298d..88a6e258eafe 100644 --- a/block/blk.h +++ b/block/blk.h @@ -470,8 +470,10 @@ static inline void blk_zone_update_request_bio(struct request *rq, * the original BIO sector so that blk_zone_write_plug_bio_endio() can * lookup the zone write plug. */ - if (req_op(rq) == REQ_OP_ZONE_APPEND || bio_zone_write_plugging(bio)) + if (req_op(rq) == REQ_OP_ZONE_APPEND || bio_zone_write_plugging(bio)) { bio->bi_iter.bi_sector = rq->__sector; + bio->bi_iter.bi_size = rq->__data_len; + } } void blk_zone_write_plug_requeue_request(struct request *rq);