A closer look at ioc_qos_write shows that correcting the lock order is non-trivial because q->rq_qos_mutex is acquired in blkg_conf_open_bdev and released in blkg_conf_exit. The function blkg_conf_open_bdev is responsible for parsing user input and finding the corresponding block device (bdev) from the user provided major:minor number. Since we do not know the bdev until blkg_conf_open_bdev completes, we cannot simply move q->elevator_lock acquisition before blkg_conf_open_ bdev. So to address this, we intoduce new helpers blkg_conf_open_bdev_ frozen and blkg_conf_exit_frozen which are just wrappers around blkg_ conf_open_bdev and blkg_conf_exit respectively. The helper blkg_conf_ open_bdev_frozen is similar to blkg_conf_open_bdev, but additionally freezes the queue, acquires q->elevator_lock and ensures the correct locking order is followed between q->elevator_lock and q->rq_qos_mutex. Similarly another helper blkg_conf_exit_frozen in addition to unfreezing the queue ensures that we release the locks in correct order. By using these helpers, now we maintain the same locking order in all code paths where we update blk-wbt parameters. Fixes: 245618f8e45f ("block: protect wbt_lat_usec using q->elevator_lock") Reported-by: kernel test robot <oliver.sang@xxxxxxxxx> Closes: https://lore.kernel.org/oe-lkp/202503171650.cc082b66-lkp@xxxxxxxxx Signed-off-by: Nilay Shroff <nilay@xxxxxxxxxxxxx> --- block/blk-cgroup.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++ block/blk-cgroup.h | 2 ++ block/blk-iocost.c | 18 +++++----------- 3 files changed, 58 insertions(+), 13 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 9ed93d91d754..31d40879c346 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -815,6 +815,41 @@ int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx) ctx->bdev = bdev; return 0; } +/* + * Similar to blkg_conf_open_bdev, but additionally freezes the queue, + * acquires q->elevator_lock, and ensures the correct locking order + * between q->elevator_lock and q->rq_qos_mutex. + * + * This function returns negative error on failure. On success it returns + * memflags which must be saved and later passed to blkg_conf_exit_frozen + * for restoring the memalloc scope. + */ +unsigned long __must_check blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx) +{ + int ret; + unsigned long memflags; + + if (ctx->bdev) + return -EINVAL; + + ret = blkg_conf_open_bdev(ctx); + if (ret < 0) + return ret; + /* + * At this point, we havenâ??t started protecting anything related to QoS, + * so we release q->rq_qos_mutex here, which was first acquired in blkg_ + * conf_open_bdev. Later, we re-acquire q->rq_qos_mutex after freezing + * the queue and acquiring q->elevator_lock to maintain the correct + * locking order. + */ + mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex); + + memflags = blk_mq_freeze_queue(ctx->bdev->bd_queue); + mutex_lock(&ctx->bdev->bd_queue->elevator_lock); + mutex_lock(&ctx->bdev->bd_queue->rq_qos_mutex); + + return memflags; +} /** * blkg_conf_prep - parse and prepare for per-blkg config update @@ -971,6 +1006,22 @@ void blkg_conf_exit(struct blkg_conf_ctx *ctx) } EXPORT_SYMBOL_GPL(blkg_conf_exit); +/* + * Similar to blkg_conf_exit, but also unfreezes the queue and releases + * q->elevator_lock. Should be used when blkg_conf_open_bdev_frozen + * is used to open the bdev. + */ +void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags) +{ + if (ctx->bdev) { + struct request_queue *q = ctx->bdev->bd_queue; + + blkg_conf_exit(ctx); + mutex_unlock(&q->elevator_lock); + blk_mq_unfreeze_queue(q, memflags); + } +} + static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src) { int i; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 2c4663bd993a..81868ad86330 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -219,9 +219,11 @@ struct blkg_conf_ctx { void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input); int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx); +unsigned long blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx); int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, struct blkg_conf_ctx *ctx); void blkg_conf_exit(struct blkg_conf_ctx *ctx); +void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags); /** * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 56e6fb51316d..3724b0308cd8 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -3223,13 +3223,13 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, u32 qos[NR_QOS_PARAMS]; bool enable, user; char *body, *p; - unsigned int memflags; + unsigned long memflags; int ret; blkg_conf_init(&ctx, input); - ret = blkg_conf_open_bdev(&ctx); - if (ret) + memflags = blkg_conf_open_bdev_frozen(&ctx); + if (IS_ERR_VALUE(memflags)) goto err; body = ctx.body; @@ -3247,8 +3247,6 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, ioc = q_to_ioc(disk->queue); } - memflags = blk_mq_freeze_queue(disk->queue); - mutex_lock(&disk->queue->elevator_lock); blk_mq_quiesce_queue(disk->queue); spin_lock_irq(&ioc->lock); @@ -3348,21 +3346,15 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, wbt_enable_default(disk); blk_mq_unquiesce_queue(disk->queue); - mutex_unlock(&disk->queue->elevator_lock); - blk_mq_unfreeze_queue(disk->queue, memflags); - blkg_conf_exit(&ctx); + blkg_conf_exit_frozen(&ctx, memflags); return nbytes; einval: spin_unlock_irq(&ioc->lock); - blk_mq_unquiesce_queue(disk->queue); - mutex_unlock(&disk->queue->elevator_lock); - blk_mq_unfreeze_queue(disk->queue, memflags); - ret = -EINVAL; err: - blkg_conf_exit(&ctx); + blkg_conf_exit_frozen(&ctx, memflags); return ret; } -- 2.47.1