On 2021/01/29 2:23, Sergei Shtepa wrote: > The block layer interposer allows to intercept bio requests. > This allows to connect device mapper and other kernel modules > to the block device stack on the fly. > > changes: > * new BIO_INTERPOSED bio flag. > * new function __submit_bio_interposed() implements the interposers > logic. > * new function blk_mq_is_queue_frozen() allow to assert that > the queue is frozen. > * functions blk_interposer_attach() and blk_interposer_detach() > allow to attach and detach interposers. The changelog should not be part of the commit message. If you need a changelog for a single patch, add it between the commit message end "---" and the patch stats. git will ignore that part. > Signed-off-by: Sergei Shtepa <sergei.shtepa@xxxxxxxxx> > --- > block/bio.c | 2 + > block/blk-core.c | 29 ++++++++++++++ > block/blk-mq.c | 13 +++++++ > block/genhd.c | 82 +++++++++++++++++++++++++++++++++++++++ > include/linux/blk-mq.h | 1 + > include/linux/blk_types.h | 6 ++- > include/linux/genhd.h | 19 +++++++++ > 7 files changed, 150 insertions(+), 2 deletions(-) > > diff --git a/block/bio.c b/block/bio.c > index 1f2cc1fbe283..f6f135eb84b5 100644 > --- a/block/bio.c > +++ b/block/bio.c > @@ -684,6 +684,8 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) > bio_set_flag(bio, BIO_CLONED); > if (bio_flagged(bio_src, BIO_THROTTLED)) > bio_set_flag(bio, BIO_THROTTLED); > + if (bio_flagged(bio_src, BIO_INTERPOSED)) > + bio_set_flag(bio, BIO_INTERPOSED); > bio->bi_opf = bio_src->bi_opf; > bio->bi_ioprio = bio_src->bi_ioprio; > bio->bi_write_hint = bio_src->bi_write_hint; > diff --git a/block/blk-core.c b/block/blk-core.c > index 7663a9b94b80..07ec82d8fe57 100644 > --- a/block/blk-core.c > +++ b/block/blk-core.c > @@ -1032,6 +1032,32 @@ static blk_qc_t __submit_bio_noacct_mq(struct bio *bio) > return ret; > } > > +static blk_qc_t __submit_bio_interposed(struct bio *bio) > +{ > + struct bio_list bio_list[2] = { }; > + blk_qc_t ret = BLK_QC_T_NONE; > + > + current->bio_list = bio_list; > + if (likely(bio_queue_enter(bio) == 0)) { > + struct gendisk *disk = bio->bi_disk; > + > + bio_set_flag(bio, BIO_INTERPOSED); > + if (likely(blk_has_interposer(disk))) > + disk->interposer->ip_submit_bio(bio); Why do you check again blk_has_interposer() here ? That is checked already in submit_bio_noacct() and the interposer attach/detach cannot run without the queue frozen. So I do not think you need to check again. If you do, then you definitely have a race condition here. > + else /* interposer was removed */ > + bio_list_add(¤t->bio_list[0], bio); > + > + blk_queue_exit(disk->queue); > + } > + current->bio_list = NULL; > + > + /* Resubmit remaining bios */ > + while ((bio = bio_list_pop(&bio_list[0]))) > + ret = submit_bio_noacct(bio); > + > + return ret; > +} > + > /** > * submit_bio_noacct - re-submit a bio to the block device layer for I/O > * @bio: The bio describing the location in memory and on the device. > @@ -1057,6 +1083,9 @@ blk_qc_t submit_bio_noacct(struct bio *bio) > return BLK_QC_T_NONE; > } > > + if (blk_has_interposer(bio->bi_disk) && > + !bio_flagged(bio, BIO_INTERPOSED)) > + return __submit_bio_interposed(bio); OK. I *think* that this is to handle stacked devices, right ? Otherwise, this condition does not make much sense. Why not just: if (blk_has_interposer(bio->bi_disk)) return __submit_bio_interposed(bio); So at least adding some comments here may be good. > if (!bio->bi_disk->fops->submit_bio) > return __submit_bio_noacct_mq(bio); > return __submit_bio_noacct(bio); > diff --git a/block/blk-mq.c b/block/blk-mq.c > index f285a9123a8b..924ec26fae5f 100644 > --- a/block/blk-mq.c > +++ b/block/blk-mq.c > @@ -161,6 +161,19 @@ int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, > } > EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout); > > + > +bool blk_mq_is_queue_frozen(struct request_queue *q) > +{ > + bool ret; > + > + mutex_lock(&q->mq_freeze_lock); > + ret = percpu_ref_is_dying(&q->q_usage_counter) && percpu_ref_is_zero(&q->q_usage_counter); > + mutex_unlock(&q->mq_freeze_lock); > + > + return ret; > +} > +EXPORT_SYMBOL_GPL(blk_mq_is_queue_frozen); Maybe move this change to its own patch preceding this one ? > + > /* > * Guarantee no request is in use, so we can change any data structure of > * the queue afterward. > diff --git a/block/genhd.c b/block/genhd.c > index 419548e92d82..d3459582f56c 100644 > --- a/block/genhd.c > +++ b/block/genhd.c > @@ -30,6 +30,7 @@ > static struct kobject *block_depr; > > DECLARE_RWSEM(bdev_lookup_sem); > +DEFINE_MUTEX(bdev_interposer_mutex); > > /* for extended dynamic devt allocation, currently only one major is used */ > #define NR_EXT_DEVT (1 << MINORBITS) > @@ -2148,3 +2149,84 @@ static void disk_release_events(struct gendisk *disk) > WARN_ON_ONCE(disk->ev && disk->ev->block != 1); > kfree(disk->ev); > } > + > +/** > + * blk_interposer_attach - Attach interposer to disk > + * @disk: target disk > + * @interposer: block device interposer > + * @ip_submit_bio: hook for submit_bio() > + * > + * Returns: > + * -EINVAL if @interposer is NULL. > + * -EPERM if queue is not frozen. > + * -EBUSY if the block device already has @interposer. > + * -EALREADY if the block device already has @interposer with same callback. > + * > + * Disk must be frozen by blk_mq_freeze_queue(). > + */ > +int blk_interposer_attach(struct gendisk *disk, struct blk_interposer *interposer, > + const ip_submit_bio_t ip_submit_bio) > +{ > + int ret = 0; > + > + if (!interposer) > + return -EINVAL; Is this really necessary ? If some user of this function has interposer == NULL, that caller needs debugging... > + > + if (!blk_mq_is_queue_frozen(disk->queue)) > + return -EPERM; Why not do the queue freeze here ? > + > + mutex_lock(&bdev_interposer_mutex); > + if (blk_has_interposer(disk)) { > + if (disk->interposer->ip_submit_bio == ip_submit_bio) > + ret = -EALREADY; > + else > + ret = -EBUSY; > + goto out; > + } > + > + interposer->ip_submit_bio = ip_submit_bio; > + interposer->disk = disk; > + > + disk->interposer = interposer; > +out: > + mutex_unlock(&bdev_interposer_mutex); > + > + return ret; > +} > +EXPORT_SYMBOL_GPL(blk_interposer_attach); > + > +/** > + * blk_interposer_detach - Detach interposer from disk > + * @interposer: block device interposer > + * @ip_submit_bio: hook for submit_bio() > + * > + * Disk must be frozen by blk_mq_freeze_queue(). > + */ > +void blk_interposer_detach(struct blk_interposer *interposer, > + const ip_submit_bio_t ip_submit_bio) > +{ The interface is weird. Why not passing the gendisk ? > + struct gendisk *disk; > + > + if (WARN_ON(!interposer)) > + return; Same comment as above. This should not be necessary. > + > + mutex_lock(&bdev_interposer_mutex); > + > + /* Check if the interposer is still active. */ > + disk = interposer->disk; > + if (WARN_ON(!disk)) > + goto out; > + > + if (WARN_ON(!blk_mq_is_queue_frozen(disk->queue))) > + goto out; > + > + /* Check if it is really our interposer. */ > + if (WARN_ON(disk->interposer->ip_submit_bio != ip_submit_bio)) > + goto out; > + > + disk->interposer = NULL; > + interposer->disk = NULL; > +out: > + mutex_unlock(&bdev_interposer_mutex); > +} > +EXPORT_SYMBOL_GPL(blk_interposer_detach); > diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h > index d705b174d346..9d1e8c4e922e 100644 > --- a/include/linux/blk-mq.h > +++ b/include/linux/blk-mq.h > @@ -525,6 +525,7 @@ void blk_freeze_queue_start(struct request_queue *q); > void blk_mq_freeze_queue_wait(struct request_queue *q); > int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, > unsigned long timeout); > +bool blk_mq_is_queue_frozen(struct request_queue *q); > > int blk_mq_map_queues(struct blk_mq_queue_map *qmap); > void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); > diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h > index 866f74261b3b..6c1351d7b73f 100644 > --- a/include/linux/blk_types.h > +++ b/include/linux/blk_types.h > @@ -227,7 +227,7 @@ struct bio { > * top bits REQ_OP. Use > * accessors. > */ > - unsigned short bi_flags; /* status, etc and bvec pool number */ > + unsigned int bi_flags; /* status, etc and bvec pool number */ > unsigned short bi_ioprio; > unsigned short bi_write_hint; > blk_status_t bi_status; > @@ -304,6 +304,8 @@ enum { > * of this bio. */ > BIO_CGROUP_ACCT, /* has been accounted to a cgroup */ > BIO_TRACKED, /* set if bio goes through the rq_qos path */ > + BIO_INTERPOSED, /* bio has been interposed and can be moved to > + * a different disk */ > BIO_FLAG_LAST > }; > > @@ -322,7 +324,7 @@ enum { > * freed. > */ > #define BVEC_POOL_BITS (3) > -#define BVEC_POOL_OFFSET (16 - BVEC_POOL_BITS) > +#define BVEC_POOL_OFFSET (32 - BVEC_POOL_BITS) > #define BVEC_POOL_IDX(bio) ((bio)->bi_flags >> BVEC_POOL_OFFSET) > #if (1<< BVEC_POOL_BITS) < (BVEC_POOL_NR+1) > # error "BVEC_POOL_BITS is too small" > diff --git a/include/linux/genhd.h b/include/linux/genhd.h > index 809aaa32d53c..8094af3a3db9 100644 > --- a/include/linux/genhd.h > +++ b/include/linux/genhd.h > @@ -134,6 +134,14 @@ struct blk_integrity { > unsigned char tag_size; > }; > > +struct blk_interposer; This is not needed. > +typedef void (*ip_submit_bio_t) (struct bio *bio); This hides the definition of the submit callback for no good reasons that I can see. Since the callback has a simple interface, I would prefer this to be dropped. > + > +struct blk_interposer { > + ip_submit_bio_t ip_submit_bio; > + struct gendisk *disk; If you fix the interface of the detach function, you should not need this field. > +}; > + > struct gendisk { > /* major, first_minor and minors are input parameters only, > * don't use directly. Use disk_devt() and disk_max_parts(). > @@ -158,6 +166,7 @@ struct gendisk { > > const struct block_device_operations *fops; > struct request_queue *queue; > + struct blk_interposer *interposer; > void *private_data; > > int flags; > @@ -346,4 +355,14 @@ static inline void printk_all_partitions(void) > } > #endif /* CONFIG_BLOCK */ > > +/* > + * block layer interposer > + */ > +#define blk_has_interposer(d) ((d)->interposer != NULL) Please make this an inline function. > + > +int blk_interposer_attach(struct gendisk *disk, struct blk_interposer *interposer, > + const ip_submit_bio_t ip_submit_bio); > +void blk_interposer_detach(struct blk_interposer *interposer, > + const ip_submit_bio_t ip_submit_bio); > + > #endif /* _LINUX_GENHD_H */ > -- Damien Le Moal Western Digital Research -- dm-devel mailing list dm-devel@xxxxxxxxxx https://www.redhat.com/mailman/listinfo/dm-devel