On 2020/10/21 18:04, Sergei Shtepa wrote: > Signed-off-by: Sergei Shtepa <sergei.shtepa@xxxxxxxxx> > --- > block/Kconfig | 11 ++ > block/Makefile | 1 + > block/blk-core.c | 52 +++++-- > block/blk-filter-internal.h | 29 ++++ > block/blk-filter.c | 286 ++++++++++++++++++++++++++++++++++++ > block/partitions/core.c | 14 +- > fs/block_dev.c | 6 +- > fs/direct-io.c | 2 +- > fs/iomap/direct-io.c | 2 +- > include/linux/bio.h | 4 +- > include/linux/blk-filter.h | 76 ++++++++++ > include/linux/genhd.h | 8 +- > kernel/power/swap.c | 2 +- > mm/page_io.c | 4 +- > 14 files changed, 471 insertions(+), 26 deletions(-) > create mode 100644 block/blk-filter-internal.h > create mode 100644 block/blk-filter.c > create mode 100644 include/linux/blk-filter.h > > diff --git a/block/Kconfig b/block/Kconfig > index bbad5e8bbffe..a308801b4376 100644 > --- a/block/Kconfig > +++ b/block/Kconfig > @@ -204,6 +204,17 @@ config BLK_INLINE_ENCRYPTION_FALLBACK > by falling back to the kernel crypto API when inline > encryption hardware is not present. > > +config BLK_FILTER > + bool "Enable support for block layer filters" > + default y > + depends on MODULES > + help > + Enabling this lets third-party kernel modules intercept > + bio requests for any block device. This allows them to implement > + changed block tracking and snapshots without any reconfiguration of > + the existing setup. For example, this option allows snapshotting of > + a block device without adding it to LVM. 
> + > menu "Partition Types" > > source "block/partitions/Kconfig" > diff --git a/block/Makefile b/block/Makefile > index 8d841f5f986f..b8ee50b8e031 100644 > --- a/block/Makefile > +++ b/block/Makefile > @@ -38,3 +38,4 @@ obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o > obj-$(CONFIG_BLK_PM) += blk-pm.o > obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += keyslot-manager.o blk-crypto.o > obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) += blk-crypto-fallback.o > +obj-$(CONFIG_BLK_FILTER) += blk-filter.o > diff --git a/block/blk-core.c b/block/blk-core.c > index 10c08ac50697..cc06402af695 100644 > --- a/block/blk-core.c > +++ b/block/blk-core.c > @@ -1216,23 +1216,20 @@ blk_qc_t submit_bio_noacct(struct bio *bio) > EXPORT_SYMBOL(submit_bio_noacct); Probably best to have this as its own patch last in the series. > > /** > - * submit_bio - submit a bio to the block device layer for I/O > - * @bio: The &struct bio which describes the I/O > - * > - * submit_bio() is used to submit I/O requests to block devices. It is passed a > - * fully set up &struct bio that describes the I/O that needs to be done. The > - * bio will be send to the device described by the bi_disk and bi_partno fields. > + * submit_bio_direct - submit a bio to the block device layer for I/O > + * bypass filter. > + * @bio: The bio describing the location in memory and on the device. > * > - * The success/failure status of the request, along with notification of > - * completion, is delivered asynchronously through the ->bi_end_io() callback > - * in @bio. The bio must NOT be touched by thecaller until ->bi_end_io() has > - * been called. > + * Description: > + * This is a version of submit_bio() that shall only be used for I/O > + * that cannot be intercepted by block layer filters. > + * All file systems and other upper level users of the block layer > + * should use submit_bio() instead. > + * Use this function to access the swap partition and directly access > + * the block device file. 
> */ > -blk_qc_t submit_bio(struct bio *bio) > +blk_qc_t submit_bio_direct(struct bio *bio) > { > - if (blkcg_punt_bio_submit(bio)) > - return BLK_QC_T_NONE; > - > /* > * If it's a regular read/write or a barrier with data attached, > * go through the normal accounting stuff before submission. > @@ -1282,8 +1279,35 @@ blk_qc_t submit_bio(struct bio *bio) > > return submit_bio_noacct(bio); > } > +EXPORT_SYMBOL(submit_bio_direct); EXPORT_SYMBOL_GPL > + > +/** > + * submit_bio - submit a bio to the block device layer for I/O > + * @bio: The &struct bio which describes the I/O > + * > + * submit_bio() is used to submit I/O requests to block devices. It is passed a > + * fully set up &struct bio that describes the I/O that needs to be done. The > + * bio will be send to the device described by the bi_disk and bi_partno fields. > + * > + * The success/failure status of the request, along with notification of > + * completion, is delivered asynchronously through the ->bi_end_io() callback > + * in @bio. The bio must NOT be touched by thecaller until ->bi_end_io() has > + * been called. > + */ > +void submit_bio(struct bio *bio) > +{ > + if (blkcg_punt_bio_submit(bio)) > + return; > + > +#ifdef CONFIG_BLK_FILTER > + blk_filter_submit_bio(bio); > +#else > + submit_bio_direct(bio); > +#endif if (IS_ENABLED(CONFIG_BLK_FILTER)) blk_filter_submit_bio(bio); else submit_bio_direct(bio); is much cleaner... > +} > EXPORT_SYMBOL(submit_bio); > > + > /** > * blk_cloned_rq_check_limits - Helper function to check a cloned request > * for the new queue limits The remainder should probably be a different patch before the above change. 
> diff --git a/block/blk-filter-internal.h b/block/blk-filter-internal.h > new file mode 100644 > index 000000000000..d456a09f50db > --- /dev/null > +++ b/block/blk-filter-internal.h > @@ -0,0 +1,29 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > + > +/* > + * > + * Block device filters internal declarations > + */ > + > +#ifndef BLK_FILTER_INTERNAL_H > +#define BLK_FILTER_INTERNAL_H > + > +#ifdef CONFIG_BLK_FILTER > +#include <linux/blk-filter.h> > + > +void blk_filter_part_add(struct hd_struct *part, dev_t devt); > + > +void blk_filter_part_del(struct hd_struct *part); > + > +#else /* CONFIG_BLK_FILTER */ > + > + double blank line > +static inline void blk_filter_part_add(struct hd_struct *part, dev_t devt) > +{ }; > + > +static inline void blk_filter_part_del(struct hd_struct *part) > +{ }; > + > +#endif /* CONFIG_BLK_FILTER */ > + > +#endif > diff --git a/block/blk-filter.c b/block/blk-filter.c > new file mode 100644 > index 000000000000..f6de16c45a16 > --- /dev/null > +++ b/block/blk-filter.c > @@ -0,0 +1,286 @@ > +// SPDX-License-Identifier: GPL-2.0 > + > +#include <linux/genhd.h> > +#include <linux/bio.h> > +#include <linux/blkdev.h> > +#include "blk-filter-internal.h" > +#include <linux/rwsem.h> > + > + Again here > +LIST_HEAD(filters); > +DECLARE_RWSEM(filters_lock); > + > +static void blk_filter_release(struct kref *kref) > +{ > + struct blk_filter *flt = container_of(kref, struct blk_filter, kref); > + > + kfree(flt); > +} > + > +static inline void blk_filter_get(struct blk_filter *flt) > +{ > + kref_get(&flt->kref); > +} > + > +static inline void blk_filter_put(struct blk_filter *flt) > +{ > + kref_put(&flt->kref, blk_filter_release); > +} > + > + > +/** > + * blk_filter_part_add() - Notify filters when a new partition is added. > + * @part: The partition for new block device. > + * @devt: Device id for new block device. 
> + * > + * Description: > + * When the block device is appears in the system, call the filter > + * callback to notify that the block device appears. > + */ > +void blk_filter_part_add(struct hd_struct *part, dev_t devt) > +{ > + down_read(&filters_lock); > + if (!list_empty(&filters)) { > + struct list_head *_list_head; > + > + list_for_each(_list_head, &filters) { > + void *filter_data; > + bool attached = false; > + struct blk_filter *flt; > + > + flt = list_entry(_list_head, struct blk_filter, link); > + > + attached = flt->ops->part_add(devt, &filter_data); > + if (attached) { > + blk_filter_get(flt); > + part->filter = flt; > + part->filter_data = filter_data; > + break; > + } > + } > + } > + up_read(&filters_lock); > +} > + > +/** > + * blk_filter_part_del() - Notify filters when the partition is deleted. > + * @part: The partition of block device. > + * > + * Description: > + * When the block device is destroying and the partition is releasing, > + * call the filter callback to notify that the block device will be > + * deleted. > + */ > +void blk_filter_part_del(struct hd_struct *part) > +{ > + struct blk_filter *flt = part->filter; > + > + if (!flt) > + return; > + > + flt->ops->part_del(part->filter_data); > + > + part->filter_data = NULL; > + part->filter = NULL; > + blk_filter_put(flt); > +} > + > + > +/** > + * blk_filter_submit_bio() - Send new bio to filters for processing. > + * @bio: The new bio for block I/O layer. > + * > + * Description: > + * This function is an implementation of block layer filter > + * interception. If the filter is attached to this block device, > + * then bio will be redirected to the filter kernel module. 
> + */ > +void blk_filter_submit_bio(struct bio *bio) > +{ > + bool intercepted = false; > + struct hd_struct *part; > + > + bio_get(bio); > + > + part = disk_get_part(bio->bi_disk, bio->bi_partno); > + if (unlikely(!part)) { > + bio->bi_status = BLK_STS_IOERR; > + bio_endio(bio); > + > + bio_put(bio); > + return; > + } > + > + down_read(&part->filter_rw_lockup); > + > + if (part->filter) > + intercepted = part->filter->ops->filter_bio(bio, part->filter_data); > + > + up_read(&part->filter_rw_lockup); > + > + if (!intercepted) > + submit_bio_direct(bio); > + > + disk_put_part(part); > + > + bio_put(bio); > +} > +EXPORT_SYMBOL(blk_filter_submit_bio); > + > +/** > + * blk_filter_register() - Register block layer filter. > + * @ops: New filter callbacks. > + * > + * Return: > + * Filter ID, a pointer to the service structure of the filter. > + * > + * Description: > + * Create new filter structure. > + * Use blk_filter_attach to attach devices to filter. > + */ > +void *blk_filter_register(struct blk_filter_ops *ops) > +{ > + struct blk_filter *flt; > + > + flt = kzalloc(sizeof(struct blk_filter), GFP_KERNEL); > + if (!flt) > + return NULL; > + > + kref_init(&flt->kref); > + flt->ops = ops; > + > + down_write(&filters_lock); > + list_add_tail(&flt->link, &filters); > + up_write(&filters_lock); > + > + return flt; > +} > +EXPORT_SYMBOL(blk_filter_register); > + > +/** > + * blk_filter_unregister() - Unregister block layer filter. > + * @filter: filter identifier. > + * > + * Description: > + * Before call blk_filter_unregister() and unload filter module all > + * partitions MUST be detached. Otherwise, the system will have a > + * filter with non-existent interception functions. 
> + */ > +void blk_filter_unregister(void *filter) > +{ > + struct blk_filter *flt = filter; > + > + down_write(&filters_lock); > + list_del(&flt->link); > + up_write(&filters_lock); > + > + blk_filter_put(flt); > +} > +EXPORT_SYMBOL(blk_filter_unregister); > + > +/** > + * blk_filter_attach() - Attach block layer filter. > + * @devt: The block device identification number. > + * @filter: Filter identifier. > + * @filter_data: Specific filters data for this device. > + * > + * Return: > + * Return code. > + * -ENODEV - cannot find this device, it is OK if the device does not exist yet. > + * -EALREADY - this device is already attached to this filter. > + * -EBUSY - this device is already attached to the another filter. > + * > + * Description: > + * Attach the device to the block layer filter. > + * Only one filter can be attached to a single device. > + */ > +int blk_filter_attach(dev_t devt, void *filter, void *filter_data) > +{ > + int ret = 0; > + struct blk_filter *flt = filter; > + struct block_device *blk_dev; > + > + > + blk_dev = bdget(devt); > + if (!blk_dev) > + return -ENODEV; > + > + blk_filter_freeze(blk_dev); > + > + if (blk_dev->bd_part->filter) { > + if (blk_dev->bd_part->filter == flt) > + ret = -EALREADY; > + else > + ret = -EBUSY; > + } else { > + blk_filter_get(flt); > + blk_dev->bd_part->filter = flt; > + blk_dev->bd_part->filter_data = filter_data; > + } > + > + blk_filter_thaw(blk_dev); > + > + bdput(blk_dev); > + > + return ret; > +} > +EXPORT_SYMBOL(blk_filter_attach); > + > +/** > + * blk_filter_detach() - Detach block layer filter. > + * @devt: The block device identification number. > + * > + * Description: > + * Detach the device from the block layer filter. > + * Do not forget detach all devices before calling the > + * blk_filter_unregister() function and unload the module! 
> + */ > +void blk_filter_detach(dev_t devt) > +{ > + struct blk_filter *flt; > + struct block_device *blk_dev; > + > + blk_dev = bdget(devt); > + if (!blk_dev) > + return; > + > + blk_filter_freeze(blk_dev); > + > + flt = blk_dev->bd_part->filter; > + if (flt) { > + blk_dev->bd_part->filter_data = NULL; > + blk_dev->bd_part->filter = NULL; > + blk_filter_put(flt); > + } > + > + blk_filter_thaw(blk_dev); > + > + bdput(blk_dev); > +} > +EXPORT_SYMBOL(blk_filter_detach); All the EXPORT_SYMBOL should probably be EXPORT_SYMBOL_GPL. > + > +/** > + * blk_filter_freeze() - Lock bio submitting. > + * @bdev: The block device pointer. > + * > + * Description: > + * Stop bio processing. > + */ > +void blk_filter_freeze(struct block_device *bdev) > +{ > + down_write(&bdev->bd_part->filter_rw_lockup); > +} > +EXPORT_SYMBOL(blk_filter_freeze); > + > +/** > + * blk_filter_thaw() - Unlock bio submitting. > + * @bdev: The block device pointer. > + * > + * Description: > + * Resume bio processing. > + */ > +void blk_filter_thaw(struct block_device *bdev) > +{ > + up_write(&bdev->bd_part->filter_rw_lockup); > +} > +EXPORT_SYMBOL(blk_filter_thaw); > diff --git a/block/partitions/core.c b/block/partitions/core.c > index 722406b841df..6b845e98b9a1 100644 > --- a/block/partitions/core.c > +++ b/block/partitions/core.c > @@ -11,6 +11,7 @@ > #include <linux/blktrace_api.h> > #include <linux/raid/detect.h> > #include "check.h" > +#include "../blk-filter-internal.h" > > static int (*check_part[])(struct parsed_partitions *) = { > /* > @@ -320,9 +321,11 @@ int hd_ref_init(struct hd_struct *part) > */ > void delete_partition(struct gendisk *disk, struct hd_struct *part) > { > - struct disk_part_tbl *ptbl = > - rcu_dereference_protected(disk->part_tbl, 1); > + struct disk_part_tbl *ptbl; > + > + blk_filter_part_del(part); > > + ptbl = rcu_dereference_protected(disk->part_tbl, 1); > /* > * ->part_tbl is referenced in this part's release handler, so > * we have to hold the disk device > @@ -412,6 
+415,9 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, > p->nr_sects = len; > p->partno = partno; > p->policy = get_disk_ro(disk); > +#ifdef CONFIG_BLK_FILTER > + init_rwsem(&p->filter_rw_lockup); > +#endif > > if (info) { > struct partition_meta_info *pinfo; > @@ -469,6 +475,9 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, > /* everything is up and running, commence */ > rcu_assign_pointer(ptbl->part[partno], p); > > + /*inform filter about a new partition*/ > + blk_filter_part_add(p, devt); > + > /* suppress uevent if the disk suppresses it */ > if (!dev_get_uevent_suppress(ddev)) > kobject_uevent(&pdev->kobj, KOBJ_ADD); > @@ -552,6 +561,7 @@ int bdev_del_partition(struct block_device *bdev, int partno) > goto out_unlock; > > sync_blockdev(bdevp); > + > invalidate_bdev(bdevp); > > delete_partition(bdev->bd_disk, part); > diff --git a/fs/block_dev.c b/fs/block_dev.c > index 8ae833e00443..431eae17fd8f 100644 > --- a/fs/block_dev.c > +++ b/fs/block_dev.c > @@ -237,7 +237,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, > if (iocb->ki_flags & IOCB_HIPRI) > bio_set_polled(&bio, iocb); > > - qc = submit_bio(&bio); > + qc = submit_bio_direct(&bio); > for (;;) { > set_current_state(TASK_UNINTERRUPTIBLE); > if (!READ_ONCE(bio.bi_private)) > @@ -400,7 +400,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) > polled = true; > } > > - qc = submit_bio(bio); > + qc = submit_bio_direct(bio); > > if (polled) > WRITE_ONCE(iocb->ki_cookie, qc); > @@ -421,7 +421,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) > atomic_inc(&dio->ref); > } > > - submit_bio(bio); > + submit_bio_direct(bio); > bio = bio_alloc(GFP_KERNEL, nr_pages); > } > > diff --git a/fs/direct-io.c b/fs/direct-io.c > index 183299892465..d9bb1b6f6814 100644 > --- a/fs/direct-io.c > +++ b/fs/direct-io.c > @@ -459,7 +459,7 @@ static inline void dio_bio_submit(struct dio *dio, 
struct dio_submit *sdio) > sdio->submit_io(bio, dio->inode, sdio->logical_offset_in_bio); > dio->bio_cookie = BLK_QC_T_NONE; > } else > - dio->bio_cookie = submit_bio(bio); > + dio->bio_cookie = submit_bio_direct(bio); All these changes are unnecessary if you reverse things: submit_bio() is kept as the direct version (as today) and you use a "submit_bio_filtered()" where needed. > > sdio->bio = NULL; > sdio->boundary = 0; > diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c > index c1aafb2ab990..e05f20ce8b5f 100644 > --- a/fs/iomap/direct-io.c > +++ b/fs/iomap/direct-io.c > @@ -73,7 +73,7 @@ static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap, > file_inode(dio->iocb->ki_filp), > iomap, bio, pos); > else > - dio->submit.cookie = submit_bio(bio); > + dio->submit.cookie = submit_bio_direct(bio); > } > > static ssize_t iomap_dio_complete(struct iomap_dio *dio) > diff --git a/include/linux/bio.h b/include/linux/bio.h > index c6d765382926..5b0a32697207 100644 > --- a/include/linux/bio.h > +++ b/include/linux/bio.h > @@ -10,6 +10,7 @@ > #include <linux/ioprio.h> > /* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */ > #include <linux/blk_types.h> > +#include <linux/blk-filter.h> > > #define BIO_DEBUG > > @@ -411,7 +412,8 @@ static inline struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) > return bio_alloc_bioset(gfp_mask, nr_iovecs, NULL); > } > > -extern blk_qc_t submit_bio(struct bio *); > +extern blk_qc_t submit_bio_direct(struct bio *bio); > +extern void submit_bio(struct bio *bio); > > extern void bio_endio(struct bio *); > > diff --git a/include/linux/blk-filter.h b/include/linux/blk-filter.h > new file mode 100644 > index 000000000000..f3e79e5b4586 > --- /dev/null > +++ b/include/linux/blk-filter.h > @@ -0,0 +1,76 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > + > +/* > + * API declarations for kernel modules utilizing block device filters > + */ > + > +#ifndef BLK_FILTER_H > +#define BLK_FILTER_H > + > 
+#ifdef CONFIG_BLK_FILTER > +#include <linux/kref.h> > + > +struct blk_filter_ops { > + /* > + * Intercept bio callback. > + * > + * Returns true if the request was intercepted and placed in the > + * queue for processing. Otherwise submit_bio_direct() calling > + * needed. > + */ > + bool (*filter_bio)(struct bio *bio, void *filter_data); > + > + /* > + * Callback to a request to add block device to the filter. > + * > + * Returns true if the block device will be filtered. > + * p_filter_data gets a pointer to data that is unique to > + * this device. > + */ > + bool (*part_add)(dev_t devt, void **p_filter_data); > + > + /* > + * Callback to remove block device from the filter. > + */ > + void (*part_del)(void *filter_data); > +}; > + > +struct blk_filter { > + struct list_head link; > + struct kref kref; > + struct blk_filter_ops *ops; > +}; > + > +/* > + * Register/unregister device to filter > + */ > +void *blk_filter_register(struct blk_filter_ops *ops); > + > +void blk_filter_unregister(void *filter); > + > +/* > + * Attach/detach device to filter > + */ > +int blk_filter_attach(dev_t devt, void *filter, void *filter_data); > + > +void blk_filter_detach(dev_t devt); > + > +/* > + * For a consistent state of the file system use the freeze_bdev/thaw_bdav. > + * But in addition, to ensure that the filter is not in the state of > + * intercepting the next BIO, you need to call black_filter_freeze/blk_filter_thaw. > + * This is especially actual if there is no file system on the disk. 
> + */ > + > +void blk_filter_freeze(struct block_device *bdev); > + > +void blk_filter_thaw(struct block_device *bdev); > + > +/* > + * Filters intercept function > + */ > +void blk_filter_submit_bio(struct bio *bio); > + > +#endif /* CONFIG_BLK_FILTER */ > + > +#endif > diff --git a/include/linux/genhd.h b/include/linux/genhd.h > index 4ab853461dff..514fab6b947e 100644 > --- a/include/linux/genhd.h > +++ b/include/linux/genhd.h > @@ -4,7 +4,7 @@ > > /* > * genhd.h Copyright (C) 1992 Drew Eckhardt > - * Generic hard disk header file by > + * Generic hard disk header file by > * Drew Eckhardt > * > * <drew@xxxxxxxxxxxx> > @@ -75,6 +75,12 @@ struct hd_struct { > int make_it_fail; > #endif > struct rcu_work rcu_work; > + > +#ifdef CONFIG_BLK_FILTER > + struct rw_semaphore filter_rw_lockup; /* for freezing block device*/ > + struct blk_filter *filter; /* block layer filter*/ > + void *filter_data; /*specific for each block device filters data*/ > +#endif > }; > > /** > diff --git a/kernel/power/swap.c b/kernel/power/swap.c > index 01e2858b5fe3..5287346b87a1 100644 > --- a/kernel/power/swap.c > +++ b/kernel/power/swap.c > @@ -283,7 +283,7 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr, > bio->bi_end_io = hib_end_io; > bio->bi_private = hb; > atomic_inc(&hb->count); > - submit_bio(bio); > + submit_bio_direct(bio); > } else { > error = submit_bio_wait(bio); > bio_put(bio); > diff --git a/mm/page_io.c b/mm/page_io.c > index e485a6e8a6cd..4540426400b3 100644 > --- a/mm/page_io.c > +++ b/mm/page_io.c > @@ -362,7 +362,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, > count_swpout_vm_event(page); > set_page_writeback(page); > unlock_page(page); > - submit_bio(bio); > + submit_bio_direct(bio); > out: > return ret; > } > @@ -434,7 +434,7 @@ int swap_readpage(struct page *page, bool synchronous) > } > count_vm_event(PSWPIN); > bio_get(bio); > - qc = submit_bio(bio); > + qc = submit_bio_direct(bio); > while (synchronous) 
{ > set_current_state(TASK_UNINTERRUPTIBLE); > if (!READ_ONCE(bio->bi_private)) > Separate into multiple patches: one that introduces the filter functions/ops code and another that changes the block layer where needed. -- Damien Le Moal Western Digital Research