To easily track filesystem from a pmem device, we introduce a holder for dax_device structure, and also its operation. This holder is used to remember who is using this dax_device: - When it is the backend of a filesystem, the holder will be the instance of this filesystem. - When this pmem device is one of the targets in a mapped device, the holder will be this mapped device. In this case, the mapped device has its own dax_device and it will follow the first rule. So that we can finally track to the filesystem we needed. The holder and holder_ops will be set when filesystem is being mounted, or an target device is being activated. Signed-off-by: Shiyang Ruan <ruansy.fnst@xxxxxxxxxxx> Reviewed-by: Christoph Hellwig <hch@xxxxxx> Reviewed-by: Dan Williams <dan.j.wiliams@xxxxxxxxx> Reviewed-by: Darrick J. Wong <djwong@xxxxxxxxxx> --- drivers/dax/super.c | 67 ++++++++++++++++++++++++++++++++++++++++++++- drivers/md/dm.c | 2 +- fs/erofs/super.c | 10 ++++--- fs/ext2/super.c | 7 +++-- fs/ext4/super.c | 9 +++--- fs/xfs/xfs_buf.c | 5 ++-- include/linux/dax.h | 33 ++++++++++++++++------ 7 files changed, 110 insertions(+), 23 deletions(-) diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 50a08b2ec247..9b5e2a5eb0ae 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -22,6 +22,8 @@ * @private: dax driver private data * @flags: state and boolean properties * @ops: operations for this device + * @holder_data: holder of a dax_device: could be filesystem or mapped device + * @holder_ops: operations for the inner holder */ struct dax_device { struct inode inode; @@ -29,6 +31,8 @@ struct dax_device { void *private; unsigned long flags; const struct dax_operations *ops; + void *holder_data; + const struct dax_holder_operations *holder_ops; }; static dev_t dax_devt; @@ -71,8 +75,11 @@ EXPORT_SYMBOL_GPL(dax_remove_host); * fs_dax_get_by_bdev() - temporary lookup mechanism for filesystem-dax * @bdev: block device to find a dax_device for * @start_off: returns the byte offset into the dax_device that @bdev starts + * @holder: filesystem or mapped device inside the dax_device + * @ops: operations for the inner holder */ -struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off) +struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off, + void *holder, const struct dax_holder_operations *ops) { struct dax_device *dax_dev; u64 part_size; @@ -92,11 +99,26 @@ struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off) dax_dev = xa_load(&dax_hosts, (unsigned long)bdev->bd_disk); if (!dax_dev || !dax_alive(dax_dev) || !igrab(&dax_dev->inode)) dax_dev = NULL; + else if (holder) { + if (!cmpxchg(&dax_dev->holder_data, NULL, holder)) + dax_dev->holder_ops = ops; + else + dax_dev = NULL; + } dax_read_unlock(id); return dax_dev; } EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev); + +void fs_put_dax(struct dax_device *dax_dev, void *holder) +{ + if (dax_dev && holder && + cmpxchg(&dax_dev->holder_data, holder, NULL) == holder) + dax_dev->holder_ops = NULL; + put_dax(dax_dev); +} +EXPORT_SYMBOL_GPL(fs_put_dax); #endif /* CONFIG_BLOCK && CONFIG_FS_DAX */ enum dax_device_flags { @@ -204,6 +226,29 @@ size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, } EXPORT_SYMBOL_GPL(dax_recovery_write); +int dax_holder_notify_failure(struct dax_device *dax_dev, u64 off, + u64 len, int mf_flags) +{ + int rc, id; + + id = dax_read_lock(); + if (!dax_alive(dax_dev)) { + rc = -ENXIO; + goto out; + } + + if (!dax_dev->holder_ops) { + rc = -EOPNOTSUPP; + goto out; + } + + rc = dax_dev->holder_ops->notify_failure(dax_dev, off, len, mf_flags); +out: + dax_read_unlock(id); + return rc; +} +EXPORT_SYMBOL_GPL(dax_holder_notify_failure); + #ifdef CONFIG_ARCH_HAS_PMEM_API void arch_wb_cache_pmem(void *addr, size_t size); void dax_flush(struct dax_device *dax_dev, void *addr, size_t size) @@ -277,8 +322,15 @@ void kill_dax(struct dax_device *dax_dev) if (!dax_dev) return; + if (dax_dev->holder_data != NULL) + dax_holder_notify_failure(dax_dev, 0, U64_MAX, 0); + clear_bit(DAXDEV_ALIVE, &dax_dev->flags); synchronize_srcu(&dax_srcu); + + /* clear holder data */ + dax_dev->holder_ops = NULL; + dax_dev->holder_data = NULL; } EXPORT_SYMBOL_GPL(kill_dax); @@ -420,6 +472,19 @@ void put_dax(struct dax_device *dax_dev) } EXPORT_SYMBOL_GPL(put_dax); +/** + * dax_holder() - obtain the holder of a dax device + * @dax_dev: a dax_device instance + + * Return: the holder's data which represents the holder if registered, + * otherwize NULL. + */ +void *dax_holder(struct dax_device *dax_dev) +{ + return dax_dev->holder_data; +} +EXPORT_SYMBOL_GPL(dax_holder); + /** * inode_dax: convert a public inode into its dax_dev * @inode: An inode with i_cdev pointing to a dax_dev diff --git a/drivers/md/dm.c b/drivers/md/dm.c index dfb0a551bd88..3de8167a3905 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -760,7 +760,7 @@ static int open_table_device(struct table_device *td, dev_t dev, } td->dm_dev.bdev = bdev; - td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off); + td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off, NULL, NULL); return 0; } diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 95addc5c9d34..3173debeaa5a 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -255,7 +255,8 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, if (IS_ERR(bdev)) return PTR_ERR(bdev); dif->bdev = bdev; - dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off); + dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off, + NULL, NULL); } dif->blocks = le32_to_cpu(dis->blocks); @@ -720,7 +721,8 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) } sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev, - &sbi->dax_part_off); + &sbi->dax_part_off, + NULL, NULL); } err = erofs_read_superblock(sb); @@ -812,7 +814,7 @@ static int erofs_release_device_info(int id, void *ptr, void *data) { struct erofs_device_info *dif = ptr; - fs_put_dax(dif->dax_dev); + fs_put_dax(dif->dax_dev, NULL); if (dif->bdev) blkdev_put(dif->bdev, FMODE_READ | FMODE_EXCL); erofs_fscache_unregister_cookie(&dif->fscache); @@ -886,7 +888,7 @@ static void erofs_kill_sb(struct super_block *sb) return; erofs_free_dev_context(sbi->devs); - fs_put_dax(sbi->dax_dev); + fs_put_dax(sbi->dax_dev, NULL); erofs_fscache_unregister_cookie(&sbi->s_fscache); erofs_fscache_unregister_fs(sb); kfree(sbi->opt.fsid); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index f6a19f6d9f6d..4638946251b9 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -171,7 +171,7 @@ static void ext2_put_super (struct super_block * sb) brelse (sbi->s_sbh); sb->s_fs_info = NULL; kfree(sbi->s_blockgroup_lock); - fs_put_dax(sbi->s_daxdev); + fs_put_dax(sbi->s_daxdev, NULL); kfree(sbi); } @@ -835,7 +835,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) } sb->s_fs_info = sbi; sbi->s_sb_block = sb_block; - sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off); + sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off, + NULL, NULL); spin_lock_init(&sbi->s_lock); ret = -EINVAL; @@ -1204,7 +1205,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) failed_mount: brelse(bh); failed_sbi: - fs_put_dax(sbi->s_daxdev); + fs_put_dax(sbi->s_daxdev, NULL); sb->s_fs_info = NULL; kfree(sbi->s_blockgroup_lock); kfree(sbi); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 450c918d68fc..0e91243b9616 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1307,7 +1307,7 @@ static void ext4_put_super(struct super_block *sb) if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->s_blockgroup_lock); - fs_put_dax(sbi->s_daxdev); + fs_put_dax(sbi->s_daxdev, NULL); fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); #if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); @@ -4262,7 +4262,7 @@ static void ext4_free_sbi(struct ext4_sb_info *sbi) return; kfree(sbi->s_blockgroup_lock); - fs_put_dax(sbi->s_daxdev); + fs_put_dax(sbi->s_daxdev, NULL); kfree(sbi); } @@ -4274,7 +4274,8 @@ static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb) if (!sbi) return NULL; - sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off); + sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off, + NULL, NULL); sbi->s_blockgroup_lock = kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); @@ -4286,7 +4287,7 @@ static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb) sbi->s_sb = sb; return sbi; err_out: - fs_put_dax(sbi->s_daxdev); + fs_put_dax(sbi->s_daxdev, NULL); kfree(sbi); return NULL; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 4aa9c9cf5b6e..1ec2a7b6d44e 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1911,7 +1911,7 @@ xfs_free_buftarg( list_lru_destroy(&btp->bt_lru); blkdev_issue_flush(btp->bt_bdev); - fs_put_dax(btp->bt_daxdev); + fs_put_dax(btp->bt_daxdev, NULL); kmem_free(btp); } @@ -1964,7 +1964,8 @@ xfs_alloc_buftarg( btp->bt_mount = mp; btp->bt_dev = bdev->bd_dev; btp->bt_bdev = bdev; - btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off); + btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off, NULL, + NULL); /* * Buffer IO error rate limiting. Limit it to no more than 10 messages diff --git a/include/linux/dax.h b/include/linux/dax.h index e7b81634c52a..cf85fc36da5f 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -43,8 +43,21 @@ struct dax_operations { void *addr, size_t bytes, struct iov_iter *iter); }; +struct dax_holder_operations { + /* + * notify_failure - notify memory failure into inner holder device + * @dax_dev: the dax device which contains the holder + * @offset: offset on this dax device where memory failure occurs + * @len: length of this memory failure event + * @flags: action flags for memory failure handler + */ + int (*notify_failure)(struct dax_device *dax_dev, u64 offset, + u64 len, int mf_flags); +}; + #if IS_ENABLED(CONFIG_DAX) struct dax_device *alloc_dax(void *private, const struct dax_operations *ops); +void *dax_holder(struct dax_device *dax_dev); void put_dax(struct dax_device *dax_dev); void kill_dax(struct dax_device *dax_dev); void dax_write_cache(struct dax_device *dax_dev, bool wc); @@ -66,6 +79,10 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma, return dax_synchronous(dax_dev); } #else +static inline void *dax_holder(struct dax_device *dax_dev) +{ + return NULL; +} static inline struct dax_device *alloc_dax(void *private, const struct dax_operations *ops) { @@ -114,12 +131,9 @@ struct writeback_control; #if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX) int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk); void dax_remove_host(struct gendisk *disk); -struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, - u64 *start_off); -static inline void fs_put_dax(struct dax_device *dax_dev) -{ - put_dax(dax_dev); -} +struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off, + void *holder, const struct dax_holder_operations *ops); +void fs_put_dax(struct dax_device *dax_dev, void *holder); #else static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk) { @@ -129,11 +143,12 @@ static inline void dax_remove_host(struct gendisk *disk) { } static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, - u64 *start_off) + u64 *start_off, void *holder, + const struct dax_holder_operations *ops) { return NULL; } -static inline void fs_put_dax(struct dax_device *dax_dev) +static inline void fs_put_dax(struct dax_device *dax_dev, void *holder) { } #endif /* CONFIG_BLOCK && CONFIG_FS_DAX */ @@ -203,6 +218,8 @@ size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, size_t nr_pages); +int dax_holder_notify_failure(struct dax_device *dax_dev, u64 off, u64 len, + int mf_flags); void dax_flush(struct dax_device *dax_dev, void *addr, size_t size); ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, -- 2.36.1