This adds direct-io atomic writes support in iomap. This adds - 1. IOMAP_ATOMIC flag for iomap iter. 2. Sets REQ_ATOMIC to bio opflags. 3. Adds necessary checks in iomap_dio code to ensure a single bio is submitted for an atomic write request. (since we only support ubuf type iocb). Otherwise return an error EIO. 4. Adds a common helper routine iomap_dio_check_atomic(). It helps in verifying mapped length and start/end physical offset against the hw device constraints for supporting atomic writes. This patch is based on a patch from John Garry <john.g.garry@xxxxxxxxxx> which adds such support of DIO atomic writes to iomap. Co-developed-by: Ojaswin Mujoo <ojaswin@xxxxxxxxxxxxx> Signed-off-by: Ojaswin Mujoo <ojaswin@xxxxxxxxxxxxx> Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@xxxxxxxxx> --- fs/iomap/direct-io.c | 75 +++++++++++++++++++++++++++++++++++++++++-- fs/iomap/trace.h | 3 +- include/linux/iomap.h | 1 + 3 files changed, 75 insertions(+), 4 deletions(-) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index bcd3f8cf5ea4..b4548acb74e7 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -256,7 +256,7 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, * clearing the WRITE_THROUGH flag in the dio request. 
*/ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, - const struct iomap *iomap, bool use_fua) + const struct iomap *iomap, bool use_fua, bool atomic_write) { blk_opf_t opflags = REQ_SYNC | REQ_IDLE; @@ -269,6 +269,9 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, else dio->flags &= ~IOMAP_DIO_WRITE_THROUGH; + if (atomic_write) + opflags |= REQ_ATOMIC; + return opflags; } @@ -279,11 +282,12 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, struct inode *inode = iter->inode; unsigned int fs_block_size = i_blocksize(inode), pad; loff_t length = iomap_length(iter); + const size_t orig_len = iter->len; loff_t pos = iter->pos; blk_opf_t bio_opf; struct bio *bio; bool need_zeroout = false; - bool use_fua = false; + bool use_fua = false, atomic_write = iter->flags & IOMAP_ATOMIC; int nr_pages, ret = 0; size_t copied = 0; size_t orig_count; @@ -356,6 +360,11 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, if (need_zeroout) { /* zero out from the start of the block to the write offset */ pad = pos & (fs_block_size - 1); + if (unlikely(pad && atomic_write)) { + WARN_ONCE(1, "pos not atomic write aligned\n"); + ret = -EINVAL; + goto out; + } if (pad) iomap_dio_zero(iter, dio, pos - pad, pad); } @@ -365,7 +374,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, * can set up the page vector appropriately for a ZONE_APPEND * operation. 
*/ - bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua); + bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic_write); nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS); do { @@ -397,6 +406,14 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, } n = bio->bi_iter.bi_size; + + /* This bio should have covered the complete length */ + if (unlikely(atomic_write && n != orig_len)) { + WARN_ON_ONCE(1); + ret = -EINVAL; + bio_put(bio); + goto out; + } if (dio->flags & IOMAP_DIO_WRITE) { task_io_account_write(n); } else { @@ -429,6 +446,8 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) { /* zero out from the end of the write to the end of the block */ pad = pos & (fs_block_size - 1); + /* This should never happen */ + WARN_ON_ONCE(unlikely(pad && atomic_write)); if (pad) iomap_dio_zero(iter, dio, pos, fs_block_size - pad); } @@ -516,6 +535,44 @@ static loff_t iomap_dio_iter(const struct iomap_iter *iter, } } +/* + * iomap_dio_check_atomic: DIO Atomic checks before calling bio submission. + * @iter: iomap iterator + * This function is called after filesystem block mapping and before bio + * formation/submission. This is the right place to verify hw device/block + * layer constraints to be followed for doing atomic writes. Hence do those + * common checks here. 
+ */ +static bool iomap_dio_check_atomic(struct iomap_iter *iter) +{ + struct block_device *bdev = iter->iomap.bdev; + unsigned long long map_len = iomap_length(iter); + unsigned long long start = iomap_sector(&iter->iomap, iter->pos) + << SECTOR_SHIFT; + unsigned long long end = start + map_len - 1; + unsigned int awu_min = + queue_atomic_write_unit_min_bytes(bdev->bd_queue); + unsigned int awu_max = + queue_atomic_write_unit_max_bytes(bdev->bd_queue); + unsigned long boundary = + queue_atomic_write_boundary_bytes(bdev->bd_queue); + unsigned long long mask = ~((unsigned long long)boundary - 1); + + + /* map_len should match user specified iter->len and fit in awu_max */ + if (map_len < iter->len || map_len > awu_max) + return false; + /* start should be aligned to block device min atomic unit alignment */ + if (!IS_ALIGNED(start, awu_min)) + return false; + /* If top bits don't match, an atomic write unit boundary is crossed */ + if (boundary && ((start & mask) != (end & mask))) + return false; + + return true; +} + + /* * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO * is being issued as AIO or not. 
This allows us to optimise pure data writes @@ -554,12 +611,16 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, struct blk_plug plug; struct iomap_dio *dio; loff_t ret = 0; + bool atomic_write = iocb->ki_flags & IOCB_ATOMIC; trace_iomap_dio_rw_begin(iocb, iter, dio_flags, done_before); if (!iomi.len) return NULL; + if (atomic_write && !iter_is_ubuf(iter)) + return ERR_PTR(-EINVAL); + dio = kmalloc(sizeof(*dio), GFP_KERNEL); if (!dio) return ERR_PTR(-ENOMEM); @@ -605,6 +666,9 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (iocb->ki_flags & IOCB_DIO_CALLER_COMP) dio->flags |= IOMAP_DIO_CALLER_COMP; + if (atomic_write) + iomi.flags |= IOMAP_ATOMIC; + if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) { ret = -EAGAIN; if (iomi.pos >= dio->i_size || @@ -656,6 +720,11 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, blk_start_plug(&plug); while ((ret = iomap_iter(&iomi, ops)) > 0) { + if (atomic_write && !iomap_dio_check_atomic(&iomi)) { + ret = -EIO; + break; + } + iomi.processed = iomap_dio_iter(&iomi, dio); /* diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index c16fd55f5595..c95576420bca 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -98,7 +98,8 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued); { IOMAP_REPORT, "REPORT" }, \ { IOMAP_FAULT, "FAULT" }, \ { IOMAP_DIRECT, "DIRECT" }, \ - { IOMAP_NOWAIT, "NOWAIT" } + { IOMAP_NOWAIT, "NOWAIT" }, \ + { IOMAP_ATOMIC, "ATOMIC" } #define IOMAP_F_FLAGS_STRINGS \ { IOMAP_F_NEW, "NEW" }, \ diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 96dd0acbba44..9eac704a0d6f 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -178,6 +178,7 @@ struct iomap_folio_ops { #else #define IOMAP_DAX 0 #endif /* CONFIG_FS_DAX */ +#define IOMAP_ATOMIC (1 << 9) struct iomap_ops { /* -- 2.43.0