Introduce RWF_ZONE_APPEND flag to represent zone-append. User-space sends this with write. Add IOCB_ZONE_APPEND which is set in kiocb->ki_flags on receiving RWF_ZONE_APPEND. Make direct IO submission path use IOCB_ZONE_APPEND to send bio with append op. Direct IO completion returns zone-relative offset, in sector unit, to upper layer using kiocb->ki_complete interface. Report error if zone-append is requested on regular file or on sync kiocb (i.e. one without ki_complete). Signed-off-by: Kanchan Joshi <joshi.k@xxxxxxxxxxx> Signed-off-by: SelvaKumar S <selvakuma.s1@xxxxxxxxxxx> Signed-off-by: Arnav Dawn <a.dawn@xxxxxxxxxxx> Signed-off-by: Nitesh Shetty <nj.shetty@xxxxxxxxxxx> Signed-off-by: Javier Gonzalez <javier.gonz@xxxxxxxxxxx> --- fs/block_dev.c | 28 ++++++++++++++++++++++++---- include/linux/fs.h | 9 +++++++++ include/uapi/linux/fs.h | 5 ++++- 3 files changed, 37 insertions(+), 5 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 47860e5..5180268 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -185,6 +185,10 @@ static unsigned int dio_bio_write_op(struct kiocb *iocb) /* avoid the need for a I/O completion work item */ if (iocb->ki_flags & IOCB_DSYNC) op |= REQ_FUA; + + if (iocb->ki_flags & IOCB_ZONE_APPEND) + op |= REQ_OP_ZONE_APPEND; + return op; } @@ -295,6 +299,14 @@ static int blkdev_iopoll(struct kiocb *kiocb, bool wait) return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait); } +static inline long blkdev_bio_end_io_append(struct bio *bio) +{ + sector_t zone_sectors = blk_queue_zone_sectors(bio->bi_disk->queue); + + /* calculate zone relative offset for zone append */ + return bio->bi_iter.bi_sector & (zone_sectors - 1); +} + static void blkdev_bio_end_io(struct bio *bio) { struct blkdev_dio *dio = bio->bi_private; @@ -307,15 +319,19 @@ static void blkdev_bio_end_io(struct bio *bio) if (!dio->is_sync) { struct kiocb *iocb = dio->iocb; ssize_t ret; + long res2 = 0; if (likely(!dio->bio.bi_status)) { ret = dio->size; iocb->ki_pos += ret; + + if (iocb->ki_flags & IOCB_ZONE_APPEND) + res2 = blkdev_bio_end_io_append(bio); } else { ret = blk_status_to_errno(dio->bio.bi_status); } - dio->iocb->ki_complete(iocb, ret, 0); + dio->iocb->ki_complete(iocb, ret, res2); if (dio->multi_bio) bio_put(&dio->bio); } else { @@ -382,6 +398,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) bio->bi_private = dio; bio->bi_end_io = blkdev_bio_end_io; bio->bi_ioprio = iocb->ki_ioprio; + bio->bi_opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb); ret = bio_iov_iter_get_pages(bio, iter); if (unlikely(ret)) { @@ -391,11 +408,9 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) } if (is_read) { - bio->bi_opf = REQ_OP_READ; if (dio->should_dirty) bio_set_pages_dirty(bio); } else { - bio->bi_opf = dio_bio_write_op(iocb); task_io_account_write(bio->bi_iter.bi_size); } @@ -465,12 +480,17 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { + bool is_sync = is_sync_kiocb(iocb); int nr_pages; + /* zone-append is supported only on async-kiocb */ + if (is_sync && iocb->ki_flags & IOCB_ZONE_APPEND) + return -EINVAL; + nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES + 1); if (!nr_pages) return 0; - if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES) + if (is_sync && nr_pages <= BIO_MAX_PAGES) return __blkdev_direct_IO_simple(iocb, iter, nr_pages); return __blkdev_direct_IO(iocb, iter, min(nr_pages, BIO_MAX_PAGES)); diff --git a/include/linux/fs.h b/include/linux/fs.h index 6c4ab4d..3202d9a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -315,6 +315,7 @@ enum rw_hint { #define IOCB_SYNC (1 << 5) #define IOCB_WRITE (1 << 6) #define IOCB_NOWAIT (1 << 7) +#define IOCB_ZONE_APPEND (1 << 8) struct kiocb { struct file *ki_filp; @@ -3456,6 +3457,14 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) ki->ki_flags |= (IOCB_DSYNC | IOCB_SYNC); if (flags & RWF_APPEND) ki->ki_flags |= IOCB_APPEND; + if (flags & RWF_ZONE_APPEND) { + /* currently support block device only */ + umode_t mode = file_inode(ki->ki_filp)->i_mode; + + if (!(S_ISBLK(mode))) + return -EOPNOTSUPP; + ki->ki_flags |= IOCB_ZONE_APPEND; + } return 0; } diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 379a612..1ce06e9 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -299,8 +299,11 @@ typedef int __bitwise __kernel_rwf_t; /* per-IO O_APPEND */ #define RWF_APPEND ((__force __kernel_rwf_t)0x00000010) +/* per-IO O_APPEND */ +#define RWF_ZONE_APPEND ((__force __kernel_rwf_t)0x00000020) + /* mask of flags supported by the kernel */ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ - RWF_APPEND) + RWF_APPEND | RWF_ZONE_APPEND) #endif /* _UAPI_LINUX_FS_H */ -- 2.7.4