On 2020/06/26 2:18, Kanchan Joshi wrote: > Introduce RWF_ZONE_APPEND flag to represent zone-append. User-space > sends this with write. Add IOCB_ZONE_APPEND which is set in > kiocb->ki_flags on receiving RWF_ZONE_APPEND. > Make direct IO submission path use IOCB_ZONE_APPEND to send bio with > append op. Direct IO completion returns zone-relative offset, in sector > unit, to upper layer using kiocb->ki_complete interface. > Report error if zone-append is requested on regular file or on sync > kiocb (i.e. one without ki_complete). > > Signed-off-by: Kanchan Joshi <joshi.k@xxxxxxxxxxx> > Signed-off-by: SelvaKumar S <selvakuma.s1@xxxxxxxxxxx> > Signed-off-by: Arnav Dawn <a.dawn@xxxxxxxxxxx> > Signed-off-by: Nitesh Shetty <nj.shetty@xxxxxxxxxxx> > Signed-off-by: Javier Gonzalez <javier.gonz@xxxxxxxxxxx> > --- > fs/block_dev.c | 28 ++++++++++++++++++++++++---- > include/linux/fs.h | 9 +++++++++ > include/uapi/linux/fs.h | 5 ++++- > 3 files changed, 37 insertions(+), 5 deletions(-) > > diff --git a/fs/block_dev.c b/fs/block_dev.c > index 47860e5..5180268 100644 > --- a/fs/block_dev.c > +++ b/fs/block_dev.c > @@ -185,6 +185,10 @@ static unsigned int dio_bio_write_op(struct kiocb *iocb) > /* avoid the need for a I/O completion work item */ > if (iocb->ki_flags & IOCB_DSYNC) > op |= REQ_FUA; > + > + if (iocb->ki_flags & IOCB_ZONE_APPEND) > + op |= REQ_OP_ZONE_APPEND; This is wrong. REQ_OP_WRITE is already set in the declaration of "op". How can this work ? > + > return op; > } > > @@ -295,6 +299,14 @@ static int blkdev_iopoll(struct kiocb *kiocb, bool wait) > return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait); > } > > +static inline long blkdev_bio_end_io_append(struct bio *bio) > +{ > + sector_t zone_sectors = blk_queue_zone_sectors(bio->bi_disk->queue); > + > + /* calculate zone relative offset for zone append */ The name of the function may be better spelling out zone_append instead of just append. But see next comment. > + return bio->bi_iter.bi_sector & (zone_sectors - 1); > +} > + > static void blkdev_bio_end_io(struct bio *bio) > { > struct blkdev_dio *dio = bio->bi_private; > @@ -307,15 +319,19 @@ static void blkdev_bio_end_io(struct bio *bio) > if (!dio->is_sync) { > struct kiocb *iocb = dio->iocb; > ssize_t ret; > + long res2 = 0; > > if (likely(!dio->bio.bi_status)) { > ret = dio->size; > iocb->ki_pos += ret; > + Blank line not needed. > + if (iocb->ki_flags & IOCB_ZONE_APPEND)> + res2 = blkdev_bio_end_io_append(bio); The name blkdev_bio_end_io_append() implies a bio end_io callback function, which is not the case. What about naming this blkdev_bio_res2() and move the if inside it ? > } else { > ret = blk_status_to_errno(dio->bio.bi_status); add "res2 = 0;" here and drop the declaration initialization. That will avoid doing the assignment twice for zone append case. > } > > - dio->iocb->ki_complete(iocb, ret, 0); > + dio->iocb->ki_complete(iocb, ret, res2); > if (dio->multi_bio) > bio_put(&dio->bio); > } else { > @@ -382,6 +398,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) > bio->bi_private = dio; > bio->bi_end_io = blkdev_bio_end_io; > bio->bi_ioprio = iocb->ki_ioprio; > + bio->bi_opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb); Personally, I would prefer a plain "if () else". Or even better: change dio_bio_write_op() into dio_bio_op() and just have: bio->bi_opf = dio_bio_op(iocb); > > ret = bio_iov_iter_get_pages(bio, iter); > if (unlikely(ret)) { > @@ -391,11 +408,9 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) > } > > if (is_read) { > - bio->bi_opf = REQ_OP_READ; > if (dio->should_dirty) > bio_set_pages_dirty(bio); > } else { > - bio->bi_opf = dio_bio_write_op(iocb); > task_io_account_write(bio->bi_iter.bi_size); > } > > @@ -465,12 +480,17 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) > static ssize_t > blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) > { > + bool is_sync = is_sync_kiocb(iocb); > int nr_pages; > > + /* zone-append is supported only on async-kiocb */ > + if (is_sync && iocb->ki_flags & IOCB_ZONE_APPEND) > + return -EINVAL; > + > nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES + 1); > if (!nr_pages) > return 0; > - if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES) > + if (is_sync && nr_pages <= BIO_MAX_PAGES) > return __blkdev_direct_IO_simple(iocb, iter, nr_pages); > > return __blkdev_direct_IO(iocb, iter, min(nr_pages, BIO_MAX_PAGES)); > diff --git a/include/linux/fs.h b/include/linux/fs.h > index 6c4ab4d..3202d9a 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -315,6 +315,7 @@ enum rw_hint { > #define IOCB_SYNC (1 << 5) > #define IOCB_WRITE (1 << 6) > #define IOCB_NOWAIT (1 << 7) > +#define IOCB_ZONE_APPEND (1 << 8) > > struct kiocb { > struct file *ki_filp; > @@ -3456,6 +3457,14 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) > ki->ki_flags |= (IOCB_DSYNC | IOCB_SYNC); > if (flags & RWF_APPEND) > ki->ki_flags |= IOCB_APPEND; > + if (flags & RWF_ZONE_APPEND) { > + /* currently support block device only */ > + umode_t mode = file_inode(ki->ki_filp)->i_mode; > + > + if (!(S_ISBLK(mode))) > + return -EOPNOTSUPP; > + ki->ki_flags |= IOCB_ZONE_APPEND; > + } > return 0; > } > > diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h > index 379a612..1ce06e9 100644 > --- a/include/uapi/linux/fs.h > +++ b/include/uapi/linux/fs.h > @@ -299,8 +299,11 @@ typedef int __bitwise __kernel_rwf_t; > /* per-IO O_APPEND */ > #define RWF_APPEND ((__force __kernel_rwf_t)0x00000010) > > +/* per-IO O_APPEND */ > +#define RWF_ZONE_APPEND ((__force __kernel_rwf_t)0x00000020) > + > /* mask of flags supported by the kernel */ > #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ > - RWF_APPEND) > + RWF_APPEND | RWF_ZONE_APPEND) > > #endif /* _UAPI_LINUX_FS_H */ > -- Damien Le Moal Western Digital Research