[PATCH 2/2] zonefs: use zone-append for AIO as well

Johannes Thumshirn <johannes.thumshirn@xxxxxxx> · Mon, 20 Jul 2020 22:21:18 +0900

If we get an async I/O iocb with an O_APPEND or RWF_APPEND flag set,
submit it using REQ_OP_ZONE_APPEND to the block layer.

As a REQ_OP_ZONE_APPEND bio must not be split, this comes with an
additional constraint: the buffer submitted to zonefs must not be
bigger than the max zone append size of the underlying device. For
synchronous I/O we don't care about this constraint, as we can return
short writes; for AIO we need to return an error if the buffer is too big.

On a successful completion, the position the data is written to is
returned via AIO's res2 field to the calling application.

Signed-off-by: Johannes Thumshirn <johannes.thumshirn@xxxxxxx>
---
 fs/zonefs/super.c  | 143 +++++++++++++++++++++++++++++++++++++++------
 fs/zonefs/zonefs.h |   3 +
 2 files changed, 128 insertions(+), 18 deletions(-)

diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 5832e9f69268..f155a658675b 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -24,6 +24,8 @@
 
 #include "zonefs.h"
 
+static struct bio_set zonefs_dio_bio_set;
+
 static inline int zonefs_zone_mgmt(struct zonefs_inode_info *zi,
 				   enum req_opf op)
 {
@@ -700,16 +702,99 @@ static const struct iomap_dio_ops zonefs_write_dio_ops = {
 	.end_io			= zonefs_file_write_dio_end_io,
 };
 
+/*
+ * State of one zone append direct write. It is allocated together with its
+ * bio from zonefs_dio_bio_set, which reserves offsetof(struct zonefs_dio, bio)
+ * bytes in front of every bio, so the bio has to stay the last member.
+ * @sector holds the position the data was written at, in bytes; AIO
+ * completion reports it as res2.
+ */
+struct zonefs_dio {
+	struct kiocb		*iocb;
+	struct task_struct	*waiter;
+	int			error;
+	struct work_struct	work;
+	size_t			size;
+	u64			sector;
+	struct completion	completion;
+	struct bio		bio;
+};
+
+/*
+ * Finish an AIO zone append write from the zonefs-dio workqueue: run the
+ * common write end_io handler, advance the file position on success and
+ * report the result through the iocb completion callback.
+ */
+static void zonefs_dio_complete_work(struct work_struct *work)
+{
+	struct zonefs_dio *dio = container_of(work, struct zonefs_dio, work);
+	struct kiocb *iocb = dio->iocb;
+	size_t size = dio->size;
+	ssize_t ret;
+
+	ret = zonefs_file_write_dio_end_io(iocb, size, dio->error, 0);
+	if (ret == 0) {
+		/* AIO expects the number of bytes written, not 0. */
+		ret = size;
+		iocb->ki_pos += size;
+	}
+
+	iocb->ki_complete(iocb, ret, dio->sector);
+
+	bio_put(&dio->bio);
+}
+
+/*
+ * bio completion handler for zone append writes. Records the error or the
+ * position the data was written at, then either wakes up the synchronous
+ * submitter or defers AIO completion to the s_dio_done_wq workqueue.
+ */
+static void zonefs_file_dio_append_end_io(struct bio *bio)
+{
+	struct zonefs_dio *dio = container_of(bio, struct zonefs_dio, bio);
+	struct kiocb *iocb = dio->iocb;
+	struct inode *inode = file_inode(iocb->ki_filp);
+
+	if (bio->bi_status) {
+		dio->error = blk_status_to_errno(bio->bi_status);
+		/* Do not report an uninitialized position as res2. */
+		dio->sector = 0;
+	} else {
+		dio->sector = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+	}
+
+	if (is_sync_kiocb(iocb)) {
+		struct task_struct *waiter = dio->waiter;
+
+		/*
+		 * Clear the waiter before waking it up. The submitter only
+		 * stops sleeping once it sees dio->waiter == NULL, so the
+		 * reverse order can lose the wakeup and hang the write.
+		 */
+		WRITE_ONCE(dio->waiter, NULL);
+		blk_wake_io_task(waiter);
+	} else {
+		INIT_WORK(&dio->work, zonefs_dio_complete_work);
+		queue_work(ZONEFS_SB(inode->i_sb)->s_dio_done_wq, &dio->work);
+	}
+
+	bio_release_pages(bio, false);
+	bio_put(bio);
+}
+
 static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct inode *inode = file_inode(iocb->ki_filp);
 	struct zonefs_inode_info *zi = ZONEFS_I(inode);
 	struct block_device *bdev = inode->i_sb->s_bdev;
+	struct zonefs_dio *dio;
 	unsigned int max;
 	struct bio *bio;
-	ssize_t size;
 	int nr_pages;
 	ssize_t ret;
+	bool sync = is_sync_kiocb(iocb);
+	bool polled = false;
+	blk_qc_t qc;
 
 	max = queue_max_zone_append_sectors(bdev_get_queue(bdev));
 	max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize);
@@ -720,15 +777,24 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
 		return 0;
 
 
-	bio = bio_alloc_bioset(GFP_NOFS, nr_pages, &fs_bio_set);
+	bio = bio_alloc_bioset(GFP_NOFS, nr_pages, &zonefs_dio_bio_set);
 	if (!bio)
 		return -ENOMEM;
 
+	dio = container_of(bio, struct zonefs_dio, bio);
+	dio->iocb = iocb;
+	dio->error = 0;
+	if (sync) {
+		dio->waiter = current;
+		init_completion(&dio->completion);
+	}
+
 	bio_set_dev(bio, bdev);
 	bio->bi_iter.bi_sector = zi->i_zsector;
 	bio->bi_write_hint = iocb->ki_hint;
 	bio->bi_ioprio = iocb->ki_ioprio;
 	bio->bi_opf = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE;
+	bio->bi_end_io = zonefs_file_dio_append_end_io;
 	if (iocb->ki_flags & IOCB_DSYNC)
 		bio->bi_opf |= REQ_FUA;
 
@@ -737,21 +803,47 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
-		bio_io_error(bio);
+		/*
+		 * The bio was never submitted. Ending it here would run
+		 * zonefs_file_dio_append_end_io() and, for AIO, complete the
+		 * iocb in addition to the error returned below.
+		 */
+		bio_release_pages(bio, false);
+		bio_put(bio);
 		return ret;
 	}
-	size = bio->bi_iter.bi_size;
+	dio->size = bio->bi_iter.bi_size;
 	task_io_account_write(ret);
 
-	if (iocb->ki_flags & IOCB_HIPRI)
+	/* Always assign polled: it is read below even without IOCB_HIPRI. */
+	polled = iocb->ki_flags & IOCB_HIPRI;
+	if (polled)
 		bio_set_polled(bio, iocb);
 
-	ret = submit_bio_wait(bio);
+	bio_get(bio);
+	qc = submit_bio(bio);
 
-	bio_put(bio);
+	if (polled)
+		WRITE_ONCE(iocb->ki_cookie, qc);
 
-	zonefs_file_write_dio_end_io(iocb, size, ret, 0);
-	if (ret >= 0) {
-		iocb->ki_pos += size;
-		return size;
+	if (!sync)
+		return -EIOCBQUEUED;
+
+	for (;;) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		if (!READ_ONCE(dio->waiter))
+			break;
+
+		if (!(iocb->ki_flags & IOCB_HIPRI) ||
+		    !blk_poll(bdev_get_queue(bdev), qc, true))
+			blk_io_schedule();
 	}
+	__set_current_state(TASK_RUNNING);
+
+	ret = zonefs_file_write_dio_end_io(iocb, dio->size,
+					   dio->error, 0);
+	if (ret == 0) {
+		ret = dio->size;
+		iocb->ki_pos += dio->size;
+	}
+	bio_put(bio);
 
 	return ret;
 }
@@ -813,7 +899,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
 			goto inode_unlock;
 		}
 		mutex_unlock(&zi->i_truncate_mutex);
-		append = sync;
+		append = sync || iocb->ki_flags & IOCB_APPEND;
 	}
 
 	if (append)
@@ -821,8 +907,8 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
 	else
 		ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops,
 				   &zonefs_write_dio_ops, sync);
-	if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
-	    (ret > 0 || ret == -EIOCBQUEUED)) {
+
+	if (ret > 0 || ret == -EIOCBQUEUED) {
 		if (ret > 0)
 			count = ret;
 		mutex_lock(&zi->i_truncate_mutex);
@@ -1580,6 +1666,11 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
 	if (!sb->s_root)
 		goto cleanup;
 
+	sbi->s_dio_done_wq = alloc_workqueue("zonefs-dio/%s", WQ_MEM_RECLAIM,
+					     0, sb->s_id);
+	if (!sbi->s_dio_done_wq)
+		goto cleanup;
+
 	/* Create and populate files in zone groups directories */
 	for (t = 0; t < ZONEFS_ZTYPE_MAX; t++) {
 		ret = zonefs_create_zgroup(&zd, t);
@@ -1603,8 +1694,14 @@ static void zonefs_kill_super(struct super_block *sb)
 {
 	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
 
-	if (sb->s_root)
+	if (sb->s_root) {
 		d_genocide(sb->s_root);
+
+		if (sbi->s_dio_done_wq) {
+			destroy_workqueue(sbi->s_dio_done_wq);
+			sbi->s_dio_done_wq = NULL;
+		}
+	}
 	kill_block_super(sb);
 	kfree(sbi);
 }
@@ -1651,17 +1748,28 @@ static int __init zonefs_init(void)
 	if (ret)
 		return ret;
 
+	ret = bioset_init(&zonefs_dio_bio_set, 4,
+			  offsetof(struct zonefs_dio, bio), BIOSET_NEED_BVECS);
+	if (ret)
+		goto destroy_inodecache;
+
 	ret = register_filesystem(&zonefs_type);
-	if (ret) {
-		zonefs_destroy_inodecache();
-		return ret;
-	}
+	if (ret)
+		goto exit_bioset;
 
 	return 0;
+
+exit_bioset:
+	bioset_exit(&zonefs_dio_bio_set);
+destroy_inodecache:
+	zonefs_destroy_inodecache();
+	return ret;
 }
 
 static void __exit zonefs_exit(void)
 {
 	zonefs_destroy_inodecache();
 	unregister_filesystem(&zonefs_type);
+	/* Tear the bio set down only once the filesystem is unregistered. */
+	bioset_exit(&zonefs_dio_bio_set);
 }
diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h
index 51141907097c..fe91df5eeffe 100644
--- a/fs/zonefs/zonefs.h
+++ b/fs/zonefs/zonefs.h
@@ -185,6 +185,9 @@ struct zonefs_sb_info {
 
 	unsigned int		s_max_open_zones;
 	atomic_t		s_open_zones;
+
+	/* AIO completions deferred from interrupt context */
+	struct workqueue_struct *s_dio_done_wq;
 };
 
 static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb)
-- 
2.26.2