On Sat, Mar 28, 2020 at 01:50:12AM +0900, Johannes Thumshirn wrote: > Synchronous direct I/O to a sequential write only zone can be issued using > the new REQ_OP_ZONE_APPEND request operation. As dispatching multiple > BIOs can potentially result in reordering, we cannot support asynchronous > IO via this interface. We trivially can support asynchronous I/O if the write size is smaller than the supported zone append size. Slightly less trivially, we could also support it by chaining a new submission after the first bio completes. > +static void zonefs_zone_append_bio_endio(struct bio *bio) > +{ > + struct task_struct *waiter = bio->bi_private; > + > + WRITE_ONCE(bio->bi_private, NULL); > + blk_wake_io_task(waiter); > + > + bio_release_pages(bio, false); > + bio_put(bio); > +} > + > +static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) > +{ > + struct inode *inode = file_inode(iocb->ki_filp); > + struct zonefs_inode_info *zi = ZONEFS_I(inode); > + struct block_device *bdev = inode->i_sb->s_bdev; > + ssize_t ret = 0; > + ssize_t size; > + struct bio *bio; > + unsigned max; > + int nr_pages; > + blk_qc_t qc; > + > + nr_pages = iov_iter_npages(from, BIO_MAX_PAGES); > + if (!nr_pages) > + return 0; > + > + max = queue_max_zone_append_sectors(bdev_get_queue(bdev)) << 9; > + max = ALIGN_DOWN(max, inode->i_sb->s_blocksize); > + iov_iter_truncate(from, max); > + > + bio = bio_alloc_bioset(GFP_NOFS, nr_pages, &fs_bio_set); > + if (!bio) > + return -ENOMEM; > + > + bio_set_dev(bio, bdev); > + bio->bi_iter.bi_sector = zi->i_zsector; > + bio->bi_write_hint = iocb->ki_hint; > + bio->bi_private = current; > + bio->bi_end_io = zonefs_zone_append_bio_endio; > + bio->bi_ioprio = iocb->ki_ioprio; > + bio->bi_opf = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE; > + if (iocb->ki_flags & IOCB_DSYNC) > + bio->bi_opf |= REQ_FUA; > + > + ret = bio_iov_iter_get_pages(bio, from); > + if (unlikely(ret)) { > + bio->bi_status = BLK_STS_IOERR; > + bio_endio(bio); > + return ret; > + } > + size = 
bio->bi_iter.bi_size; > + task_io_account_write(ret); > + > + if (iocb->ki_flags & IOCB_HIPRI) > + bio_set_polled(bio, iocb); > + > + bio_get(bio); > + qc = submit_bio(bio); > + for (;;) { > + set_current_state(TASK_UNINTERRUPTIBLE); > + if (!READ_ONCE(bio->bi_private)) > + break; > + if (!(iocb->ki_flags & IOCB_HIPRI) || > + !blk_poll(bdev_get_queue(bdev), qc, true)) > + io_schedule(); > + } > + __set_current_state(TASK_RUNNING); > + > + if (unlikely(bio->bi_status)) > + ret = blk_status_to_errno(bio->bi_status); > + > + bio_put(bio); > + > + zonefs_file_write_dio_end_io(iocb, size, ret, 0); > + if (ret >= 0) { > + iocb->ki_pos += size; > + return size; > + } > + > + return ret; This looks like no one waits for I/O completion? Also, it looks like it silently causes a short write, which probably needs to be documented.