On Wed, Mar 25, 2020 at 12:24:53AM +0900, Johannes Thumshirn wrote: > @@ -39,6 +40,7 @@ struct iomap_dio { > struct task_struct *waiter; > struct request_queue *last_queue; > blk_qc_t cookie; > + sector_t sector; > } submit; > > /* used for aio completion: */ > @@ -151,6 +153,9 @@ static void iomap_dio_bio_end_io(struct bio *bio) > if (bio->bi_status) > iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status)); > > + if (dio->flags & IOMAP_DIO_ZONE_APPEND) > + dio->submit.sector = bio->bi_iter.bi_sector; The submit member in struct iomap_dio is for submit-time information, while this is used in the completion path.. > nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES); > iomap_dio_submit_bio(dio, iomap, bio); > + > + /* > + * Issuing multiple BIOs for a large zone append write can > + * result in reordering of the write fragments and to data > + * corruption. So always stop after the first BIO is issued. > + */ > + if (zone_append) > + break; At least for a normal file system that is absolutely not true. If zonefs is so special it might be better of just using a slightly tweaked copy of blkdev_direct_IO rather than using iomap. > @@ -446,6 +486,11 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, > flags |= IOMAP_WRITE; > dio->flags |= IOMAP_DIO_WRITE; > > + if (iocb->ki_flags & IOCB_ZONE_APPEND) { > + flags |= IOMAP_ZONE_APPEND; > + dio->flags |= IOMAP_DIO_ZONE_APPEND; > + } > + > /* for data sync or sync, we need sync completion processing */ > if (iocb->ki_flags & IOCB_DSYNC) > dio->flags |= IOMAP_DIO_NEED_SYNC; > @@ -516,6 +561,15 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, > iov_iter_revert(iter, pos - dio->i_size); > break; > } > + > + /* > + * Zone append writes cannot be split and be shorted. Break > + * here to let the user know instead of sending more IOs which > + * could get reordered and corrupt the written data. > + */ > + if (flags & IOMAP_ZONE_APPEND) > + break; But that isn't what we do here. You exit after a single apply iteration which is perfectly fine - at at least for a normal file system, zonefs is rather weird. > + > } while ((count = iov_iter_count(iter)) > 0); > blk_finish_plug(&plug); > > diff --git a/include/linux/fs.h b/include/linux/fs.h > index 3cd4fe6b845e..aa4ad705e549 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -314,6 +314,7 @@ enum rw_hint { > #define IOCB_SYNC (1 << 5) > #define IOCB_WRITE (1 << 6) > #define IOCB_NOWAIT (1 << 7) > +#define IOCB_ZONE_APPEND (1 << 8) I don't think the iocb is the right interface for passing this kind of information. We currently pass a bool wait to iomap_dio_rw which really should be flags. I have a pending patch for that. > diff --git a/include/linux/iomap.h b/include/linux/iomap.h > index 8b09463dae0d..16c17a79e53d 100644 > --- a/include/linux/iomap.h > +++ b/include/linux/iomap.h > @@ -68,7 +68,6 @@ struct vm_fault; > */ > #define IOMAP_F_PRIVATE 0x1000 > > - Spurious whitespace change. > /* > * Magic value for addr: > */ > @@ -95,6 +94,17 @@ iomap_sector(struct iomap *iomap, loff_t pos) > return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT; > } > > +/* > + * Flags for iomap_begin / iomap_end. No flag implies a read. > + */ > +#define IOMAP_WRITE (1 << 0) /* writing, must allocate blocks */ > +#define IOMAP_ZERO (1 << 1) /* zeroing operation, may skip holes */ > +#define IOMAP_REPORT (1 << 2) /* report extent status, e.g. FIEMAP */ > +#define IOMAP_FAULT (1 << 3) /* mapping for page fault */ > +#define IOMAP_DIRECT (1 << 4) /* direct I/O */ > +#define IOMAP_NOWAIT (1 << 5) /* do not block */ > +#define IOMAP_ZONE_APPEND (1 << 6) /* Use zone append for writes */ Why is this moved around?