On 2020/03/25 0:41, Christoph Hellwig wrote: > On Wed, Mar 25, 2020 at 12:24:53AM +0900, Johannes Thumshirn wrote: >> @@ -39,6 +40,7 @@ struct iomap_dio { >> struct task_struct *waiter; >> struct request_queue *last_queue; >> blk_qc_t cookie; >> + sector_t sector; >> } submit; >> >> /* used for aio completion: */ >> @@ -151,6 +153,9 @@ static void iomap_dio_bio_end_io(struct bio *bio) >> if (bio->bi_status) >> iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status)); >> >> + if (dio->flags & IOMAP_DIO_ZONE_APPEND) >> + dio->submit.sector = bio->bi_iter.bi_sector; > > The submit member in struct iomap_dio is for submit-time information, > while this is used in the completion path.. > >> nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES); >> iomap_dio_submit_bio(dio, iomap, bio); >> + >> + /* >> + * Issuing multiple BIOs for a large zone append write can >> + * result in reordering of the write fragments and to data >> + * corruption. So always stop after the first BIO is issued. >> + */ >> + if (zone_append) >> + break; > > At least for a normal file system that is absolutely not true. If > zonefs is so special it might be better of just using a slightly tweaked > copy of blkdev_direct_IO rather than using iomap. It would be very nice to not have to add this direct BIO use case in zonefs since that would be only for writes to sequential zones while all other operations use iomap. So instead of this, what about using a flag as Dave suggested (see below comment too) ? > >> @@ -446,6 +486,11 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, >> flags |= IOMAP_WRITE; >> dio->flags |= IOMAP_DIO_WRITE; >> >> + if (iocb->ki_flags & IOCB_ZONE_APPEND) { >> + flags |= IOMAP_ZONE_APPEND; >> + dio->flags |= IOMAP_DIO_ZONE_APPEND; >> + } >> + >> /* for data sync or sync, we need sync completion processing */ >> if (iocb->ki_flags & IOCB_DSYNC) >> dio->flags |= IOMAP_DIO_NEED_SYNC; >> @@ -516,6 +561,15 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, >> iov_iter_revert(iter, pos - dio->i_size); >> break; >> } >> + >> + /* >> + * Zone append writes cannot be split and be shorted. Break >> + * here to let the user know instead of sending more IOs which >> + * could get reordered and corrupt the written data. >> + */ >> + if (flags & IOMAP_ZONE_APPEND) >> + break; > > But that isn't what we do here. You exit after a single apply iteration > which is perfectly fine - at at least for a normal file system, zonefs > is rather weird. The comment is indeed not clear. For the short write, as Dave suggested, we should have a special flag for it. So would you be OK if we replace this with something like if (flags & IOMAP_SHORT_WRITE) break; Normal file systems with real block mapping metadata would not need this as they can perfectly handle non sequential zone append fragments of a large DIO. But zonefs will need the short writes since it lacks file block mapping metadata. > >> + >> } while ((count = iov_iter_count(iter)) > 0); >> blk_finish_plug(&plug); >> >> diff --git a/include/linux/fs.h b/include/linux/fs.h >> index 3cd4fe6b845e..aa4ad705e549 100644 >> --- a/include/linux/fs.h >> +++ b/include/linux/fs.h >> @@ -314,6 +314,7 @@ enum rw_hint { >> #define IOCB_SYNC (1 << 5) >> #define IOCB_WRITE (1 << 6) >> #define IOCB_NOWAIT (1 << 7) >> +#define IOCB_ZONE_APPEND (1 << 8) > > I don't think the iocb is the right interface for passing this > kind of information. We currently pass a bool wait to iomap_dio_rw > which really should be flags. I have a pending patch for that. Is that patch queued in iomap or xfs tree ? Could you point us to it please ? > >> diff --git a/include/linux/iomap.h b/include/linux/iomap.h >> index 8b09463dae0d..16c17a79e53d 100644 >> --- a/include/linux/iomap.h >> +++ b/include/linux/iomap.h >> @@ -68,7 +68,6 @@ struct vm_fault; >> */ >> #define IOMAP_F_PRIVATE 0x1000 >> >> - > > Spurious whitespace change. > >> /* >> * Magic value for addr: >> */ >> @@ -95,6 +94,17 @@ iomap_sector(struct iomap *iomap, loff_t pos) >> return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT; >> } >> >> +/* >> + * Flags for iomap_begin / iomap_end. No flag implies a read. >> + */ >> +#define IOMAP_WRITE (1 << 0) /* writing, must allocate blocks */ >> +#define IOMAP_ZERO (1 << 1) /* zeroing operation, may skip holes */ >> +#define IOMAP_REPORT (1 << 2) /* report extent status, e.g. FIEMAP */ >> +#define IOMAP_FAULT (1 << 3) /* mapping for page fault */ >> +#define IOMAP_DIRECT (1 << 4) /* direct I/O */ >> +#define IOMAP_NOWAIT (1 << 5) /* do not block */ >> +#define IOMAP_ZONE_APPEND (1 << 6) /* Use zone append for writes */ > > Why is this moved around? > -- Damien Le Moal Western Digital Research