On 7/5/24 12:04, Jingbo Xu wrote: > Sometimes the file offset alignment needs to be opt-in to achieve the > optimum performance at the backend store. > > For example when ErasureCode [1] is used at the backend store, the > optimum write performance is achieved when the WRITE request is aligned > with the stripe size of ErasureCode. Otherwise a non-aligned WRITE > request needs to be split at the stripe size boundary. It is quite > costly to handle these split partial requests, as firstly the whole > stripe to which the split partial request belongs needs to be read out, > then overwrite the read stripe buffer with the request, and finally write > the whole stripe back to the persistent storage. > > Thus the backend store can suffer severe performance degradation when > WRITE requests can not fit into one stripe exactly. The write performance > can be 10x slower when the request is 256KB in size given 4MB stripe size. > Also there can be 50% performance degradation in theory if the request > is not stripe boundary aligned. > > Besides, the conveyed test indicates that, the non-alignment issue > becomes more severe when decreasing fuse's max_ratio, maybe partly > because the background writeback now is more likely to run parallelly > with the dirtier. > > fuse's max_ratio ratio of aligned WRITE requests > ---------------- ------------------------------- > 70 99.9% > 40 74% > 20 45% > 10 20% > > With the patched version, which makes the alignment constraint opt-in > when constructing WRITE requests, the ratio of aligned WRITE requests > increases to 98% (previously 20%) when fuse's max_ratio is 10. > > [1] https://lore.kernel.org/linux-fsdevel/20240124070512.52207-1-jefflexu@xxxxxxxxxxxxxxxxx/T/#m9bce469998ea6e4f911555c6f7be1e077ce3d8b4 > Signed-off-by: Jingbo Xu <jefflexu@xxxxxxxxxxxxxxxxx> > > Signed-off-by: Jingbo Xu <jefflexu@xxxxxxxxxxxxxxxxx> > --- > fs/fuse/file.c | 4 ++++ > fs/fuse/fuse_i.h | 6 ++++++ > fs/fuse/inode.c | 9 +++++++++ > include/uapi/linux/fuse.h | 9 ++++++++- > 4 files changed, 27 insertions(+), 1 deletion(-) > > diff --git a/fs/fuse/file.c b/fs/fuse/file.c > index f39456c65ed7..f9b477016c2e 100644 > --- a/fs/fuse/file.c > +++ b/fs/fuse/file.c > @@ -2246,6 +2246,10 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page, > if (ap->num_pages == data->max_pages && !fuse_pages_realloc(data)) > return true; > > + /* Reached alignment */ > + if (fc->opt_alignment && !(page->index % fc->opt_alignment_pages)) > + return true; I link the idea, but couldn't it just do that with fc->max_pages? I'm asking because fc->opt_alignment_pages takes another uint32_t in fuse_init_out and there is not so much space left anymore. > + > return false; > } > > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h > index f23919610313..5963571b394c 100644 > --- a/fs/fuse/fuse_i.h > +++ b/fs/fuse/fuse_i.h > @@ -860,6 +860,9 @@ struct fuse_conn { > /** Passthrough support for read/write IO */ > unsigned int passthrough:1; > > + /* Foffset alignment required for optimum performance */ > + unsigned int opt_alignment:1; > + > /** Maximum stack depth for passthrough backing files */ > int max_stack_depth; > > @@ -917,6 +920,9 @@ struct fuse_conn { > /** IDR for backing files ids */ > struct idr backing_files_map; > #endif > + > + /* The foffset alignment in PAGE_SIZE */ > + unsigned int opt_alignment_pages; > }; > > /* > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c > index 99e44ea7d875..9266b22cce8e 100644 > --- a/fs/fuse/inode.c > +++ b/fs/fuse/inode.c > @@ -1331,6 +1331,15 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, > } > if (flags & FUSE_NO_EXPORT_SUPPORT) > fm->sb->s_export_op = &fuse_export_fid_operations; > + > + /* fallback to default if opt_alignment <= PAGE_SHIFT */ > + if (flags & FUSE_OPT_ALIGNMENT) { > + if (arg->opt_alignment > PAGE_SHIFT) { > + fc->opt_alignment = 1; > + fc->opt_alignment_pages = 1 << > + (arg->opt_alignment - PAGE_SHIFT); opt_alignment is the number of bits required for alignment? Not very user friendly, from my point of view would be better to have this in a byte or kb unit. Thanks, Bernd > + } > + } > } else { > ra_pages = fc->max_read / PAGE_SIZE; > fc->no_lock = 1; > diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h > index d08b99d60f6f..2c6ad1577591 100644 > --- a/include/uapi/linux/fuse.h > +++ b/include/uapi/linux/fuse.h > @@ -217,6 +217,9 @@ > * - add backing_id to fuse_open_out, add FOPEN_PASSTHROUGH open flag > * - add FUSE_NO_EXPORT_SUPPORT init flag > * - add FUSE_NOTIFY_RESEND, add FUSE_HAS_RESEND init flag > + * > + * 7.41 > + * - add opt_alignment to fuse_init_out, add FUSE_OPT_ALIGNMENT init flag > */ > > #ifndef _LINUX_FUSE_H > @@ -421,6 +424,8 @@ struct fuse_file_lock { > * FUSE_NO_EXPORT_SUPPORT: explicitly disable export support > * FUSE_HAS_RESEND: kernel supports resending pending requests, and the high bit > * of the request ID indicates resend requests > + * FUSE_OPT_ALIGNMENT: init_out.opt_alignment contains log2(byte alignment) for > + * foffset alignment for optimum write performance > */ > #define FUSE_ASYNC_READ (1 << 0) > #define FUSE_POSIX_LOCKS (1 << 1) > @@ -463,6 +468,7 @@ struct fuse_file_lock { > #define FUSE_PASSTHROUGH (1ULL << 37) > #define FUSE_NO_EXPORT_SUPPORT (1ULL << 38) > #define FUSE_HAS_RESEND (1ULL << 39) > +#define FUSE_OPT_ALIGNMENT (1ULL << 40) > > /* Obsolete alias for FUSE_DIRECT_IO_ALLOW_MMAP */ > #define FUSE_DIRECT_IO_RELAX FUSE_DIRECT_IO_ALLOW_MMAP > @@ -893,7 +899,8 @@ struct fuse_init_out { > uint16_t map_alignment; > uint32_t flags2; > uint32_t max_stack_depth; > - uint32_t unused[6]; > + uint32_t opt_alignment; > + uint32_t unused[5]; > }; > > #define CUSE_INIT_INFO_MAX 4096