Hi Amir, On 2019/11/19 下午12:22, Amir Goldstein wrote: > On Tue, Nov 19, 2019 at 4:14 AM Jiufei Xue <jiufei.xue@xxxxxxxxxxxxxxxxx> wrote: >> >> A performance regression is observed since linux v4.19 when we do aio >> test using fio with iodepth 128 on overlayfs. And we found that queue >> depth of the device is always 1 which is unexpected. >> >> After investigation, it is found that commit 16914e6fc7 >> (“ovl: add ovl_read_iter()”) and commit 2a92e07edc >> (“ovl: add ovl_write_iter()”) use do_iter_readv_writev() to submit >> requests to real filesystem. Async IOs are converted to sync IOs here >> and cause performance regression. >> >> So implement async IO for stacked reading and writing. >> >> Signed-off-by: Jiufei Xue <jiufei.xue@xxxxxxxxxxxxxxxxx> >> --- >> fs/overlayfs/file.c | 97 +++++++++++++++++++++++++++++++++++++++++------- >> fs/overlayfs/overlayfs.h | 2 + >> fs/overlayfs/super.c | 12 +++++- >> 3 files changed, 95 insertions(+), 16 deletions(-) >> >> diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c >> index e235a63..07d94e7 100644 >> --- a/fs/overlayfs/file.c >> +++ b/fs/overlayfs/file.c >> @@ -11,6 +11,14 @@ >> #include <linux/uaccess.h> >> #include "overlayfs.h" >> >> +struct ovl_aio_req { >> + struct kiocb iocb; >> + struct kiocb *orig_iocb; >> + struct fd fd; >> +}; >> + >> +static struct kmem_cache *ovl_aio_request_cachep; >> + >> static char ovl_whatisit(struct inode *inode, struct inode *realinode) >> { >> if (realinode != ovl_inode_upper(inode)) >> @@ -225,6 +233,21 @@ static rwf_t ovl_iocb_to_rwf(struct kiocb *iocb) >> return flags; >> } >> >> +static void ovl_aio_rw_complete(struct kiocb *iocb, long res, long res2) >> +{ >> + struct ovl_aio_req *aio_req = container_of(iocb, struct ovl_aio_req, iocb); >> + struct kiocb *orig_iocb = aio_req->orig_iocb; >> + >> + if (iocb->ki_flags & IOCB_WRITE) >> + file_end_write(iocb->ki_filp); >> + >> + orig_iocb->ki_pos = iocb->ki_pos; >> + orig_iocb->ki_complete(orig_iocb, res, res2); >> + >> + fdput(aio_req->fd); >> + kmem_cache_free(ovl_aio_request_cachep, aio_req); >> +} >> + >> static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) >> { >> struct file *file = iocb->ki_filp; >> @@ -240,14 +263,28 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) >> return ret; >> >> old_cred = ovl_override_creds(file_inode(file)->i_sb); >> - ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, >> - ovl_iocb_to_rwf(iocb)); >> + if (is_sync_kiocb(iocb)) { >> + ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, >> + ovl_iocb_to_rwf(iocb)); >> + ovl_file_accessed(file); >> + fdput(real); >> + } else { >> + struct ovl_aio_req *aio_req = kmem_cache_alloc(ovl_aio_request_cachep, >> + GFP_NOFS); >> + aio_req->fd = real; >> + aio_req->orig_iocb = iocb; >> + kiocb_clone(&aio_req->iocb, iocb, real.file); >> + aio_req->iocb.ki_complete = ovl_aio_rw_complete; >> + ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter); >> + ovl_file_accessed(file); > > That should be done in completion/error > Refer to function generic_file_read_iter(), in direct IO path, file_accessed() is done before IO submission, so I think ovl_file_accessed() should be done here no matter completion/error or IO is queued. >> + if (ret != -EIOCBQUEUED) { >> + iocb->ki_pos = aio_req->iocb.ki_pos; >> + fdput(real); >> + kmem_cache_free(ovl_aio_request_cachep, aio_req); >> + } > > Suggest cleanup helper for completion/error > Yes, that will be more clearly. I will add a cleanup helper in version 2. > >> + } >> revert_creds(old_cred); >> >> - ovl_file_accessed(file); >> - >> - fdput(real); >> - >> return ret; >> } >> >> @@ -275,16 +312,32 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) >> >> old_cred = ovl_override_creds(file_inode(file)->i_sb); >> file_start_write(real.file); >> - ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, >> - ovl_iocb_to_rwf(iocb)); >> - file_end_write(real.file); >> + if (is_sync_kiocb(iocb)) { >> + ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, >> + ovl_iocb_to_rwf(iocb)); >> + file_end_write(real.file); >> + /* Update size */ >> + ovl_copyattr(ovl_inode_real(inode), inode); >> + fdput(real); >> + } else { >> + struct ovl_aio_req *aio_req = kmem_cache_alloc(ovl_aio_request_cachep, >> + GFP_NOFS); >> + aio_req->fd = real; >> + aio_req->orig_iocb = iocb; >> + kiocb_clone(&aio_req->iocb, iocb, real.file); >> + aio_req->iocb.ki_complete = ovl_aio_rw_complete; >> + ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter); >> + /* Update size */ >> + ovl_copyattr(ovl_inode_real(inode), inode); > > That should be in completion/error > Yes, I will move it to the newly added cleanup helper. Thanks, Jiufei > Thanks, > Amir. >