On 6/19/20 8:50 AM, Matthew Wilcox wrote: > This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS. > The advantage of this patch is that we can avoid taking any filesystem > lock, as long as the pages being accessed are in the cache (and we don't > need to readahead any pages into the cache). We also avoid an indirect > function call in these cases. > > I'm sure reusing the name call_read_iter() is the wrong way to go about > this, but renaming all the callers would make this a larger patch. > I'm happy to do it if something like this stands a chance of being > accepted. > > Compared to Andreas' patch, I removed the -ECANCELED return value. > We can happily return 0 from generic_file_buffered_read() and it's less > code to handle that. I bypass the attempt to read from the page cache > for O_DIRECT reads, and for inodes which have no cached pages. Hopefully > this will avoid calling generic_file_buffered_read() for drivers which > implement read_iter() (although I haven't audited them all to check that). > > This could go horribly wrong if filesystems rely on doing work in their > ->read_iter implementation (eg checking i_size after acquiring their > lock) instead of keeping the page cache uptodate. On the other hand, > the ->map_pages() method is already called without locks, so filesystems > should already be prepared for this. > > Arguably we could do something similar for writes. I'm a little more > scared of that patch since filesystems are more likely to want to do > things to keep their files in sync for writes. I did some testing with the NVMeOF target file backend with buffered I/O enabled, applying your patch and setting IOCB_CACHED, OR'd ('|') with IOCB_NOWAIT, for each I/O, calling call_read_iter_cached() [1]. (The name was changed from call_read_iter() -> call_read_iter_cached() [2].) For the file system I've used XFS, and the device was null_blk with memory backing, so the entire file was cached in DRAM. 
Following are the performance numbers :- IOPS/Bandwidth :- default-page-cache: read: IOPS=1389k, BW=5424MiB/s (5688MB/s) default-page-cache: read: IOPS=1381k, BW=5395MiB/s (5657MB/s) default-page-cache: read: IOPS=1391k, BW=5432MiB/s (5696MB/s) iocb-cached-page-cache: read: IOPS=1403k, BW=5481MiB/s (5747MB/s) iocb-cached-page-cache: read: IOPS=1393k, BW=5439MiB/s (5704MB/s) iocb-cached-page-cache: read: IOPS=1399k, BW=5465MiB/s (5731MB/s) Submission lat :- default-page-cache: slat (usec): min=2, max=1076, avg= 3.71, default-page-cache: slat (usec): min=2, max=489, avg= 3.72, default-page-cache: slat (usec): min=2, max=1078, avg= 3.70, iocb-cached-page-cache: slat (usec): min=2, max=1731, avg= 3.70, iocb-cached-page-cache: slat (usec): min=2, max=2115, avg= 3.69, iocb-cached-page-cache: slat (usec): min=2, max=3055, avg= 3.70, CPU :- default-page-cache: cpu : usr=10.36%, sys=49.29%, ctx=5207722, default-page-cache: cpu : usr=10.49%, sys=49.15%, ctx=5179714, default-page-cache: cpu : usr=10.56%, sys=49.22%, ctx=5215474, iocb-cached-page-cache: cpu : usr=10.26%, sys=49.53%, ctx=5262214, iocb-cached-page-cache: cpu : usr=10.43%, sys=49.35%, ctx=5222433, iocb-cached-page-cache: cpu : usr=10.47%, sys=49.59%, ctx=5247344, Regards, Chaitanya [1] diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c index 0abbefd9925e..02fa272399b6 100644 --- a/drivers/nvme/target/io-cmd-file.c +++ b/drivers/nvme/target/io-cmd-file.c @@ -120,6 +120,9 @@ static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos, iocb->ki_filp = req->ns->file; iocb->ki_flags = ki_flags | iocb_flags(req->ns->file); + if (rw == READ) + return call_read_iter_cached(req->ns->file, iocb, &iter); + return call_iter(iocb, &iter); } @@ -264,7 +267,8 @@ static void nvmet_file_execute_rw(struct nvmet_req *req) if (req->ns->buffered_io) { if (likely(!req->f.mpool_alloc) && - nvmet_file_execute_io(req, IOCB_NOWAIT)) + nvmet_file_execute_io(req, + IOCB_NOWAIT |IOCB_CACHED)) 
return; nvmet_file_submit_buffered_io(req); [2] diff --git a/fs/read_write.c b/fs/read_write.c index bbfa9b12b15e..d4bf2581ff0b 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -401,6 +401,41 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t read_write == READ ? MAY_READ : MAY_WRITE); } +ssize_t call_read_iter_cached(struct file *file, struct kiocb *iocb, + struct iov_iter *iter) +{ + ssize_t written, ret = 0; + + if (iocb->ki_flags & IOCB_DIRECT) + goto uncached; + if (!file->f_mapping->nrpages) + goto uncached; + + iocb->ki_flags |= IOCB_CACHED; + ret = generic_file_buffered_read(iocb, iter, 0); + iocb->ki_flags &= ~IOCB_CACHED; + + if (likely(!iov_iter_count(iter))) + return ret; + + if (ret == -EAGAIN) { + if (iocb->ki_flags & IOCB_NOWAIT) + return ret; + ret = 0; + } else if (ret < 0) { + return ret; + } + +uncached: + written = ret; + + ret = file->f_op->read_iter(iocb, iter); + if (ret > 0) + written += ret; + + return written ? written : ret; +} + static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) { struct iovec iov = { .iov_base = buf, .iov_len = len }; diff --git a/include/linux/fs.h b/include/linux/fs.h index 6c4ab4dc1cd7..3fc8b00cd140 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -315,6 +315,7 @@ enum rw_hint { #define IOCB_SYNC (1 << 5) #define IOCB_WRITE (1 << 6) #define IOCB_NOWAIT (1 << 7) +#define IOCB_CACHED (1 << 8) struct kiocb { struct file *ki_filp; @@ -1901,6 +1902,8 @@ static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio, return file->f_op->read_iter(kio, iter); } +ssize_t call_read_iter_cached(struct file *, struct kiocb *, struct iov_iter *); + static inline ssize_t call_write_iter(struct file *file, struct kiocb *kio, struct iov_iter *iter) { diff --git a/mm/filemap.c b/mm/filemap.c index f0ae9a6308cb..4ee97941a1f2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2028,7 +2028,7 @@ ssize_t generic_file_buffered_read(struct 
kiocb *iocb, page = find_get_page(mapping, index); if (!page) { - if (iocb->ki_flags & IOCB_NOWAIT) + if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_CACHED)) goto would_block; page_cache_sync_readahead(mapping, ra, filp, @@ -2038,12 +2038,16 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, goto no_cached_page; } if (PageReadahead(page)) { + if (iocb->ki_flags & IOCB_CACHED) { + put_page(page); + goto out; + } page_cache_async_readahead(mapping, ra, filp, page, index, last_index - index); } if (!PageUptodate(page)) { - if (iocb->ki_flags & IOCB_NOWAIT) { + if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_CACHED)) { put_page(page); goto would_block; } -- 2.26.0