On Fri, May 26, 2023 at 12:55:44AM -0700, Luis Chamberlain wrote:
> Future work:
>
> o shmem_file_read_iter()

And as for this, this is what I'm up to, but for the life of me I
can't figure out why I end up with an empty new line at the end of my
test with this. The test is the same simple one described in the patch
"shmem: add support to customize block size order". I end up with:

root@iomap ~ # ./run.sh
2dcc06b7ca3b7dd8b5626af83c1be3cb08ddc76c  /root/ordered.txt
a0466a798f2d967c143f0f716c344660dc360f78  /data-tmpfs/ordered.txt
  File: /data-tmpfs/ordered.txt
  Size: 6888896   	Blocks: 16384      IO Block: 4194304 regular file
Device: 0,44	Inode: 2           Links: 1
Access: (0644/-rw-r--r--)  Uid: (    0/    root)   Gid: (    0/    root)
Access: 2023-05-26 01:06:15.566330524 -0700
Modify: 2023-05-26 01:06:15.554330477 -0700
Change: 2023-05-26 01:06:15.554330477 -0700
 Birth: 2023-05-26 01:06:15.534330399 -0700
root@iomap ~ # diff -u /root/ordered.txt /data-tmpfs/ordered.txt
--- /root/ordered.txt	2023-05-25 16:50:53.755019418 -0700
+++ /data-tmpfs/ordered.txt	2023-05-26 01:06:15.554330477 -0700
@@ -999998,3 +999998,4 @@
 999998
 999999
 1000000
+
\ No newline at end of file
root@iomap ~ # cat run.sh
#!/bin/bash
# time for i in $(seq 1 1000000); do echo $i >> /root/ordered.txt; done
sha1sum /root/ordered.txt
mount -t tmpfs -o size=8M,border=22 -o noswap tmpfs /data-tmpfs/
cp /root/ordered.txt /data-tmpfs/
sha1sum /data-tmpfs/ordered.txt
stat /data-tmpfs/ordered.txt
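Before the patch, a quick sanity check on the stat output above: if
border=22 is taken as a block size order, as in the "shmem: add support
to customize block size order" patch, the numbers are self-consistent.
A small userspace sketch of the arithmetic (not kernel code):

#include <stdio.h>

/*
 * With a block size order of 22, 1 << 22 matches the reported
 * "IO Block", and the 6888896-byte file rounds up to two 4 MiB
 * blocks, i.e. 16384 512-byte sectors, matching "Blocks: 16384".
 */
int main(void)
{
	unsigned long bsize = 1UL << 22;	/* 4194304: the IO Block */
	unsigned long i_size = 6888896;		/* Size: from stat above */
	unsigned long nblocks = (i_size + bsize - 1) / bsize;
	unsigned long sectors = nblocks * (bsize / 512);

	/* Prints: bsize=4194304 nblocks=2 sectors=16384 */
	printf("bsize=%lu nblocks=%lu sectors=%lu\n", bsize, nblocks, sectors);
	return 0;
}

So the file spans exactly two folios, with only the first 2694592 bytes
of the second one holding valid data.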
>From 61008f03217b1524da317928885ef68a67abc773 Mon Sep 17 00:00:00 2001
From: Luis Chamberlain <mcgrof@xxxxxxxxxx>
Date: Wed, 19 Apr 2023 20:42:54 -0700
Subject: [PATCH] shmem: convert shmem_file_read_iter() to folios

Signed-off-by: Luis Chamberlain <mcgrof@xxxxxxxxxx>
---
 mm/shmem.c | 74 +++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 56 insertions(+), 18 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index 777e953df62e..2d3512f6dd30 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2431,6 +2431,10 @@ static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block
 	inode->i_ino = ino;
 	inode_init_owner(idmap, inode, dir, mode);
 	inode->i_blocks = 0;
+	if (sb->s_flags & SB_KERNMOUNT)
+		inode->i_blkbits = PAGE_SHIFT;
+	else
+		inode->i_blkbits = sb->s_blocksize_bits;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
 	inode->i_generation = get_random_u32();
 	info = SHMEM_I(inode);
@@ -2676,19 +2680,42 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file_inode(file);
 	struct address_space *mapping = inode->i_mapping;
+	struct super_block *sb = inode->i_sb;
+	u64 bsize = i_blocksize(inode);
 	pgoff_t index;
 	unsigned long offset;
 	int error = 0;
 	ssize_t retval = 0;
 	loff_t *ppos = &iocb->ki_pos;
 
+	/*
+	 * Although our index is page specific, we can read a blocksize at a
+	 * time as we use a folio per block.
+	 */
 	index = *ppos >> PAGE_SHIFT;
-	offset = *ppos & ~PAGE_MASK;
+
+	/*
+	 * We're going to read a folio at a time of size blocksize.
+	 *
+	 * The offset represents the position in the folio we are currently
+	 * reading from. It starts off at the offset position in the first
+	 * folio where we were asked to start our read. It later gets
+	 * incremented by the number of bytes we read per folio. After the
+	 * first folio is read, offset would be 0, as we then start reading
+	 * the next folio at offset 0, and we'd read a full blocksize at a
+	 * time until we're done.
+	 */
+	offset = *ppos & (bsize - 1);
 
 	for (;;) {
 		struct folio *folio = NULL;
-		struct page *page = NULL;
 		pgoff_t end_index;
+		/*
+		 * nr represents the number of bytes we can read per folio,
+		 * and this will depend on the blocksize set. On the last
+		 * folio, nr represents how much data in the last folio is
+		 * valid to be read for this inode.
+		 */
 		unsigned long nr, ret;
 		loff_t i_size = i_size_read(inode);
 
@@ -2696,7 +2723,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 		if (index > end_index)
 			break;
 		if (index == end_index) {
-			nr = i_size & ~PAGE_MASK;
+			nr = i_size & (bsize - 1);
 			if (nr <= offset)
 				break;
 		}
@@ -2709,9 +2736,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 		}
 		if (folio) {
 			folio_unlock(folio);
-
-			page = folio_file_page(folio, index);
-			if (PageHWPoison(page)) {
+			if (is_folio_hwpoison(folio)) {
 				folio_put(folio);
 				error = -EIO;
 				break;
@@ -2722,49 +2747,56 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 		/*
 		 * We must evaluate after, since reads (unlike writes)
 		 * are called without i_rwsem protection against truncate
 		 */
-		nr = PAGE_SIZE;
+		nr = bsize;
+		WARN_ON(!(sb->s_flags & SB_KERNMOUNT) && folio && bsize != folio_size(folio));
 		i_size = i_size_read(inode);
 		end_index = i_size >> PAGE_SHIFT;
 		if (index == end_index) {
-			nr = i_size & ~PAGE_MASK;
+			nr = i_size & (bsize - 1);
 			if (nr <= offset) {
 				if (folio)
 					folio_put(folio);
 				break;
 			}
 		}
+
+		/*
+		 * On the first folio read the number of bytes we can read
+		 * will be blocksize - offset. On subsequent reads we can
+		 * read a blocksize at a time until iov_iter_count(to) == 0.
+		 */
 		nr -= offset;
 
 		if (folio) {
 			/*
-			 * If users can be writing to this page using arbitrary
+			 * If users can be writing to this folio using arbitrary
 			 * virtual addresses, take care about potential aliasing
-			 * before reading the page on the kernel side.
+			 * before reading the folio on the kernel side.
 			 */
 			if (mapping_writably_mapped(mapping))
-				flush_dcache_page(page);
+				flush_dcache_folio(folio);
 			/*
-			 * Mark the page accessed if we read the beginning.
+			 * Mark the folio accessed if we read the beginning.
 			 */
 			if (!offset)
 				folio_mark_accessed(folio);
 			/*
-			 * Ok, we have the page, and it's up-to-date, so
+			 * Ok, we have the folio, and it's up-to-date, so
 			 * now we can copy it to user space...
 			 */
-			ret = copy_page_to_iter(page, offset, nr, to);
+			ret = copy_folio_to_iter(folio, offset, nr, to);
 			folio_put(folio);
 		} else if (user_backed_iter(to)) {
 			/*
 			 * Copy to user tends to be so well optimized, but
 			 * clear_user() not so much, that it is noticeably
-			 * faster to copy the zero page instead of clearing.
+			 * faster to copy the zero folio instead of clearing.
 			 */
-			ret = copy_page_to_iter(ZERO_PAGE(0), offset, nr, to);
+			ret = copy_folio_to_iter(page_folio(ZERO_PAGE(0)), offset, nr, to);
 		} else {
 			/*
-			 * But submitting the same page twice in a row to
+			 * But submitting the same folio twice in a row to
 			 * splice() - or others? - can result in confusion:
 			 * so don't attempt that optimization on pipes etc.
 			 */
@@ -2773,8 +2805,14 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 
 		retval += ret;
 		offset += ret;
+
+		/*
+		 * Due to our use of a folio per blocksize, we know this will
+		 * actually read a blocksize at a time after the first block
+		 * read at offset.
+		 */
 		index += offset >> PAGE_SHIFT;
-		offset &= ~PAGE_MASK;
+		offset &= (bsize - 1);
 
 		if (!iov_iter_count(to))
 			break;
-- 
2.39.2
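PS: to spell out the offset/index bookkeeping described in the comments
above, here is a toy userspace replay of just that arithmetic, assuming
PAGE_SHIFT == 12 and the 4 MiB block size from the test. This is only
an illustration of the intended behaviour, not the kernel code:

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	unsigned long long bsize = 1ULL << 22;	/* border=22: 4 MiB blocks */
	unsigned long long ppos = 5000;		/* arbitrary start position */
	unsigned long long want = 2 * bsize;	/* bytes the caller asked for */
	unsigned long long index = ppos >> PAGE_SHIFT;
	unsigned long long offset = ppos & (bsize - 1);

	while (want) {
		/* First folio: bsize - offset bytes; then bsize per folio. */
		unsigned long long nr = bsize - offset;

		if (nr > want)
			nr = want;
		printf("index=%llu offset=%llu nr=%llu\n", index, offset, nr);

		want -= nr;
		offset += nr;
		index += offset >> PAGE_SHIFT;	/* jump a whole block of pages */
		offset &= (bsize - 1);		/* 0 from the second folio on */
	}
	return 0;
}

After the first partial folio, offset collapses to 0 and index advances
by bsize >> PAGE_SHIFT pages per iteration; that is the invariant the
loop comments rely on.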