From: Zheng Liu <wenqing.lz@xxxxxxxxxx> Currently we check i_size in buffered read path after we know the page is update. But it could return some zero-filled pages to the userspace when we do some append dio writes. We could use the following code snippet to reproduce this problem in a ext2/3/4 filesystem. code snippet: #define _GNU_SOURCE #include <stdio.h> #include <stdlib.h> #include <string.h> #include <memory.h> #include <unistd.h> #include <fcntl.h> #include <sys/types.h> #include <sys/stat.h> #include <errno.h> #include <pthread.h> #define BUF_ALIGN 1024 struct writer_data { int fd; size_t blksize; char *buf; }; static void *writer(void *arg) { struct writer_data *data = (struct writer_data *)arg; int ret; ret = write(data->fd, data->buf, data->blksize); if (ret < 0) fprintf(stderr, "write file failed: %s\n", strerror(errno)); return NULL; } int main(int argc, char *argv[]) { pthread_t tid; struct writer_data wdata; size_t max_blocks = 10 * 1024; size_t blksize = 1 * 1024 * 1024; char *rbuf, *wbuf; int readfd, writefd; int i, j; if (argc < 2) { fprintf(stderr, "usage: %s [filename]\n", argv[0]); exit(1); } writefd = open(argv[1], O_CREAT|O_DIRECT|O_WRONLY|O_APPEND|O_TRUNC, S_IRWXU); if (writefd < 0) { fprintf(stderr, "failed to open wfile: %s\n", strerror(errno)); exit(1); } readfd = open(argv[1], O_DIRECT|O_RDONLY, S_IRWXU); if (readfd < 0) { fprintf(stderr, "failed to open rfile: %s\n", strerror(errno)); exit(1); } if (posix_memalign((void **)&wbuf, BUF_ALIGN, blksize)) { fprintf(stderr, "failed to alloc memory: %s\n", strerror(errno)); exit(1); } if (posix_memalign((void **)&rbuf, 4096, blksize)) { fprintf(stderr, "failed to alloc memory: %s\n", strerror(errno)); exit(1); } memset(wbuf, 'a', blksize); wdata.fd = writefd; wdata.blksize = blksize; wdata.buf = wbuf; for (i = 0; i < max_blocks; i++) { void *retval; int ret; ret = pthread_create(&tid, NULL, writer, &wdata); if (ret) { fprintf(stderr, "create thread failed: %s\n", strerror(errno)); exit(1); } memset(rbuf, 'b', blksize); do { ret = pread(readfd, rbuf, blksize, i * blksize); } while (ret <= 0); if (ret < 0) { fprintf(stderr, "read file failed: %s\n", strerror(errno)); exit(1); } if (pthread_join(tid, &retval)) { fprintf(stderr, "pthread join failed: %s\n", strerror(errno)); exit(1); } if (ret >= 0) { for (j = 0; j < ret; j++) { if (rbuf[j] != 'a') { fprintf(stderr, "encounter an error: offset %ld\n", i); goto err; } } } } err: free(wbuf); free(rbuf); return 0; } build & run: $ gcc code.c -o code -lpthread # build $ ./code ${filename} # run As we expected, we should read nothing or data with 'a'. But now we read data with '0'. I take a closer look at the code and it seems that there is a bug in vfs. Let me describe my found here. reader writer generic_file_aio_write() ->__generic_file_aio_write() ->generic_file_direct_write() generic_file_aio_read() ->do_generic_file_read() [fallback to buffered read] ->find_get_page() ->page_cache_sync_readahead() ->find_get_page() [in find_page label, we couldn't find a page before and after calling page_cache_sync_readahead(). So go to no_cached_page label] ->page_cache_alloc_cold() ->add_to_page_cache_lru() [in no_cached_page label, we alloc a page and goto readpage label.] ->aops->readpage() [in readpage label, readpage() callback is called and mpage_readpage() return a zero-filled page (e.g. ext3/4), and go to page_ok label] ->a_ops->direct_IO() ->i_size_write() [we enlarge the i_size] Here we check i_size [in page_ok label, we check i_size but it has been enlarged. Thus, we pass the check and return a zero-filled page] This commit first trims desc->count to avoid to read too much data if (*ppos + desc->count) >= i_size. Then we will return directly if desc->count == 0. After doing that, we could fix the above problem. Meanwhile this also can fix the problem that the reader does some buffered reads with append dio write. Cc: Christoph Hellwig <hch@xxxxxxxxxxxxx> Cc: Dmitry Monakhov <dmonakhov@xxxxxxxxxx> Cc: Dave Chinner <david@xxxxxxxxxxxxx> Cc: Alexander Viro <viro@xxxxxxxxxxxxxxxxxx> Reviewed-by: Jan Kara <jack@xxxxxxx> Signed-off-by: Zheng Liu <wenqing.lz@xxxxxxxxxx> --- changelog: v3: * Rebase against 3.13-rc3 * Add 'Reviewed-by' tag because Jan has reviewed this patch - Zheng mm/filemap.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index b7749a92021c..622d49ac2a24 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1108,18 +1108,25 @@ static void do_generic_file_read(struct file *filp, loff_t *ppos, pgoff_t prev_index; unsigned long offset; /* offset into pagecache page */ unsigned int prev_offset; + loff_t isize; int error; + /* we need to trim desc->count to avoid expose stale data to user */ + isize = i_size_read(inode); + if (*ppos + desc->count >= isize) + desc->count = isize - *ppos; index = *ppos >> PAGE_CACHE_SHIFT; prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT; prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1); last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; offset = *ppos & ~PAGE_CACHE_MASK; + if (desc->count == 0) + goto out; + for (;;) { struct page *page; pgoff_t end_index; - loff_t isize; unsigned long nr, ret; cond_resched(); -- 1.7.9.7 -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html