Re: [PATCH v2] vfs: fix a bug when we do some dio reads with append dio writes

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Fri 29-11-13 09:07:00, Zheng Liu wrote:
> From: Zheng Liu <wenqing.lz@xxxxxxxxxx>
> 
> Currently we check i_size in buffered read path after we know the page
> is update.  But it could return some zero-filled pages to the userspace
> when we do some append dio writes.  We could use the following code
> snippet to reproduce this problem in a ext2/3/4 filesystem.
> 
> code snippet:
>   #define _GNU_SOURCE
> 
>   #include <stdio.h>
>   #include <stdlib.h>
>   #include <string.h>
>   #include <memory.h>
> 
>   #include <unistd.h>
>   #include <fcntl.h>
>   #include <sys/types.h>
>   #include <sys/stat.h>
>   #include <errno.h>
> 
>   #include <pthread.h>
> 
>   #define BUF_ALIGN	1024
> 
>   struct writer_data {
>   	int fd;
>   	size_t blksize;
>   	char *buf;
>   };
> 
>   static void *writer(void *arg)
>   {
>   	struct writer_data *data = (struct writer_data *)arg;
>   	int ret;
> 
>   	ret = write(data->fd, data->buf, data->blksize);
>   	if (ret < 0)
>   		fprintf(stderr, "write file failed: %s\n", strerror(errno));
> 
>   	return NULL;
>   }
> 
>   int main(int argc, char *argv[])
>   {
>   	pthread_t tid;
>   	struct writer_data wdata;
>   	size_t max_blocks = 10 * 1024;
>   	size_t blksize = 1 * 1024 * 1024;
>   	char *rbuf, *wbuf;
>   	int readfd, writefd;
>   	int i, j;
> 
>   	if (argc < 2) {
>   		fprintf(stderr, "usage: %s [filename]\n", argv[0]);
>   		exit(1);
>   	}
> 
>   	writefd = open(argv[1], O_CREAT|O_DIRECT|O_WRONLY|O_APPEND|O_TRUNC, S_IRWXU);
>   	if (writefd < 0) {
>   		fprintf(stderr, "failed to open wfile: %s\n", strerror(errno));
>   		exit(1);
>   	}
>   	readfd = open(argv[1], O_DIRECT|O_RDONLY, S_IRWXU);
>   	if (readfd < 0) {
>   		fprintf(stderr, "failed to open rfile: %s\n", strerror(errno));
>   		exit(1);
>   	}
> 
>   	if (posix_memalign((void **)&wbuf, BUF_ALIGN, blksize)) {
>   		fprintf(stderr, "failed to alloc memory: %s\n", strerror(errno));
>   		exit(1);
>   	}
> 
>   	if (posix_memalign((void **)&rbuf, 4096, blksize)) {
>   		fprintf(stderr, "failed to alloc memory: %s\n", strerror(errno));
>   		exit(1);
>   	}
> 
>   	memset(wbuf, 'a', blksize);
> 
>   	wdata.fd = writefd;
>   	wdata.blksize = blksize;
>   	wdata.buf = wbuf;
> 
>   	for (i = 0; i < max_blocks; i++) {
>   		void *retval;
>   		int ret;
> 
>   		ret = pthread_create(&tid, NULL, writer, &wdata);
>   		if (ret) {
>   			fprintf(stderr, "create thread failed: %s\n", strerror(errno));
>   			exit(1);
>   		}
> 
>   		memset(rbuf, 'b', blksize);
>   		do {
>   			ret = pread(readfd, rbuf, blksize, i * blksize);
>   		} while (ret <= 0);
> 
>   		if (ret < 0) {
>   			fprintf(stderr, "read file failed: %s\n", strerror(errno));
>   			exit(1);
>   		}
> 
>   		if (pthread_join(tid, &retval)) {
>   			fprintf(stderr, "pthread join failed: %s\n", strerror(errno));
>   			exit(1);
>   		}
> 
>   		if (ret >= 0) {
>   			for (j = 0; j < ret; j++) {
>   				if (rbuf[j] != 'a') {
>   					fprintf(stderr, "encounter an error: offset %ld\n",
>   						i);
>   					goto err;
>   				}
>   			}
>   		}
>   	}
> 
>   err:
>   	free(wbuf);
>   	free(rbuf);
> 
>   	return 0;
>   }
> 
> build & run:
>   $ gcc code.c -o code -lpthread # build
>   $ ./code ${filename} # run
> 
> As we expected, we should read nothing or data with 'a'.  But now we
> read data with '0'.  I take a closer look at the code and it seems that
> there is a bug in vfs.  Let me describe my found here.
> 
>   reader					writer
>                                                 generic_file_aio_write()
>                                                 ->__generic_file_aio_write()
>                                                   ->generic_file_direct_write()
>   generic_file_aio_read()
>   ->do_generic_file_read()
>     [fallback to buffered read]
> 
>     ->find_get_page()
>     ->page_cache_sync_readahead()
>     ->find_get_page()
>     [in find_page label, we couldn't find a
>      page before and after calling
>      page_cache_sync_readahead().  So go to
>      no_cached_page label]
> 
>     ->page_cache_alloc_cold()
>     ->add_to_page_cache_lru()
>     [in no_cached_page label, we alloc a page
>      and goto readpage label.]
> 
>     ->aops->readpage()
>     [in readpage label, readpage() callback
>      is called and mpage_readpage() return a
>      zero-filled page (e.g. ext3/4), and go
>      to page_ok label]
> 
>                                                   ->a_ops->direct_IO()
>                                                   ->i_size_write()
>                                                   [we enlarge the i_size]
> 
>     Here we check i_size
>     [in page_ok label, we check i_size but
>      it has been enlarged.  Thus, we pass
>      the check and return a zero-filled page]
> 
> This commit first trims desc->count to avoid to read too much data if
> (*ppos + desc->count) >= i_size.  Then we will return directly if
> desc->count == 0.  After doing that, we could fix the above problem.
> Meanwhile this also can fix the problem that the reader does some
> buffered reads with append dio write.
> 
> Cc: Christoph Hellwig <hch@xxxxxxxxxxxxx>
> Cc: Dmitry Monakhov <dmonakhov@xxxxxxxxxx>
> Cc: Dave Chinner <david@xxxxxxxxxxxxx>
> Cc: Jan Kara <jack@xxxxxxx>
> Cc: Alexander Viro <viro@xxxxxxxxxxxxxxxxxx>
> Signed-off-by: Zheng Liu <wenqing.lz@xxxxxxxxxx>
  The patch looks good. You can add:
Reviewed-by: Jan Kara <jack@xxxxxxx>

								Honza

> ---
> changelog v2:
>  * Another approach to solve this problem
> 
>  mm/filemap.c |    9 ++++++++-
>  1 file changed, 8 insertions(+), 1 deletion(-)
> 
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 1e6aec4..eb55539 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -1109,18 +1109,25 @@ static void do_generic_file_read(struct file *filp, loff_t *ppos,
>  	pgoff_t prev_index;
>  	unsigned long offset;      /* offset into pagecache page */
>  	unsigned int prev_offset;
> +	loff_t isize;
>  	int error;
>  
> +	/* we need to trim desc->count to avoid expose stale data to user */
> +	isize = i_size_read(inode);
> +	if (*ppos + desc->count >= isize)
> +		desc->count = isize - *ppos;
>  	index = *ppos >> PAGE_CACHE_SHIFT;
>  	prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
>  	prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
>  	last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
>  	offset = *ppos & ~PAGE_CACHE_MASK;
>  
> +	if (desc->count == 0)
> +		goto out;
> +
>  	for (;;) {
>  		struct page *page;
>  		pgoff_t end_index;
> -		loff_t isize;
>  		unsigned long nr, ret;
>  
>  		cond_resched();
> -- 
> 1.7.9.7
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
-- 
Jan Kara <jack@xxxxxxx>
SUSE Labs, CR
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html




[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [Samba]     [Device Mapper]     [CEPH Development]
  Powered by Linux