Re: [PATCH 6/6] Btrfs: do aio_write instead of write

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



at 2010-5-22 1:03, Josef Bacik wrote:
> In order for AIO to work, we need to implement aio_write.  This patch converts
> our btrfs_file_write to btrfs_aio_write.  I've tested this with xfstests and
> nothing broke, and the AIO stuff magically started working.  Thanks,

But xfstests's case 198(source: src/aio-dio-regress/aiodio_sparse2.c) still failed, 
following message outputted.
--------------------
AIO write offset 0 expected 65536 got -22
AIO write offset 5242880 expected 65536 got -22
AIO write offset 10485760 expected 65536 got -22
AIO write offset 15728640 expected 65536 got -22
AIO write offset 20971520 expected 65536 got -22
AIO write offset 31457280 expected 65536 got -22
AIO write offset 36700160 expected 65536 got -22
AIO write offset 41943040 expected 65536 got -22
AIO write offset 47185920 expected 65536 got -22
AIO write offset 52428800 expected 65536 got -22
AIO write offset 57671680 expected 65536 got -22
AIO write offset 62914560 expected 65536 got -22
AIO write offset 73400320 expected 65536 got -22
AIO write offset 78643200 expected 65536 got -22
non one buffer at buf[0] => 0x00,00,00,00
non-one read at offset 0
*** WARNING *** /tmp/aaaa has not been unlinked; if you don't rm it manually first, it may influence the next run
-------------------- 

generic_file_direct_write()(in btrfs_file_aio_write(), fs/btrfs/file.c) returned -22, 
maybe it's useful for your analysing.

Thanks.

> 
> Signed-off-by: Josef Bacik <josef@xxxxxxxxxx>
> ---
>  fs/btrfs/extent_io.c |   11 +++-
>  fs/btrfs/file.c      |  152 +++++++++++++++++++++++---------------------------
>  2 files changed, 80 insertions(+), 83 deletions(-)
> 
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index d2d0368..c407f1c 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -2020,6 +2020,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
>  	sector_t sector;
>  	struct extent_map *em;
>  	struct block_device *bdev;
> +	struct btrfs_ordered_extent *ordered;
>  	int ret;
>  	int nr = 0;
>  	size_t page_offset = 0;
> @@ -2031,7 +2032,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
>  	set_page_extent_mapped(page);
>  
>  	end = page_end;
> -	lock_extent(tree, start, end, GFP_NOFS);
> +	while (1) {
> +		lock_extent(tree, start, end, GFP_NOFS);
> +		ordered = btrfs_lookup_ordered_extent(inode, start);
> +		if (!ordered)
> +			break;
> +		unlock_extent(tree, start, end, GFP_NOFS);
> +		btrfs_start_ordered_extent(inode, ordered, 1);
> +		btrfs_put_ordered_extent(ordered);
> +	}
>  
>  	if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
>  		char *userpage;
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index dace07b..ce35431 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -46,32 +46,42 @@
>  static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
>  					 int write_bytes,
>  					 struct page **prepared_pages,
> -					 const char __user *buf)
> +					 struct iov_iter *i)
>  {
> -	long page_fault = 0;
> -	int i;
> +	size_t copied;
> +	int pg = 0;
>  	int offset = pos & (PAGE_CACHE_SIZE - 1);
>  
> -	for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) {
> +	while (write_bytes > 0) {
>  		size_t count = min_t(size_t,
>  				     PAGE_CACHE_SIZE - offset, write_bytes);
> -		struct page *page = prepared_pages[i];
> -		fault_in_pages_readable(buf, count);
> +		struct page *page = prepared_pages[pg];
> +again:
> +		if (unlikely(iov_iter_fault_in_readable(i, count)))
> +			return -EFAULT;
>  
>  		/* Copy data from userspace to the current page */
> -		kmap(page);
> -		page_fault = __copy_from_user(page_address(page) + offset,
> -					      buf, count);
> +		copied = iov_iter_copy_from_user(page, i, offset, count);
> +
>  		/* Flush processor's dcache for this page */
>  		flush_dcache_page(page);
> -		kunmap(page);
> -		buf += count;
> -		write_bytes -= count;
> +		iov_iter_advance(i, copied);
> +		write_bytes -= copied;
>  
> -		if (page_fault)
> -			break;
> +		if (unlikely(copied == 0)) {
> +			count = min_t(size_t, PAGE_CACHE_SIZE - offset,
> +				      iov_iter_single_seg_count(i));
> +			goto again;
> +		}
> +
> +		if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
> +			offset += copied;
> +		} else {
> +			pg++;
> +			offset = 0;
> +		}
>  	}
> -	return page_fault ? -EFAULT : 0;
> +	return 0;
>  }
>  
>  /*
> @@ -823,60 +833,24 @@ again:
>  	return 0;
>  }
>  
> -/* Copied from read-write.c */
> -static void wait_on_retry_sync_kiocb(struct kiocb *iocb)
> -{
> -	set_current_state(TASK_UNINTERRUPTIBLE);
> -	if (!kiocbIsKicked(iocb))
> -		schedule();
> -	else
> -		kiocbClearKicked(iocb);
> -	__set_current_state(TASK_RUNNING);
> -}
> -
> -/*
> - * Just a copy of what do_sync_write does.
> - */
> -static ssize_t __btrfs_direct_write(struct file *file, const char __user *buf,
> -				    size_t count, loff_t pos, loff_t *ppos)
> -{
> -	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count };
> -	unsigned long nr_segs = 1;
> -	struct kiocb kiocb;
> -	ssize_t ret;
> -
> -	init_sync_kiocb(&kiocb, file);
> -	kiocb.ki_pos = pos;
> -	kiocb.ki_left = count;
> -	kiocb.ki_nbytes = count;
> -
> -	while (1) {
> -		ret = generic_file_direct_write(&kiocb, &iov, &nr_segs, pos,
> -						ppos, count, count);
> -		if (ret != -EIOCBRETRY)
> -			break;
> -		wait_on_retry_sync_kiocb(&kiocb);
> -	}
> -
> -	if (ret == -EIOCBQUEUED)
> -		ret = wait_on_sync_kiocb(&kiocb);
> -	*ppos = kiocb.ki_pos;
> -	return ret;
> -}
> -
> -static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
> -				size_t count, loff_t *ppos)
> +static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
> +				    const struct iovec *iov,
> +				    unsigned long nr_segs, loff_t pos)
>  {
> -	loff_t pos;
> +	struct file *file = iocb->ki_filp;
> +	struct inode *inode = fdentry(file)->d_inode;
> +	struct btrfs_root *root = BTRFS_I(inode)->root;
> +	struct page *pinned[2];
> +	struct page **pages = NULL;
> +	struct iov_iter i;
> +	loff_t *ppos = &iocb->ki_pos;
>  	loff_t start_pos;
>  	ssize_t num_written = 0;
>  	ssize_t err = 0;
> +	size_t count;
> +	size_t ocount;
>  	int ret = 0;
> -	struct inode *inode = fdentry(file)->d_inode;
> -	struct btrfs_root *root = BTRFS_I(inode)->root;
> -	struct page **pages = NULL;
>  	int nrptrs;
> -	struct page *pinned[2];
>  	unsigned long first_index;
>  	unsigned long last_index;
>  	int will_write;
> @@ -888,7 +862,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
>  	pinned[0] = NULL;
>  	pinned[1] = NULL;
>  
> -	pos = *ppos;
>  	start_pos = pos;
>  
>  	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
> @@ -902,6 +875,11 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
>  
>  	mutex_lock(&inode->i_mutex);
>  
> +	err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
> +	if (err)
> +		goto out;
> +	count = ocount;
> +
>  	current->backing_dev_info = inode->i_mapping->backing_dev_info;
>  	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
>  	if (err)
> @@ -918,14 +896,24 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
>  	BTRFS_I(inode)->sequence++;
>  
>  	if (unlikely(file->f_flags & O_DIRECT)) {
> -		num_written = __btrfs_direct_write(file, buf, count, pos,
> -						   ppos);
> -		pos += num_written;
> -		count -= num_written;
> +		ret = btrfs_check_data_free_space(root, inode, count);
> +		if (ret)
> +			goto out;
>  
> -		/* We've written everything we wanted to, exit */
> -		if (num_written < 0 || !count)
> +		num_written = generic_file_direct_write(iocb, iov, &nr_segs,
> +							pos, ppos, count,
> +							ocount);
> +
> +		/* All reservations for DIO are done internally */
> +		btrfs_free_reserved_data_space(root, inode, count);
> +
> +		if (num_written < 0) {
> +			ret = num_written;
> +			num_written = 0;
> +			goto out;
> +		} else if (num_written == count) {
>  			goto out;
> +		}
>  
>  		/*
>  		 * We are going to do buffered for the rest of the range, so we
> @@ -933,18 +921,20 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
>  		 * done.
>  		 */
>  		buffered = 1;
> -		buf += num_written;
> +		pos += num_written;
>  	}
>  
> -	nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
> -		     PAGE_CACHE_SIZE / (sizeof(struct page *)));
> +	iov_iter_init(&i, iov, nr_segs, count, num_written);
> +	nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) /
> +		     PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
> +		     (sizeof(struct page *)));
>  	pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
>  
>  	/* generic_write_checks can change our pos */
>  	start_pos = pos;
>  
>  	first_index = pos >> PAGE_CACHE_SHIFT;
> -	last_index = (pos + count) >> PAGE_CACHE_SHIFT;
> +	last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;
>  
>  	/*
>  	 * there are lots of better ways to do this, but this code
> @@ -961,7 +951,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
>  			unlock_page(pinned[0]);
>  		}
>  	}
> -	if ((pos + count) & (PAGE_CACHE_SIZE - 1)) {
> +	if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
>  		pinned[1] = grab_cache_page(inode->i_mapping, last_index);
>  		if (!PageUptodate(pinned[1])) {
>  			ret = btrfs_readpage(NULL, pinned[1]);
> @@ -972,10 +962,10 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
>  		}
>  	}
>  
> -	while (count > 0) {
> +	while (iov_iter_count(&i) > 0) {
>  		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
> -		size_t write_bytes = min(count, nrptrs *
> -					(size_t)PAGE_CACHE_SIZE -
> +		size_t write_bytes = min(iov_iter_count(&i),
> +					 nrptrs * (size_t)PAGE_CACHE_SIZE -
>  					 offset);
>  		size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
>  					PAGE_CACHE_SHIFT;
> @@ -997,7 +987,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
>  		}
>  
>  		ret = btrfs_copy_from_user(pos, num_pages,
> -					   write_bytes, pages, buf);
> +					   write_bytes, pages, &i);
>  		if (ret) {
>  			btrfs_free_reserved_data_space(root, inode,
>  						       write_bytes);
> @@ -1026,8 +1016,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
>  			btrfs_throttle(root);
>  		}
>  
> -		buf += write_bytes;
> -		count -= write_bytes;
>  		pos += write_bytes;
>  		num_written += write_bytes;
>  
> @@ -1222,7 +1210,7 @@ const struct file_operations btrfs_file_operations = {
>  	.read		= do_sync_read,
>  	.aio_read       = generic_file_aio_read,
>  	.splice_read	= generic_file_splice_read,
> -	.write		= btrfs_file_write,
> +	.aio_write	= btrfs_file_aio_write,
>  	.mmap		= btrfs_file_mmap,
>  	.open		= generic_file_open,
>  	.release	= btrfs_release_file,

-- 
Shi Weihua
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [Samba]     [Device Mapper]     [CEPH Development]
  Powered by Linux