At 2010-5-22 1:03, Josef Bacik wrote: > In order for AIO to work, we need to implement aio_write. This patch converts > our btrfs_file_write to btrfs_aio_write. I've tested this with xfstests and > nothing broke, and the AIO stuff magically started working. Thanks, However, xfstests case 198 (source: src/aio-dio-regress/aiodio_sparse2.c) still fails with this patch applied. It prints the following output: -------------------- AIO write offset 0 expected 65536 got -22 AIO write offset 5242880 expected 65536 got -22 AIO write offset 10485760 expected 65536 got -22 AIO write offset 15728640 expected 65536 got -22 AIO write offset 20971520 expected 65536 got -22 AIO write offset 31457280 expected 65536 got -22 AIO write offset 36700160 expected 65536 got -22 AIO write offset 41943040 expected 65536 got -22 AIO write offset 47185920 expected 65536 got -22 AIO write offset 52428800 expected 65536 got -22 AIO write offset 57671680 expected 65536 got -22 AIO write offset 62914560 expected 65536 got -22 AIO write offset 73400320 expected 65536 got -22 AIO write offset 78643200 expected 65536 got -22 non one buffer at buf[0] => 0x00,00,00,00 non-one read at offset 0 *** WARNING *** /tmp/aaaa has not been unlinked; if you don't rm it manually first, it may influence the next run -------------------- The -22 (-EINVAL) comes from generic_file_direct_write(), called from btrfs_file_aio_write() in fs/btrfs/file.c; this may be useful for your analysis. Thanks.
> > Signed-off-by: Josef Bacik <josef@xxxxxxxxxx> > --- > fs/btrfs/extent_io.c | 11 +++- > fs/btrfs/file.c | 152 +++++++++++++++++++++++--------------------------- > 2 files changed, 80 insertions(+), 83 deletions(-) > > diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c > index d2d0368..c407f1c 100644 > --- a/fs/btrfs/extent_io.c > +++ b/fs/btrfs/extent_io.c > @@ -2020,6 +2020,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, > sector_t sector; > struct extent_map *em; > struct block_device *bdev; > + struct btrfs_ordered_extent *ordered; > int ret; > int nr = 0; > size_t page_offset = 0; > @@ -2031,7 +2032,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree, > set_page_extent_mapped(page); > > end = page_end; > - lock_extent(tree, start, end, GFP_NOFS); > + while (1) { > + lock_extent(tree, start, end, GFP_NOFS); > + ordered = btrfs_lookup_ordered_extent(inode, start); > + if (!ordered) > + break; > + unlock_extent(tree, start, end, GFP_NOFS); > + btrfs_start_ordered_extent(inode, ordered, 1); > + btrfs_put_ordered_extent(ordered); > + } > > if (page->index == last_byte >> PAGE_CACHE_SHIFT) { > char *userpage; > diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c > index dace07b..ce35431 100644 > --- a/fs/btrfs/file.c > +++ b/fs/btrfs/file.c > @@ -46,32 +46,42 @@ > static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, > int write_bytes, > struct page **prepared_pages, > - const char __user *buf) > + struct iov_iter *i) > { > - long page_fault = 0; > - int i; > + size_t copied; > + int pg = 0; > int offset = pos & (PAGE_CACHE_SIZE - 1); > > - for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) { > + while (write_bytes > 0) { > size_t count = min_t(size_t, > PAGE_CACHE_SIZE - offset, write_bytes); > - struct page *page = prepared_pages[i]; > - fault_in_pages_readable(buf, count); > + struct page *page = prepared_pages[pg]; > +again: > + if (unlikely(iov_iter_fault_in_readable(i, count))) > + return 
-EFAULT; > > /* Copy data from userspace to the current page */ > - kmap(page); > - page_fault = __copy_from_user(page_address(page) + offset, > - buf, count); > + copied = iov_iter_copy_from_user(page, i, offset, count); > + > /* Flush processor's dcache for this page */ > flush_dcache_page(page); > - kunmap(page); > - buf += count; > - write_bytes -= count; > + iov_iter_advance(i, copied); > + write_bytes -= copied; > > - if (page_fault) > - break; > + if (unlikely(copied == 0)) { > + count = min_t(size_t, PAGE_CACHE_SIZE - offset, > + iov_iter_single_seg_count(i)); > + goto again; > + } > + > + if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { > + offset += copied; > + } else { > + pg++; > + offset = 0; > + } > } > - return page_fault ? -EFAULT : 0; > + return 0; > } > > /* > @@ -823,60 +833,24 @@ again: > return 0; > } > > -/* Copied from read-write.c */ > -static void wait_on_retry_sync_kiocb(struct kiocb *iocb) > -{ > - set_current_state(TASK_UNINTERRUPTIBLE); > - if (!kiocbIsKicked(iocb)) > - schedule(); > - else > - kiocbClearKicked(iocb); > - __set_current_state(TASK_RUNNING); > -} > - > -/* > - * Just a copy of what do_sync_write does. 
> - */ > -static ssize_t __btrfs_direct_write(struct file *file, const char __user *buf, > - size_t count, loff_t pos, loff_t *ppos) > -{ > - struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; > - unsigned long nr_segs = 1; > - struct kiocb kiocb; > - ssize_t ret; > - > - init_sync_kiocb(&kiocb, file); > - kiocb.ki_pos = pos; > - kiocb.ki_left = count; > - kiocb.ki_nbytes = count; > - > - while (1) { > - ret = generic_file_direct_write(&kiocb, &iov, &nr_segs, pos, > - ppos, count, count); > - if (ret != -EIOCBRETRY) > - break; > - wait_on_retry_sync_kiocb(&kiocb); > - } > - > - if (ret == -EIOCBQUEUED) > - ret = wait_on_sync_kiocb(&kiocb); > - *ppos = kiocb.ki_pos; > - return ret; > -} > - > -static ssize_t btrfs_file_write(struct file *file, const char __user *buf, > - size_t count, loff_t *ppos) > +static ssize_t btrfs_file_aio_write(struct kiocb *iocb, > + const struct iovec *iov, > + unsigned long nr_segs, loff_t pos) > { > - loff_t pos; > + struct file *file = iocb->ki_filp; > + struct inode *inode = fdentry(file)->d_inode; > + struct btrfs_root *root = BTRFS_I(inode)->root; > + struct page *pinned[2]; > + struct page **pages = NULL; > + struct iov_iter i; > + loff_t *ppos = &iocb->ki_pos; > loff_t start_pos; > ssize_t num_written = 0; > ssize_t err = 0; > + size_t count; > + size_t ocount; > int ret = 0; > - struct inode *inode = fdentry(file)->d_inode; > - struct btrfs_root *root = BTRFS_I(inode)->root; > - struct page **pages = NULL; > int nrptrs; > - struct page *pinned[2]; > unsigned long first_index; > unsigned long last_index; > int will_write; > @@ -888,7 +862,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, > pinned[0] = NULL; > pinned[1] = NULL; > > - pos = *ppos; > start_pos = pos; > > vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); > @@ -902,6 +875,11 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, > > mutex_lock(&inode->i_mutex); > > + err = 
generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); > + if (err) > + goto out; > + count = ocount; > + > current->backing_dev_info = inode->i_mapping->backing_dev_info; > err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); > if (err) > @@ -918,14 +896,24 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, > BTRFS_I(inode)->sequence++; > > if (unlikely(file->f_flags & O_DIRECT)) { > - num_written = __btrfs_direct_write(file, buf, count, pos, > - ppos); > - pos += num_written; > - count -= num_written; > + ret = btrfs_check_data_free_space(root, inode, count); > + if (ret) > + goto out; > > - /* We've written everything we wanted to, exit */ > - if (num_written < 0 || !count) > + num_written = generic_file_direct_write(iocb, iov, &nr_segs, > + pos, ppos, count, > + ocount); > + > + /* All reservations for DIO are done internally */ > + btrfs_free_reserved_data_space(root, inode, count); > + > + if (num_written < 0) { > + ret = num_written; > + num_written = 0; > + goto out; > + } else if (num_written == count) { > goto out; > + } > > /* > * We are going to do buffered for the rest of the range, so we > @@ -933,18 +921,20 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, > * done. 
> */ > buffered = 1; > - buf += num_written; > + pos += num_written; > } > > - nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE, > - PAGE_CACHE_SIZE / (sizeof(struct page *))); > + iov_iter_init(&i, iov, nr_segs, count, num_written); > + nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) / > + PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / > + (sizeof(struct page *))); > pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); > > /* generic_write_checks can change our pos */ > start_pos = pos; > > first_index = pos >> PAGE_CACHE_SHIFT; > - last_index = (pos + count) >> PAGE_CACHE_SHIFT; > + last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT; > > /* > * there are lots of better ways to do this, but this code > @@ -961,7 +951,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, > unlock_page(pinned[0]); > } > } > - if ((pos + count) & (PAGE_CACHE_SIZE - 1)) { > + if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) { > pinned[1] = grab_cache_page(inode->i_mapping, last_index); > if (!PageUptodate(pinned[1])) { > ret = btrfs_readpage(NULL, pinned[1]); > @@ -972,10 +962,10 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, > } > } > > - while (count > 0) { > + while (iov_iter_count(&i) > 0) { > size_t offset = pos & (PAGE_CACHE_SIZE - 1); > - size_t write_bytes = min(count, nrptrs * > - (size_t)PAGE_CACHE_SIZE - > + size_t write_bytes = min(iov_iter_count(&i), > + nrptrs * (size_t)PAGE_CACHE_SIZE - > offset); > size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >> > PAGE_CACHE_SHIFT; > @@ -997,7 +987,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, > } > > ret = btrfs_copy_from_user(pos, num_pages, > - write_bytes, pages, buf); > + write_bytes, pages, &i); > if (ret) { > btrfs_free_reserved_data_space(root, inode, > write_bytes); > @@ -1026,8 +1016,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, > btrfs_throttle(root); > } > > - 
buf += write_bytes; > - count -= write_bytes; > pos += write_bytes; > num_written += write_bytes; > > @@ -1222,7 +1210,7 @@ const struct file_operations btrfs_file_operations = { > .read = do_sync_read, > .aio_read = generic_file_aio_read, > .splice_read = generic_file_splice_read, > - .write = btrfs_file_write, > + .aio_write = btrfs_file_aio_write, > .mmap = btrfs_file_mmap, > .open = generic_file_open, > .release = btrfs_release_file, -- Shi Weihua -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html