From: Peng Tao <bergwolf@xxxxxxxxx> Do not call pnfs_update_layout at write_begin, and rewrite bl_write_pagelist to handle EOF there. Signed-off-by: Peng Tao <peng_tao@xxxxxxx> --- fs/nfs/blocklayout/blocklayout.c | 652 ++++++++++++++++++-------------------- fs/nfs/file.c | 26 +-- fs/nfs/pnfs.c | 41 --- fs/nfs/pnfs.h | 115 ------- fs/nfs/write.c | 12 +- include/linux/nfs_fs.h | 3 +- 6 files changed, 321 insertions(+), 528 deletions(-) diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 8531fd7..331d687 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -32,8 +32,8 @@ #include <linux/module.h> #include <linux/init.h> -#include <linux/buffer_head.h> /* various write calls */ -#include <linux/bio.h> /* struct bio */ +#include <linux/buffer_head.h> /* various write calls */ +#include <linux/bio.h> /* struct bio */ #include <linux/vmalloc.h> #include "blocklayout.h" @@ -75,16 +75,11 @@ static int is_hole(struct pnfs_block_extent *be, sector_t isect) */ static int is_writable(struct pnfs_block_extent *be, sector_t isect) { - if (be->be_state == PNFS_BLOCK_READWRITE_DATA) - return 1; - else if (be->be_state != PNFS_BLOCK_INVALID_DATA) - return 0; - else - return is_sector_initialized(be->be_inval, isect); + return (be->be_state == PNFS_BLOCK_READWRITE_DATA || + be->be_state == PNFS_BLOCK_INVALID_DATA); } -static int -dont_like_caller(struct nfs_page *req) +static int dont_like_caller(struct nfs_page *req) { if (atomic_read(&req->wb_complete)) { /* Called by _multi */ @@ -109,7 +104,7 @@ static inline struct parallel_io *alloc_parallel(void *data) { struct parallel_io *rv; - rv = kmalloc(sizeof(*rv), GFP_KERNEL); + rv = kmalloc(sizeof(*rv), GFP_KERNEL); if (rv) { rv->data = data; kref_init(&rv->refcnt); @@ -136,21 +131,19 @@ static inline void put_parallel(struct parallel_io *p) kref_put(&p->refcnt, destroy_parallel); } -static struct bio * -bl_submit_bio(int rw, struct bio *bio) +static struct bio 
*bl_submit_bio(int rw, struct bio *bio) { if (bio) { get_parallel(bio->bi_private); dprintk("%s submitting %s bio %u@%llu\n", __func__, rw == READ ? "read" : "write", - bio->bi_size, (u64)bio->bi_sector); + bio->bi_size, (u64) bio->bi_sector); submit_bio(rw, bio); } return NULL; } -static inline void -bl_done_with_rpage(struct page *page, const int ok) +static inline void bl_done_with_rpage(struct page *page, const int ok) { if (ok) { ClearPagePnfsErr(page); @@ -191,8 +184,7 @@ static void bl_read_cleanup(struct work_struct *work) pnfs_ld_read_done(rdata); } -static void -bl_end_par_io_read(void *data) +static void bl_end_par_io_read(void *data) { struct nfs_read_data *rdata = data; @@ -208,8 +200,7 @@ static void bl_rpc_do_nothing(struct rpc_task *task, void *calldata) return; } -static enum pnfs_try_status -bl_read_pagelist(struct nfs_read_data *rdata) +static enum pnfs_try_status bl_read_pagelist(struct nfs_read_data *rdata) { int i, hole; struct bio *bio = NULL; @@ -222,7 +213,7 @@ bl_read_pagelist(struct nfs_read_data *rdata) int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT; dprintk("%s enter nr_pages %u offset %lld count %Zd\n", __func__, - rdata->npages, f_offset, count); + rdata->npages, f_offset, count); if (dont_like_caller(rdata->req)) { dprintk("%s dont_like_caller failed\n", __func__); @@ -260,10 +251,10 @@ bl_read_pagelist(struct nfs_read_data *rdata) break; } extent_length = be->be_length - - (isect - be->be_f_offset); + (isect - be->be_f_offset); if (cow_read) { sector_t cow_length = cow_read->be_length - - (isect - cow_read->be_f_offset); + (isect - cow_read->be_f_offset); extent_length = min(extent_length, cow_length); } } @@ -282,15 +273,17 @@ bl_read_pagelist(struct nfs_read_data *rdata) be_read = (hole && cow_read) ? 
cow_read : be; for (;;) { if (!bio) { - bio = bio_alloc(GFP_NOIO, rdata->npages - i); + bio = + bio_alloc(GFP_NOIO, + rdata->npages - i); if (!bio) { /* Error out this page */ bl_done_with_rpage(pages[i], 0); break; } bio->bi_sector = isect - - be_read->be_f_offset + - be_read->be_v_offset; + be_read->be_f_offset + + be_read->be_v_offset; bio->bi_bdev = be_read->be_mdev; bio->bi_end_io = bl_end_io_read; bio->bi_private = par; @@ -315,7 +308,7 @@ bl_read_pagelist(struct nfs_read_data *rdata) put_parallel(par); return PNFS_ATTEMPTED; - use_mds: +use_mds: dprintk("Giving up and using normal NFS\n"); return PNFS_NOT_ATTEMPTED; } @@ -335,18 +328,16 @@ static void mark_extents_written(struct pnfs_block_layout *bl, while (isect < end) { sector_t len; be = find_get_extent(bl, isect, NULL); - BUG_ON(!be); /* FIXME */ + BUG_ON(!be); /* FIXME */ len = min(end, be->be_f_offset + be->be_length) - isect; if (be->be_state == PNFS_BLOCK_INVALID_DATA) - mark_for_commit(be, isect, len); /* What if fails? */ + mark_for_commit(be, isect, len); /* What if fails? */ isect += len; put_extent(be); } } -/* STUB - this needs thought */ -static inline void -bl_done_with_wpage(struct page *page, const int ok) +static inline void bl_done_with_wpage(struct page *page, const int ok) { if (!ok) { SetPageError(page); @@ -360,15 +351,37 @@ bl_done_with_wpage(struct page *page, const int ok) spin_unlock(&inode->i_lock); } } - /* end_page_writeback called in rpc_release. Should be done here. 
*/ } -/* This is basically copied from mpage_end_io_read */ +static void bl_end_io_write_zero(struct bio *bio, int err) +{ + struct parallel_io *par = bio->bi_private; + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct nfs_write_data *wdata = (struct nfs_write_data *)par->data; + + do { + struct page *page = bvec->bv_page; + + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + bl_done_with_wpage(page, uptodate); + /* This is the zeroing page we added */ + end_page_writeback(page); + page_cache_release(page); + } while (bvec >= bio->bi_io_vec); + if (!uptodate && !wdata->pnfs_error) + wdata->pnfs_error = -EIO; + bio_put(bio); + put_parallel(par); +} + static void bl_end_io_write(struct bio *bio, int err) { - void *data = bio->bi_private; + struct parallel_io *par = bio->bi_private; const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct nfs_write_data *wdata = (struct nfs_write_data *)par->data; do { struct page *page = bvec->bv_page; @@ -377,8 +390,10 @@ static void bl_end_io_write(struct bio *bio, int err) prefetchw(&bvec->bv_page->flags); bl_done_with_wpage(page, uptodate); } while (bvec >= bio->bi_io_vec); + if (!uptodate && !wdata->pnfs_error) + wdata->pnfs_error = -EIO; bio_put(bio); - put_parallel(data); + put_parallel(par); } /* Function scheduled for call during bl_end_par_io_write, @@ -391,7 +406,7 @@ static void bl_write_cleanup(struct work_struct *work) dprintk("%s enter\n", __func__); task = container_of(work, struct rpc_task, u.tk_work); wdata = container_of(task, struct nfs_write_data, task); - if (!wdata->task.tk_status) { + if (!wdata->pnfs_error) { /* Marks for LAYOUTCOMMIT */ /* BUG - this should be called after each bio, not after * all finish, unless have some way of storing success/failure @@ -403,8 +418,7 @@ static void bl_write_cleanup(struct work_struct *work) } /* Called 
when last of bios associated with a bl_write_pagelist call finishes */ -static void -bl_end_par_io_write(void *data) +static void bl_end_par_io_write(void *data) { struct nfs_write_data *wdata = data; @@ -415,19 +429,118 @@ bl_end_par_io_write(void *data) schedule_work(&wdata->task.u.tk_work); } +/* STUB - mark intersection of layout and page as bad, so is not + * used again. + */ +static void mark_bad_read(void) +{ + return; +} + +/* Copied from buffer.c */ +static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate) +{ + if (uptodate) { + set_buffer_uptodate(bh); + } else { + /* This happens, due to failed READA attempts. */ + clear_buffer_uptodate(bh); + } + unlock_buffer(bh); +} + +/* Copied from buffer.c */ +static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate) +{ + __end_buffer_read_notouch(bh, uptodate); +} + +/* + * map_block: map a requested I/O block (isect) into an offset in the LVM + * meta block_device + */ +static void +map_block(sector_t isect, struct pnfs_block_extent *be, struct buffer_head *bh) +{ + dprintk("%s enter be=%p\n", __func__, be); + + set_buffer_mapped(bh); + bh->b_bdev = be->be_mdev; + bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >> + (be->be_mdev->bd_inode->i_blkbits - 9); + + dprintk("%s isect %ld, bh->b_blocknr %ld, using bsize %Zd\n", + __func__, (long)isect, (long)bh->b_blocknr, bh->b_size); + return; +} + +/* Given an unmapped page, zero it (or read in page for COW). + */ +static int +init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read) +{ + struct buffer_head *bh = NULL; + int ret = 0; + sector_t isect; + + dprintk("%s enter, %p\n", __func__, page); + BUG_ON(PageUptodate(page)); + /* not COW, zero and return */ + if (!cow_read) { + zero_user_segment(page, 0, PAGE_SIZE); + SetPageUptodate(page); + goto cleanup; + } + + /* COW, read in page */ + bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0); + if (!bh) { + ret = -ENOMEM; + goto cleanup; + } + + isect =
(sector_t) page->index << (PAGE_CACHE_SHIFT - 9); + map_block(isect, cow_read, bh); + lock_buffer(bh); + bh->b_end_io = end_buffer_read_nobh; + submit_bh(READ, bh); + dprintk("%s: Waiting for buffer read\n", __func__); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + ret = -EIO; + goto cleanup; + } + SetPageUptodate(page); + SetPageMappedToDisk(page); + +cleanup: + put_extent(cow_read); + if (bh) + free_buffer_head(bh); + if (ret) { + /* Need to mark layout with bad read...should now + * just use nfs4 for reads and writes. + */ + mark_bad_read(); + } + return ret; +} + static enum pnfs_try_status -bl_write_pagelist(struct nfs_write_data *wdata, - int sync) +bl_write_pagelist(struct nfs_write_data *wdata, int sync) { - int i; + int i, ret, npg_zero, pg_index, last = 0; struct bio *bio = NULL; - struct pnfs_block_extent *be = NULL; - sector_t isect, extent_length = 0; + struct pnfs_block_extent *be = NULL, *cow_read = NULL; + sector_t isect, last_isect = 0, extent_length = 0; struct parallel_io *par; loff_t offset = wdata->args.offset; size_t count = wdata->args.count; struct page **pages = wdata->args.pages; - int pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT; + struct page *page; + pgoff_t index; + int npg_per_block = + NFS_SERVER(wdata->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT; dprintk("%s enter, %Zu@%lld\n", __func__, count, offset); if (!wdata->lseg) { @@ -439,11 +552,8 @@ bl_write_pagelist(struct nfs_write_data *wdata, return PNFS_NOT_ATTEMPTED; } /* At this point, wdata->pages is a (sequential) list of nfs_pages. - * We want to write each, and if there is an error remove it from - * list and call - * nfs_retry_request(req) to have it redone using nfs. - * QUEST? Do as block or per req? Think have to do per block - * as part of end_bio + * We want to write each, and if there is an error set pnfs_error + * to have it redone using nfs. 
*/ par = alloc_parallel(wdata); if (!par) @@ -454,7 +564,106 @@ bl_write_pagelist(struct nfs_write_data *wdata, /* At this point, have to be more careful with error handling */ isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> 9); - for (i = pg_index; i < wdata->npages ; i++) { + be = find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read); + if (!be || !is_writable(be, isect)) { + dprintk("%s no matching extents!\n", __func__); + wdata->pnfs_error = -EINVAL; + goto out; + } + + /* First page inside INVALID extent */ + if (be->be_state == PNFS_BLOCK_INVALID_DATA) { + npg_zero = (offset >> PAGE_CACHE_SHIFT) % npg_per_block; + isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) & + (long)PAGE_CACHE_MASK) >> 9); + extent_length = be->be_length - (isect - be->be_f_offset); + +fill_invalid_ext: + dprintk("%s need to zero %d pages\n", __func__, npg_zero); + while (npg_zero) { + /* page ref released in bl_end_io_write_zero */ + index = isect >> (PAGE_CACHE_SHIFT - 9); + dprintk("%s zero %dth page: index %lu isect %lu\n", + __func__, npg_zero, index, isect); + page = + find_or_create_page(wdata->inode->i_mapping, index, + GFP_NOFS); + if (!page) { + dprintk("%s oom\n", __func__); + wdata->pnfs_error = -ENOMEM; + goto out; + } + + /* PageDirty: Other will write this out + * PageWriteback: Other is writing this out + * PageUptodate && sector_initialized: already written out + */ + if (PageDirty(page) || PageWriteback(page) || + (PageUptodate(page) && + is_sector_initialized(be->be_inval, isect))) { + dprintk + ("page %lu is uptodate %d dirty %d writeback %d\n", + page->index, PageUptodate(page), + PageDirty(page), PageWriteback(page)); + unlock_page(page); + page_cache_release(page); + npg_zero--; + isect += PAGE_CACHE_SIZE >> 9; + extent_length -= PAGE_CACHE_SIZE >> 9; + continue; + } + if (!PageUptodate(page) && cow_read) { + /* New page, readin or zero it */ + init_page_for_write(page, cow_read); + } + set_page_writeback(page); + unlock_page(page); + + 
ret = mark_initialized_sectors(be->be_inval, isect, + PAGE_CACHE_SECTORS, + NULL); + if (unlikely(ret)) { + dprintk("%s mark_initialized_sectors fail %d\n", + __func__, ret); + page_cache_release(page); + wdata->pnfs_error = ret; + goto out; + } + for (;;) { + if (!bio) { + bio = bio_alloc(GFP_NOIO, npg_zero--); + if (unlikely(!bio)) { + end_page_writeback(page); + page_cache_release(page); + wdata->pnfs_error = -ENOMEM; + goto out; + } + bio->bi_sector = + isect - be->be_f_offset + + be->be_v_offset; + bio->bi_bdev = be->be_mdev; + bio->bi_end_io = bl_end_io_write_zero; + bio->bi_private = par; + } + if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0)) + break; + bio = bl_submit_bio(WRITE, bio); + } + /* FIXME: This should be done in bi_end_io */ + mark_extents_written(BLK_LSEG2EXT(wdata->lseg), + page->index << PAGE_CACHE_SHIFT, + PAGE_CACHE_SIZE); + isect += PAGE_CACHE_SIZE >> 9; + extent_length -= PAGE_CACHE_SIZE >> 9; + } + if (last) + goto write_done; + } + bio = bl_submit_bio(WRITE, bio); + + /* Middle pages */ + pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT; + for (i = pg_index; i < wdata->npages; i++) { if (!extent_length) { /* We've used up the previous extent */ put_extent(be); @@ -463,24 +672,32 @@ bl_write_pagelist(struct nfs_write_data *wdata, be = find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, NULL); if (!be || !is_writable(be, isect)) { - /* FIXME */ - bl_done_with_wpage(pages[i], 0); - break; + wdata->pnfs_error = -EINVAL; + goto out; } extent_length = be->be_length - - (isect - be->be_f_offset); + (isect - be->be_f_offset); + } + if (be->be_state == PNFS_BLOCK_INVALID_DATA) { + ret = mark_initialized_sectors(be->be_inval, isect, + PAGE_CACHE_SECTORS, + NULL); + if (unlikely(ret)) { + dprintk("%s mark_initialized_sectors fail %d\n", + __func__, ret); + wdata->pnfs_error = ret; + goto out; + } } for (;;) { if (!bio) { bio = bio_alloc(GFP_NOIO, wdata->npages - i); if (!bio) { - /* Error out this page */ - /* FIXME */ - bl_done_with_wpage(pages[i], 
0); - break; + wdata->pnfs_error = -ENOMEM; + goto out; } bio->bi_sector = isect - be->be_f_offset + - be->be_v_offset; + be->be_v_offset; bio->bi_bdev = be->be_mdev; bio->bi_end_io = bl_end_io_write; bio->bi_private = par; @@ -490,11 +707,27 @@ bl_write_pagelist(struct nfs_write_data *wdata, bio = bl_submit_bio(WRITE, bio); } isect += PAGE_CACHE_SIZE >> 9; + last_isect = isect; extent_length -= PAGE_CACHE_SIZE >> 9; } - wdata->res.count = (isect << 9) - (offset); - if (count < wdata->res.count) + + /* Last page inside INVALID extent */ + if (be->be_state == PNFS_BLOCK_INVALID_DATA) { + bio = bl_submit_bio(WRITE, bio); + npg_zero = npg_per_block - + (last_isect >> (PAGE_CACHE_SHIFT - 9)) % npg_per_block; + if (npg_zero < npg_per_block) { + last = 1; + goto fill_invalid_ext; + } + } + +write_done: + wdata->res.count = (last_isect << 9) - (offset); + if (count < wdata->res.count) { wdata->res.count = count; + } +out: put_extent(be); bl_submit_bio(WRITE, bio); put_parallel(par); @@ -521,8 +754,7 @@ release_extents(struct pnfs_block_layout *bl, struct pnfs_layout_range *range) spin_unlock(&bl->bl_ext_lock); } -static void -release_inval_marks(struct pnfs_inval_markings *marks) +static void release_inval_marks(struct pnfs_inval_markings *marks) { struct pnfs_inval_tracking *pos, *temp; @@ -615,7 +847,8 @@ bl_cleanup_layoutcommit(struct pnfs_layout_hdr *lo, struct nfs4_layoutcommit_data *lcdata) { dprintk("%s enter\n", __func__); - clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args, lcdata->res.status); + clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args, + lcdata->res.status); } static void free_blk_mountid(struct block_mount_id *mid) @@ -625,8 +858,7 @@ static void free_blk_mountid(struct block_mount_id *mid) spin_lock(&mid->bm_lock); while (!list_empty(&mid->bm_devlist)) { dev = list_first_entry(&mid->bm_devlist, - struct pnfs_block_dev, - bm_node); + struct pnfs_block_dev, bm_node); list_del(&dev->bm_node); free_block_dev(dev); } @@ -638,10 +870,11 
@@ static void free_blk_mountid(struct block_mount_id *mid) /* This is mostly copied from the filelayout's get_device_info function. * It seems much of this should be at the generic pnfs level. */ -static struct pnfs_block_dev * -nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh, - struct nfs4_deviceid *d_id, - struct list_head *sdlist) +static struct pnfs_block_dev *nfs4_blk_get_deviceinfo(struct nfs_server *server, + const struct nfs_fh *fh, + struct nfs4_deviceid + *d_id, + struct list_head *sdlist) { struct pnfs_device *dev; struct pnfs_block_dev *rv = NULL; @@ -695,7 +928,7 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh, goto out_free; rv = nfs4_blk_decode_device(server, dev, sdlist); - out_free: +out_free: if (dev->area != NULL) vunmap(dev->area); for (i = 0; i < max_pages; i++) @@ -748,8 +981,8 @@ bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh) */ for (i = 0; i < dlist->num_devs; i++) { bdev = nfs4_blk_get_deviceinfo(server, fh, - &dlist->dev_id[i], - &block_disklist); + &dlist->dev_id[i], + &block_disklist); if (!bdev) { status = -ENODEV; goto out_error; @@ -762,18 +995,17 @@ bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh) dprintk("%s SUCCESS\n", __func__); server->pnfs_ld_data = b_mt_id; - out_return: +out_return: kfree(dlist); return status; - out_error: +out_error: free_blk_mountid(b_mt_id); kfree(mtype); goto out_return; } -static int -bl_clear_layoutdriver(struct nfs_server *server) +static int bl_clear_layoutdriver(struct nfs_server *server) { struct block_mount_id *b_mt_id = server->pnfs_ld_data; @@ -783,268 +1015,14 @@ bl_clear_layoutdriver(struct nfs_server *server) return 0; } -/* STUB - mark intersection of layout and page as bad, so is not - * used again. 
- */ -static void mark_bad_read(void) -{ - return; -} - -/* Copied from buffer.c */ -static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate) -{ - if (uptodate) { - set_buffer_uptodate(bh); - } else { - /* This happens, due to failed READA attempts. */ - clear_buffer_uptodate(bh); - } - unlock_buffer(bh); -} - -/* Copied from buffer.c */ -static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate) -{ - __end_buffer_read_notouch(bh, uptodate); -} - -/* - * map_block: map a requested I/0 block (isect) into an offset in the LVM - * meta block_device - */ -static void -map_block(sector_t isect, struct pnfs_block_extent *be, struct buffer_head *bh) -{ - dprintk("%s enter be=%p\n", __func__, be); - - set_buffer_mapped(bh); - bh->b_bdev = be->be_mdev; - bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >> - (be->be_mdev->bd_inode->i_blkbits - 9); - - dprintk("%s isect %ld, bh->b_blocknr %ld, using bsize %Zd\n", - __func__, (long)isect, - (long)bh->b_blocknr, - bh->b_size); - return; -} - -/* Given an unmapped page, zero it (or read in page for COW), - * and set appropriate flags/markings, but it is safe to not initialize - * the range given in [from, to). 
- */ -/* This is loosely based on nobh_write_begin */ -static int -init_page_for_write(struct pnfs_block_layout *bl, struct page *page, - unsigned from, unsigned to, sector_t **pages_to_mark) -{ - struct buffer_head *bh; - int inval, ret = -EIO; - struct pnfs_block_extent *be = NULL, *cow_read = NULL; - sector_t isect; - - dprintk("%s enter, %p\n", __func__, page); - bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0); - if (!bh) { - ret = -ENOMEM; - goto cleanup; - } - - isect = (sector_t)page->index << (PAGE_CACHE_SHIFT - 9); - be = find_get_extent(bl, isect, &cow_read); - if (!be) - goto cleanup; - inval = is_hole(be, isect); - dprintk("%s inval=%i, from=%u, to=%u\n", __func__, inval, from, to); - if (inval) { - if (be->be_state == PNFS_BLOCK_NONE_DATA) { - dprintk("%s PANIC - got NONE_DATA extent %p\n", - __func__, be); - goto cleanup; - } - map_block(isect, be, bh); - unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); - } - if (PageUptodate(page)) { - /* Do nothing */ - } else if (inval & !cow_read) { - zero_user_segments(page, 0, from, to, PAGE_CACHE_SIZE); - } else if (0 < from || PAGE_CACHE_SIZE > to) { - struct pnfs_block_extent *read_extent; - - read_extent = (inval && cow_read) ? cow_read : be; - map_block(isect, read_extent, bh); - lock_buffer(bh); - bh->b_end_io = end_buffer_read_nobh; - submit_bh(READ, bh); - dprintk("%s: Waiting for buffer read\n", __func__); - /* XXX Don't really want to hold layout lock here */ - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) - goto cleanup; - } - if (be->be_state == PNFS_BLOCK_INVALID_DATA) { - /* There is a BUG here if is a short copy after write_begin, - * but I think this is a generic fs bug. The problem is that - * we have marked the page as initialized, but it is possible - * that the section not copied may never get copied. 
- */ - ret = mark_initialized_sectors(be->be_inval, isect, - PAGE_CACHE_SECTORS, - pages_to_mark); - /* Want to preallocate mem so above can't fail */ - if (ret) - goto cleanup; - } - SetPageMappedToDisk(page); - ret = 0; - -cleanup: - free_buffer_head(bh); - put_extent(be); - put_extent(cow_read); - if (ret) { - /* Need to mark layout with bad read...should now - * just use nfs4 for reads and writes. - */ - mark_bad_read(); - } - return ret; -} - -static int -bl_write_begin(struct pnfs_layout_segment *lseg, struct page *page, loff_t pos, - unsigned count, struct pnfs_fsdata *fsdata) -{ - unsigned from, to; - int ret; - sector_t *pages_to_mark = NULL; - struct pnfs_block_layout *bl = BLK_LSEG2EXT(lseg); - - dprintk("%s enter, %u@%lld\n", __func__, count, pos); - print_page(page); - /* The following code assumes blocksize >= PAGE_CACHE_SIZE */ - if (bl->bl_blocksize < (PAGE_CACHE_SIZE >> 9)) { - dprintk("%s Can't handle blocksize %llu\n", __func__, - (u64)bl->bl_blocksize); - put_lseg(fsdata->lseg); - fsdata->lseg = NULL; - return 0; - } - if (PageMappedToDisk(page)) { - /* Basically, this is a flag that says we have - * successfully called write_begin already on this page. - */ - /* NOTE - there are cache consistency issues here. - * For example, what if the layout is recalled, then regained? - * If the file is closed and reopened, will the page flags - * be reset? If not, we'll have to use layout info instead of - * the page flag. - */ - return 0; - } - from = pos & (PAGE_CACHE_SIZE - 1); - to = from + count; - ret = init_page_for_write(bl, page, from, to, &pages_to_mark); - if (ret) { - dprintk("%s init page failed with %i", __func__, ret); - /* Revert back to plain NFS and just continue on with - * write. This assumes there is no request attached, which - * should be true if we get here. 
- */ - BUG_ON(PagePrivate(page)); - put_lseg(fsdata->lseg); - fsdata->lseg = NULL; - kfree(pages_to_mark); - ret = 0; - } else { - fsdata->private = pages_to_mark; - } - return ret; -} - -/* CAREFUL - what happens if copied < count??? */ -static int -bl_write_end(struct inode *inode, struct page *page, loff_t pos, - unsigned count, unsigned copied, struct pnfs_layout_segment *lseg) -{ - dprintk("%s enter, %u@%lld, lseg=%p\n", __func__, count, pos, lseg); - print_page(page); - if (lseg) - SetPageUptodate(page); - return 0; -} - -/* Return any memory allocated to fsdata->private, and take advantage - * of no page locks to mark pages noted in write_begin as needing - * initialization. - */ -static void -bl_write_end_cleanup(struct file *filp, struct pnfs_fsdata *fsdata) -{ - struct page *page; - pgoff_t index; - sector_t *pos; - struct address_space *mapping = filp->f_mapping; - struct pnfs_fsdata *fake_data; - struct pnfs_layout_segment *lseg; - - if (!fsdata) - return; - lseg = fsdata->lseg; - if (!lseg) - return; - pos = fsdata->private; - if (!pos) - return; - dprintk("%s enter with pos=%llu\n", __func__, (u64)(*pos)); - for (; *pos != ~0; pos++) { - index = *pos >> (PAGE_CACHE_SHIFT - 9); - /* XXX How do we properly deal with failures here??? */ - page = grab_cache_page_write_begin(mapping, index, 0); - if (!page) { - printk(KERN_ERR "%s BUG BUG BUG NoMem\n", __func__); - continue; - } - dprintk("%s: Examining block page\n", __func__); - print_page(page); - if (!PageMappedToDisk(page)) { - /* XXX How do we properly deal with failures here??? 
*/ - dprintk("%s Marking block page\n", __func__); - init_page_for_write(BLK_LSEG2EXT(fsdata->lseg), page, - PAGE_CACHE_SIZE, PAGE_CACHE_SIZE, - NULL); - print_page(page); - fake_data = kzalloc(sizeof(*fake_data), GFP_KERNEL); - if (!fake_data) { - printk(KERN_ERR "%s BUG BUG BUG NoMem\n", - __func__); - unlock_page(page); - continue; - } - get_lseg(lseg); - fake_data->lseg = lseg; - fake_data->bypass_eof = 1; - mapping->a_ops->write_end(filp, mapping, - index << PAGE_CACHE_SHIFT, - PAGE_CACHE_SIZE, - PAGE_CACHE_SIZE, - page, fake_data); - /* Note fake_data is freed by nfs_write_end */ - } else - unlock_page(page); - } - kfree(fsdata->private); - fsdata->private = NULL; -} - static const struct nfs_pageio_ops bl_pg_read_ops = { + .pg_init = pnfs_generic_pg_init_read, .pg_test = pnfs_generic_pg_test, .pg_doio = nfs_generic_pg_readpages, }; static const struct nfs_pageio_ops bl_pg_write_ops = { + .pg_init = pnfs_generic_pg_init_write, .pg_test = pnfs_generic_pg_test, .pg_doio = nfs_generic_pg_writepages, }; @@ -1054,9 +1032,6 @@ static struct pnfs_layoutdriver_type blocklayout_type = { .name = "LAYOUT_BLOCK_VOLUME", .read_pagelist = bl_read_pagelist, .write_pagelist = bl_write_pagelist, - .write_begin = bl_write_begin, - .write_end = bl_write_end, - .write_end_cleanup = bl_write_end_cleanup, .alloc_layout_hdr = bl_alloc_layout_hdr, .free_layout_hdr = bl_free_layout_hdr, .alloc_lseg = bl_alloc_lseg, @@ -1083,8 +1058,7 @@ static int __init nfs4blocklayout_init(void) static void __exit nfs4blocklayout_exit(void) { - dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n", - __func__); + dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n", __func__); pnfs_unregister_layoutdriver(&blocklayout_type); bl_pipe_exit(); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 1768762..2f093ed 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -384,15 +384,12 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, pgoff_t index = pos >> 
PAGE_CACHE_SHIFT; struct page *page; int once_thru = 0; - struct pnfs_layout_segment *lseg; dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n", file->f_path.dentry->d_parent->d_name.name, file->f_path.dentry->d_name.name, mapping->host->i_ino, len, (long long) pos); - lseg = pnfs_update_layout(mapping->host, - nfs_file_open_context(file), - pos, len, IOMODE_RW, GFP_NOFS); + start: /* * Prevent starvation issues if someone is doing a consistency @@ -412,9 +409,6 @@ start: if (ret) { unlock_page(page); page_cache_release(page); - *pagep = NULL; - *fsdata = NULL; - goto out; } else if (!once_thru && nfs_want_read_modify_write(file, page, pos, len)) { once_thru = 1; @@ -423,12 +417,6 @@ start: if (!ret) goto start; } - ret = pnfs_write_begin(file, page, pos, len, lseg, fsdata); - out: - if (ret) { - put_lseg(lseg); - *fsdata = NULL; - } return ret; } @@ -438,7 +426,6 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, { unsigned offset = pos & (PAGE_CACHE_SIZE - 1); int status; - struct pnfs_layout_segment *lseg; dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n", file->f_path.dentry->d_parent->d_name.name, @@ -465,17 +452,10 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, zero_user_segment(page, pglen, PAGE_CACHE_SIZE); } - lseg = nfs4_pull_lseg_from_fsdata(file, fsdata); - status = pnfs_write_end(file, page, pos, len, copied, lseg); - if (status) - goto out; - status = nfs_updatepage(file, page, offset, copied, lseg, fsdata); + status = nfs_updatepage(file, page, offset, copied); -out: unlock_page(page); page_cache_release(page); - pnfs_write_end_cleanup(file, fsdata); - put_lseg(lseg); if (status < 0) return status; @@ -597,7 +577,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = VM_FAULT_LOCKED; if (nfs_flush_incompatible(filp, page) == 0 && - nfs_updatepage(filp, page, 0, pagelen, NULL, NULL) == 0) + nfs_updatepage(filp, page, 0, pagelen) == 0) 
goto out; ret = VM_FAULT_SIGBUS; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 42979e5..1fdc8f7 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1223,41 +1223,6 @@ pnfs_try_to_write_data(struct nfs_write_data *wdata, } /* - * This gives the layout driver an opportunity to read in page "around" - * the data to be written. It returns 0 on success, otherwise an error code - * which will either be passed up to user, or ignored if - * some previous part of write succeeded. - * Note the range [pos, pos+len-1] is entirely within the page. - */ -int _pnfs_write_begin(struct inode *inode, struct page *page, - loff_t pos, unsigned len, - struct pnfs_layout_segment *lseg, - struct pnfs_fsdata **fsdata) -{ - struct pnfs_fsdata *data; - int status = 0; - - dprintk("--> %s: pos=%llu len=%u\n", - __func__, (unsigned long long)pos, len); - data = kzalloc(sizeof(struct pnfs_fsdata), GFP_KERNEL); - if (!data) { - status = -ENOMEM; - goto out; - } - data->lseg = lseg; /* refcount passed into data to be managed there */ - status = NFS_SERVER(inode)->pnfs_curr_ld->write_begin( - lseg, page, pos, len, data); - if (status) { - kfree(data); - data = NULL; - } -out: - *fsdata = data; - dprintk("<-- %s: status=%d\n", __func__, status); - return status; -} - -/* * Called by non rpc-based layout drivers */ int @@ -1373,12 +1338,6 @@ void pnfs_cleanup_layoutcommit(struct inode *inode, data); } -void pnfs_free_fsdata(struct pnfs_fsdata *fsdata) -{ - /* lseg refcounting handled directly in nfs_write_end */ - kfree(fsdata); -} - /* * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 6f7fa9f..a0c856c 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -54,12 +54,6 @@ enum pnfs_try_status { PNFS_NOT_ATTEMPTED = 1, }; -struct pnfs_fsdata { - struct pnfs_layout_segment *lseg; - int bypass_eof; - void *private; -}; - #ifdef CONFIG_NFS_V4_1 #define 
LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4" @@ -113,14 +107,6 @@ struct pnfs_layoutdriver_type { */ enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data); enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how); - int (*write_begin) (struct pnfs_layout_segment *lseg, struct page *page, - loff_t pos, unsigned count, - struct pnfs_fsdata *fsdata); - int (*write_end)(struct inode *inode, struct page *page, loff_t pos, - unsigned count, unsigned copied, - struct pnfs_layout_segment *lseg); - void (*write_end_cleanup)(struct file *filp, - struct pnfs_fsdata *fsdata); void (*free_deviceid_node) (struct nfs4_deviceid_node *); @@ -196,7 +182,6 @@ enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *, void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *); void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *, struct nfs_page *); bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req); -void pnfs_free_fsdata(struct pnfs_fsdata *fsdata); int pnfs_layout_process(struct nfs4_layoutget *lgp); void pnfs_free_lseg_list(struct list_head *tmp_list); void pnfs_destroy_layout(struct nfs_inode *); @@ -208,10 +193,6 @@ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, struct nfs4_state *open_state); -int _pnfs_write_begin(struct inode *inode, struct page *page, - loff_t pos, unsigned len, - struct pnfs_layout_segment *lseg, - struct pnfs_fsdata **fsdata); int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, struct pnfs_layout_range *recall_range); @@ -329,13 +310,6 @@ static inline void pnfs_clear_request_commit(struct nfs_page *req) put_lseg(req->wb_commit_lseg); } -static inline int pnfs_grow_ok(struct pnfs_layout_segment *lseg, - struct pnfs_fsdata *fsdata) -{ - return !fsdata || ((struct pnfs_layout_segment *)fsdata == 
lseg) || - !fsdata->bypass_eof; -} - /* Should the pNFS client commit and return the layout upon a setattr */ static inline bool pnfs_ld_layoutret_on_setattr(struct inode *inode) @@ -346,49 +320,6 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode) PNFS_LAYOUTRET_ON_SETATTR; } -static inline int pnfs_write_begin(struct file *filp, struct page *page, - loff_t pos, unsigned len, - struct pnfs_layout_segment *lseg, - void **fsdata) -{ - struct inode *inode = filp->f_dentry->d_inode; - struct nfs_server *nfss = NFS_SERVER(inode); - int status = 0; - - *fsdata = lseg; - if (lseg && nfss->pnfs_curr_ld->write_begin) - status = _pnfs_write_begin(inode, page, pos, len, lseg, - (struct pnfs_fsdata **) fsdata); - return status; -} - -/* CAREFUL - what happens if copied < len??? */ -static inline int pnfs_write_end(struct file *filp, struct page *page, - loff_t pos, unsigned len, unsigned copied, - struct pnfs_layout_segment *lseg) -{ - struct inode *inode = filp->f_dentry->d_inode; - struct nfs_server *nfss = NFS_SERVER(inode); - - if (nfss->pnfs_curr_ld && nfss->pnfs_curr_ld->write_end) - return nfss->pnfs_curr_ld->write_end(inode, page, pos, len, - copied, lseg); - else - return 0; -} - -static inline void pnfs_write_end_cleanup(struct file *filp, void *fsdata) -{ - struct nfs_server *nfss = NFS_SERVER(filp->f_dentry->d_inode); - - if (fsdata && nfss->pnfs_curr_ld) { - if (nfss->pnfs_curr_ld->write_end_cleanup) - nfss->pnfs_curr_ld->write_end_cleanup(filp, fsdata); - if (nfss->pnfs_curr_ld->write_begin) - pnfs_free_fsdata(fsdata); - } -} - static inline int pnfs_return_layout(struct inode *ino) { struct nfs_inode *nfsi = NFS_I(ino); @@ -400,19 +331,6 @@ static inline int pnfs_return_layout(struct inode *ino) return 0; } -static inline struct pnfs_layout_segment * -nfs4_pull_lseg_from_fsdata(struct file *filp, void *fsdata) -{ - if (fsdata) { - struct nfs_server *nfss = NFS_SERVER(filp->f_dentry->d_inode); - - if (nfss->pnfs_curr_ld && nfss->pnfs_curr_ld->write_begin) - 
return ((struct pnfs_fsdata *) fsdata)->lseg; - return (struct pnfs_layout_segment *)fsdata; - } - return NULL; -} - #else /* CONFIG_NFS_V4_1 */ static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) @@ -433,12 +351,6 @@ static inline void put_lseg(struct pnfs_layout_segment *lseg) { } -static inline int pnfs_grow_ok(struct pnfs_layout_segment *lseg, - struct pnfs_fsdata *fsdata) -{ - return 1; -} - static inline enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *data, const struct rpc_call_ops *call_ops) @@ -458,26 +370,6 @@ static inline int pnfs_return_layout(struct inode *ino) return 0; } -static inline int pnfs_write_begin(struct file *filp, struct page *page, - loff_t pos, unsigned len, - struct pnfs_layout_segment *lseg, - void **fsdata) -{ - *fsdata = NULL; - return 0; -} - -static inline int pnfs_write_end(struct file *filp, struct page *page, - loff_t pos, unsigned len, unsigned copied, - struct pnfs_layout_segment *lseg) -{ - return 0; -} - -static inline void pnfs_write_end_cleanup(struct file *filp, void *fsdata) -{ -} - static inline bool pnfs_ld_layoutret_on_setattr(struct inode *inode) { @@ -554,13 +446,6 @@ static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync) static inline void nfs4_deviceid_purge_client(struct nfs_client *ncl) { } - -static inline struct pnfs_layout_segment * -nfs4_pull_lseg_from_fsdata(struct file *filp, void *fsdata) -{ - return NULL; -} - #endif /* CONFIG_NFS_V4_1 */ #endif /* FS_NFS_PNFS_H */ diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 1185262..574ec0e 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -673,9 +673,7 @@ out: } static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, - unsigned int offset, unsigned int count, - struct pnfs_layout_segment *lseg, void *fsdata) - + unsigned int offset, unsigned int count) { struct nfs_page *req; @@ -683,8 +681,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, if 
(IS_ERR(req)) return PTR_ERR(req); /* Update file length */ - if (pnfs_grow_ok(lseg, fsdata)) - nfs_grow_file(page, offset, count); + nfs_grow_file(page, offset, count); nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); nfs_mark_request_dirty(req); nfs_clear_page_tag_locked(req); @@ -737,8 +734,7 @@ static int nfs_write_pageuptodate(struct page *page, struct inode *inode) * things with a page scheduled for an RPC call (e.g. invalidate it). */ int nfs_updatepage(struct file *file, struct page *page, - unsigned int offset, unsigned int count, - struct pnfs_layout_segment *lseg, void *fsdata) + unsigned int offset, unsigned int count) { struct nfs_open_context *ctx = nfs_file_open_context(file); struct inode *inode = page->mapping->host; @@ -763,7 +759,7 @@ int nfs_updatepage(struct file *file, struct page *page, offset = 0; } - status = nfs_writepage_setup(ctx, page, offset, count, lseg, fsdata); + status = nfs_writepage_setup(ctx, page, offset, count); if (status < 0) nfs_set_pageerror(page); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index e459379..fcc7f41 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -510,8 +510,7 @@ extern int nfs_congestion_kb; extern int nfs_writepage(struct page *page, struct writeback_control *wbc); extern int nfs_writepages(struct address_space *, struct writeback_control *); extern int nfs_flush_incompatible(struct file *file, struct page *page); -extern int nfs_updatepage(struct file *, struct page *, unsigned int, - unsigned int, struct pnfs_layout_segment *, void *); +extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int); extern void nfs_writeback_done(struct rpc_task *, struct nfs_write_data *); /* -- 1.7.4.1 -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html