RE: [PATCH-v2 2/3] NFS41: send real write size in layoutget

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



> -----Original Message-----
> From: Boaz Harrosh [mailto:bharrosh@xxxxxxxxxxx]
> Sent: Tuesday, August 14, 2012 7:54 AM
> To: Peng Tao
> Cc: Trond.Myklebust@xxxxxxxxxx; linux-nfs@xxxxxxxxxxxxxxx; Peng, Tao
> Subject: Re: [PATCH-v2 2/3] NFS41: send real write size in layoutget
> 
> On 08/13/2012 05:39 PM, Peng Tao wrote:
> 
> > For buffer write, use policy based mechanism to determine layoutget size.
> > Currently files use whole file layout, objects use offset-to-isize, and
> > blocks search next hole in inode mapping and use offset-to-hole.
> >
> > For direct write, just use dreq->bytes_left.
> >
> > Signed-off-by: Peng Tao <tao.peng@xxxxxxx>
> > ---
> >  fs/nfs/blocklayout/blocklayout.c |    1 +
> >  fs/nfs/direct.c                  |    7 +++++
> >  fs/nfs/internal.h                |    1 +
> >  fs/nfs/nfs4filelayout.c          |    1 +
> >  fs/nfs/objlayout/objio_osd.c     |    3 +-
> >  fs/nfs/pnfs.c                    |   51 +++++++++++++++++++++++++++++++++++++-
> >  fs/nfs/pnfs.h                    |   13 +++++++++
> >  7 files changed, 75 insertions(+), 2 deletions(-)
> >
> > diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
> > index 1093968..c4215cf 100644
> > --- a/fs/nfs/blocklayout/blocklayout.c
> > +++ b/fs/nfs/blocklayout/blocklayout.c
> > @@ -1240,6 +1240,7 @@ static const struct nfs_pageio_ops bl_pg_write_ops = {
> >  static struct pnfs_layoutdriver_type blocklayout_type = {
> >  	.id				= LAYOUT_BLOCK_VOLUME,
> >  	.name				= "LAYOUT_BLOCK_VOLUME",
> > +	.flags				= PNFS_LAYOUTGET_SEARCH_HOLE,
> >  	.read_pagelist			= bl_read_pagelist,
> >  	.write_pagelist			= bl_write_pagelist,
> >  	.alloc_layout_hdr		= bl_alloc_layout_hdr,
> > diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
> > index c39f775..c1899dd 100644
> > --- a/fs/nfs/direct.c
> > +++ b/fs/nfs/direct.c
> > @@ -46,6 +46,7 @@
> >  #include <linux/kref.h>
> >  #include <linux/slab.h>
> >  #include <linux/task_io_accounting_ops.h>
> > +#include <linux/module.h>
> >
> >  #include <linux/nfs_fs.h>
> >  #include <linux/nfs_page.h>
> > @@ -191,6 +192,12 @@ static void nfs_direct_req_release(struct nfs_direct_req *dreq)
> >  	kref_put(&dreq->kref, nfs_direct_req_free);
> >  }
> >
> > +ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq)
> > +{
> > +	return dreq->bytes_left;
> > +}
> > +EXPORT_SYMBOL_GPL(nfs_dreq_bytes_left);
> > +
> >  /*
> >   * Collects and returns the final error value/byte-count.
> >   */
> > diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
> > index 31fdb03..e68d329 100644
> > --- a/fs/nfs/internal.h
> > +++ b/fs/nfs/internal.h
> > @@ -464,6 +464,7 @@ static inline void nfs_inode_dio_wait(struct inode *inode)
> >  {
> >  	inode_dio_wait(inode);
> >  }
> > +extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
> >
> >  /* nfs4proc.c */
> >  extern void __nfs4_read_done_cb(struct nfs_read_data *);
> > diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
> > index 53f94d9..f81edd7 100644
> > --- a/fs/nfs/nfs4filelayout.c
> > +++ b/fs/nfs/nfs4filelayout.c
> > @@ -1289,6 +1289,7 @@ filelayout_get_ds_info(struct inode *inode)
> >  static struct pnfs_layoutdriver_type filelayout_type = {
> >  	.id			= LAYOUT_NFSV4_1_FILES,
> >  	.name			= "LAYOUT_NFSV4_1_FILES",
> > +	.flags			= PNFS_LAYOUTGET_ALL_FILE,
> >  	.owner			= THIS_MODULE,
> >  	.alloc_layout_hdr	= filelayout_alloc_layout_hdr,
> >  	.free_layout_hdr	= filelayout_free_layout_hdr,
> > diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
> > index ea6d111..e487fb8 100644
> > --- a/fs/nfs/objlayout/objio_osd.c
> > +++ b/fs/nfs/objlayout/objio_osd.c
> > @@ -638,7 +638,8 @@ static struct pnfs_layoutdriver_type objlayout_type = {
> >  	.id = LAYOUT_OSD2_OBJECTS,
> >  	.name = "LAYOUT_OSD2_OBJECTS",
> >  	.flags                   = PNFS_LAYOUTRET_ON_SETATTR |
> > -				   PNFS_LAYOUTRET_ON_ERROR,
> > +				   PNFS_LAYOUTRET_ON_ERROR |
> > +				   PNFS_LAYOUTGET_ISIZE,
> >
> >  	.alloc_layout_hdr        = objlayout_alloc_layout_hdr,
> >  	.free_layout_hdr         = objlayout_free_layout_hdr,
> > diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> > index 2e00fea..d1da23a 100644
> > --- a/fs/nfs/pnfs.c
> > +++ b/fs/nfs/pnfs.c
> > @@ -29,6 +29,7 @@
> >
> >  #include <linux/nfs_fs.h>
> >  #include <linux/nfs_page.h>
> > +#include <linux/pagevec.h>
> >  #include <linux/module.h>
> >  #include "internal.h"
> >  #include "pnfs.h"
> > @@ -1172,19 +1173,67 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r
> >  }
> >  EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read);
> >
> > +/*
> > + * Return the number of contiguous bytes for a given inode
> > + * starting at page frame idx.
> > + */
> > +static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
> > +{
> > +	struct address_space *mapping = inode->i_mapping;
> > +	pgoff_t end;
> > +
> > +	/* Optimize common case that writes from 0 to end of file */
> > +	end = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE);
> > +	if (end != NFS_I(inode)->npages) {
> > +		rcu_read_lock();
> > +		end = radix_tree_next_hole(&mapping->page_tree, idx + 1, ULONG_MAX);
> > +		rcu_read_unlock();
> > +	}
> > +
> > +	if (!end)
> > +		return i_size_read(inode) - (idx << PAGE_CACHE_SHIFT);
> > +	else
> > +		return (end - idx) << PAGE_CACHE_SHIFT;
> > +}
> > +
> >  void
> >  pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
> >  {
> > +	u64 wb_size;
> > +	unsigned policy = NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->flags &
> > +						PNFS_LAYOUTGET_POLICY_MASK;
> > +
> >  	BUG_ON(pgio->pg_lseg != NULL);
> >
> >  	if (req->wb_offset != req->wb_pgbase) {
> >  		nfs_pageio_reset_write_mds(pgio);
> >  		return;
> >  	}
> > +
> > +	if (pgio->pg_dreq == NULL) {
> > +		switch(policy) {
> > +		case PNFS_LAYOUTGET_ISIZE:
> > +			wb_size = i_size_read(pgio->pg_inode) - req_offset(req);
> > +			break;
> > +		case PNFS_LAYOUTGET_SEARCH_HOLE:
> > +			wb_size = pnfs_num_cont_bytes(pgio->pg_inode, req->wb_index);
> > +			break;
> > +		case PNFS_LAYOUTGET_ALL_FILE:
> > +			wb_size = NFS4_MAX_UINT64;
> > +			break;
> > +		default:
> > +			WARN_ONCE(1, "invalid layoutget policy %u", policy);
> > +			wb_size = PAGE_CACHE_SIZE;
> > +			break;
> > +		}
> > +	} else {
> > +		wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
> > +	}
> > +
> >  	pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
> >  					   req->wb_context,
> >  					   req_offset(req),
> > -					   req->wb_bytes,
> > +					   wb_size,
> >  					   IOMODE_RW,
> >  					   GFP_NOFS);
> >  	/* If no lseg, fall back to write through mds */
> > diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
> > index 745aa1b..ce86894 100644
> > --- a/fs/nfs/pnfs.h
> > +++ b/fs/nfs/pnfs.h
> > @@ -71,8 +71,21 @@ enum layoutdriver_policy_flags {
> >  	/* Should the pNFS client commit and return the layout upon a setattr */
> >  	PNFS_LAYOUTRET_ON_SETATTR	= 1 << 0,
> >  	PNFS_LAYOUTRET_ON_ERROR		= 1 << 1,
> > +
> > +	/* Layoutget(write) length policy:
> > +	 * PNFS_LAYOUTGET_ISIZE, use offset-to-isize
> > +	 * PNFS_LAYOUTGET_SEARCH_HOLE, use offset-to-hole
> > +	 * PNFS_LAYOUTGET_ALL_FILE, use NFS4_MAX_UINT64
> > +	 */
> > +	PNFS_LAYOUTGET_ISIZE		= 1 << 2,
> > +	PNFS_LAYOUTGET_SEARCH_HOLE	= 1 << 3,
> > +	PNFS_LAYOUTGET_ALL_FILE		= 1 << 4,
> >  };
> >
> > +#define PNFS_LAYOUTGET_POLICY_MASK	(PNFS_LAYOUTGET_ISIZE |		\
> > +					 PNFS_LAYOUTGET_SEARCH_HOLE |	\
> > +					 PNFS_LAYOUTGET_ALL_FILE)
> > +
> >  struct nfs4_deviceid_node;
> >
> >  /* Per-layout driver specific registration structure */
> 
> 
> All 3 look very good now (from a quick scan through). However, they need
> heavy testing. I will only get to them early next week.
> 
> How do they perform for you? Please report your findings with the EMC
> server; it would be interesting to know.
> 
Without server-side optimization, the patchset makes a huge difference for sequential IO. Even with proper server optimization, I still got a noticeable performance improvement. The block layout server tends not to pre-allocate segments very aggressively, and sending the real IO size helps the server make better decisions.

Thanks,
Tao

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [Linux Filesystem Development]     [Linux USB Development]     [Linux Media Development]     [Video for Linux]     [Linux NILFS]     [Linux Audio Users]     [Yosemite Info]     [Linux SCSI]

  Powered by Linux